Merge branch 'for-2.6.37/barrier' of git://git.kernel.dk/linux-2.6-block
author Linus Torvalds <[email protected]>
Sat, 23 Oct 2010 00:07:18 +0000 (17:07 -0700)
committer Linus Torvalds <[email protected]>
Sat, 23 Oct 2010 00:07:18 +0000 (17:07 -0700)
* 'for-2.6.37/barrier' of git://git.kernel.dk/linux-2.6-block: (46 commits)
  xen-blkfront: disable barrier/flush write support
  Added blk-lib.c and blk-barrier.c was renamed to blk-flush.c
  block: remove BLKDEV_IFL_WAIT
  aic7xxx_old: removed unused 'req' variable
  block: remove the BH_Eopnotsupp flag
  block: remove the BLKDEV_IFL_BARRIER flag
  block: remove the WRITE_BARRIER flag
  swap: do not send discards as barriers
  fat: do not send discards as barriers
  ext4: do not send discards as barriers
  jbd2: replace barriers with explicit flush / FUA usage
  jbd2: Modify ASYNC_COMMIT code to not rely on queue draining on barrier
  jbd: replace barriers with explicit flush / FUA usage
  nilfs2: replace barriers with explicit flush / FUA usage
  reiserfs: replace barriers with explicit flush / FUA usage
  gfs2: replace barriers with explicit flush / FUA usage
  btrfs: replace barriers with explicit flush / FUA usage
  xfs: replace barriers with explicit flush / FUA usage
  block: pass gfp_mask and flags to sb_issue_discard
  dm: convey that all flushes are processed as empty
  ...

24 files changed:
block/Makefile
block/blk-core.c
block/blk-settings.c
block/blk.h
block/ioctl.c
drivers/block/brd.c
drivers/block/drbd/drbd_int.h
drivers/block/drbd/drbd_receiver.c
drivers/block/loop.c
drivers/block/pktcdvd.c
drivers/block/ps3disk.c
drivers/block/virtio_blk.c
drivers/block/xen-blkfront.c
drivers/md/dm-snap.c
drivers/md/dm.c
drivers/md/md.c
drivers/s390/block/dasd.c
fs/gfs2/rgrp.c
fs/jbd/commit.c
fs/jbd2/commit.c
fs/nilfs2/super.c
include/linux/blk_types.h
include/linux/blkdev.h
include/linux/fs.h
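
The series above replaces the old barrier machinery (REQ_HARDBARRIER and the QUEUE_ORDERED_* drain states) with explicit cache flush / FUA semantics. A rough sketch of the resulting convention, not taken from any file in this merge: a caller either tags its write with REQ_FLUSH/REQ_FUA, or requests a standalone flush with blkdev_issue_flush(), whose BLKDEV_IFL_WAIT flag argument is dropped by this series so the call simply waits.

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/fs.h>

/*
 * Sketch only: issue a standalone, synchronous cache flush using the
 * post-series calling convention (see the drbd hunks below); the flags
 * argument that carried BLKDEV_IFL_WAIT no longer exists.
 */
static int example_flush_volatile_cache(struct block_device *bdev)
{
        return blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
}

/*
 * Sketch only: a write that used to rely on a barrier is now tagged
 * explicitly.  REQ_FLUSH flushes the device cache before the write,
 * REQ_FUA forces the written data itself to stable storage.
 */
static void example_flush_fua_write(struct bio *bio)
{
        submit_bio(WRITE | REQ_FLUSH | REQ_FUA, bio);
}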

diff --combined block/Makefile
index c850d5ef80a22eccd057d76c0d8acab6f5f4d953,f627e4b1a9da3e04f9da21119980f1a12ef08e42..0fec4b3fab511bc065261121f279e6f64038c35c
@@@ -3,13 -3,12 +3,13 @@@
  #
  
  obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
-                       blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \
+                       blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
                        blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
                        blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o
  
  obj-$(CONFIG_BLK_DEV_BSG)     += bsg.o
  obj-$(CONFIG_BLK_CGROUP)      += blk-cgroup.o
 +obj-$(CONFIG_BLK_DEV_THROTTLING)      += blk-throttle.o
  obj-$(CONFIG_IOSCHED_NOOP)    += noop-iosched.o
  obj-$(CONFIG_IOSCHED_DEADLINE)        += deadline-iosched.o
  obj-$(CONFIG_IOSCHED_CFQ)     += cfq-iosched.o
diff --combined block/blk-core.c
index 500eb859886e7ec989b2b1dc731eccc8d58dc85b,a840523e3b409d74c075ddc8cb8c5e5a0383dede..45141469e89eba5f24c9594afdfa5218e5688f46
@@@ -64,15 -64,13 +64,15 @@@ static void drive_stat_acct(struct requ
                return;
  
        cpu = part_stat_lock();
 -      part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
  
 -      if (!new_io)
 +      if (!new_io) {
 +              part = rq->part;
                part_stat_inc(cpu, part, merges[rw]);
 -      else {
 +      } else {
 +              part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
                part_round_stats(cpu, part);
                part_inc_in_flight(part, rw);
 +              rq->part = part;
        }
  
        part_stat_unlock();
@@@ -130,7 -128,6 +130,7 @@@ void blk_rq_init(struct request_queue *
        rq->ref_count = 1;
        rq->start_time = jiffies;
        set_start_time_ns(rq);
 +      rq->part = NULL;
  }
  EXPORT_SYMBOL(blk_rq_init);
  
@@@ -139,7 -136,7 +139,7 @@@ static void req_bio_endio(struct reques
  {
        struct request_queue *q = rq->q;
  
-       if (&q->bar_rq != rq) {
+       if (&q->flush_rq != rq) {
                if (error)
                        clear_bit(BIO_UPTODATE, &bio->bi_flags);
                else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
                if (bio->bi_size == 0)
                        bio_endio(bio, error);
        } else {
                /*
-                * Okay, this is the barrier request in progress, just
-                * record the error;
+                * Okay, this is the sequenced flush request in
+                * progress, just record the error;
                 */
-               if (error && !q->orderr)
-                       q->orderr = error;
+               if (error && !q->flush_err)
+                       q->flush_err = error;
        }
  }
  
@@@ -385,7 -381,6 +384,7 @@@ void blk_sync_queue(struct request_queu
        del_timer_sync(&q->unplug_timer);
        del_timer_sync(&q->timeout);
        cancel_work_sync(&q->unplug_work);
 +      throtl_shutdown_timer_wq(q);
  }
  EXPORT_SYMBOL(blk_sync_queue);
  
@@@ -463,8 -458,6 +462,8 @@@ void blk_cleanup_queue(struct request_q
        if (q->elevator)
                elevator_exit(q->elevator);
  
 +      blk_throtl_exit(q);
 +
        blk_put_queue(q);
  }
  EXPORT_SYMBOL(blk_cleanup_queue);
@@@ -521,16 -514,12 +520,17 @@@ struct request_queue *blk_alloc_queue_n
                return NULL;
        }
  
 +      if (blk_throtl_init(q)) {
 +              kmem_cache_free(blk_requestq_cachep, q);
 +              return NULL;
 +      }
 +
        setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
                    laptop_mode_timer_fn, (unsigned long) q);
        init_timer(&q->unplug_timer);
        setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
        INIT_LIST_HEAD(&q->timeout_list);
+       INIT_LIST_HEAD(&q->pending_flushes);
        INIT_WORK(&q->unplug_work, blk_unplug_work);
  
        kobject_init(&q->kobj, &blk_queue_ktype);
@@@ -807,16 -796,11 +807,16 @@@ static struct request *get_request(stru
        rl->starved[is_sync] = 0;
  
        priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
 -      if (priv)
 +      if (priv) {
                rl->elvpriv++;
  
 -      if (blk_queue_io_stat(q))
 -              rw_flags |= REQ_IO_STAT;
 +              /*
 +               * Don't do stats for non-priv requests
 +               */
 +              if (blk_queue_io_stat(q))
 +                      rw_flags |= REQ_IO_STAT;
 +      }
 +
        spin_unlock_irq(q->queue_lock);
  
        rq = blk_alloc_request(q, rw_flags, priv, gfp_mask);
@@@ -1053,22 -1037,6 +1053,6 @@@ void blk_insert_request(struct request_
  }
  EXPORT_SYMBOL(blk_insert_request);
  
- /*
-  * add-request adds a request to the linked list.
-  * queue lock is held and interrupts disabled, as we muck with the
-  * request queue list.
-  */
- static inline void add_request(struct request_queue *q, struct request *req)
- {
-       drive_stat_acct(req, 1);
-       /*
-        * elevator indicated where it wants this request to be
-        * inserted at elevator_merge time
-        */
-       __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0);
- }
  static void part_round_stats_single(int cpu, struct hd_struct *part,
                                    unsigned long now)
  {
@@@ -1217,13 -1185,16 +1201,16 @@@ static int __make_request(struct reques
        const bool sync = !!(bio->bi_rw & REQ_SYNC);
        const bool unplug = !!(bio->bi_rw & REQ_UNPLUG);
        const unsigned long ff = bio->bi_rw & REQ_FAILFAST_MASK;
+       int where = ELEVATOR_INSERT_SORT;
        int rw_flags;
  
-       if ((bio->bi_rw & REQ_HARDBARRIER) &&
-           (q->next_ordered == QUEUE_ORDERED_NONE)) {
+       /* REQ_HARDBARRIER is no more */
+       if (WARN_ONCE(bio->bi_rw & REQ_HARDBARRIER,
+               "block: HARDBARRIER is deprecated, use FLUSH/FUA instead\n")) {
                bio_endio(bio, -EOPNOTSUPP);
                return 0;
        }
        /*
         * low level driver can indicate that it wants pages above a
         * certain limit bounced to low memory (ie for highmem, or even
  
        spin_lock_irq(q->queue_lock);
  
-       if (unlikely((bio->bi_rw & REQ_HARDBARRIER)) || elv_queue_empty(q))
+       if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
+               where = ELEVATOR_INSERT_FRONT;
+               goto get_rq;
+       }
+       if (elv_queue_empty(q))
                goto get_rq;
  
        el_ret = elv_merge(q, &req, bio);
@@@ -1330,7 -1306,10 +1322,10 @@@ get_rq
                req->cpu = blk_cpu_to_group(smp_processor_id());
        if (queue_should_plug(q) && elv_queue_empty(q))
                blk_plug_device(q);
-       add_request(q, req);
+       /* insert the request into the elevator */
+       drive_stat_acct(req, 1);
+       __elv_add_request(q, req, where, 0);
  out:
        if (unplug || !queue_should_plug(q))
                __generic_unplug_device(q);
@@@ -1530,6 -1509,19 +1525,19 @@@ static inline void __generic_make_reque
                if (bio_check_eod(bio, nr_sectors))
                        goto end_io;
  
+               /*
+                * Filter flush bio's early so that make_request based
+                * drivers without flush support don't have to worry
+                * about them.
+                */
+               if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
+                       bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
+                       if (!nr_sectors) {
+                               err = 0;
+                               goto end_io;
+                       }
+               }
                if ((bio->bi_rw & REQ_DISCARD) &&
                    (!blk_queue_discard(q) ||
                     ((bio->bi_rw & REQ_SECURE) &&
                        goto end_io;
                }
  
 +              blk_throtl_bio(q, &bio);
 +
 +              /*
 +               * If bio = NULL, bio has been throttled and will be submitted
 +               * later.
 +               */
 +              if (!bio)
 +                      break;
 +
                trace_block_bio_queue(q, bio);
  
                ret = q->make_request_fn(q, bio);
@@@ -1637,12 -1620,11 +1645,12 @@@ void submit_bio(int rw, struct bio *bio
  
                if (unlikely(block_dump)) {
                        char b[BDEVNAME_SIZE];
 -                      printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n",
 +                      printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
                        current->comm, task_pid_nr(current),
                                (rw & WRITE) ? "WRITE" : "READ",
                                (unsigned long long)bio->bi_sector,
 -                              bdevname(bio->bi_bdev, b));
 +                              bdevname(bio->bi_bdev, b),
 +                              count);
                }
        }
  
@@@ -1785,7 -1767,7 +1793,7 @@@ static void blk_account_io_completion(s
                int cpu;
  
                cpu = part_stat_lock();
 -              part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
 +              part = req->part;
                part_stat_add(cpu, part, sectors[rw], bytes >> 9);
                part_stat_unlock();
        }
  static void blk_account_io_done(struct request *req)
  {
        /*
-        * Account IO completion.  bar_rq isn't accounted as a normal
-        * IO on queueing nor completion.  Accounting the containing
-        * request is enough.
+        * Account IO completion.  flush_rq isn't accounted as a
+        * normal IO on queueing nor completion.  Accounting the
+        * containing request is enough.
         */
-       if (blk_do_io_stat(req) && req != &req->q->bar_rq) {
+       if (blk_do_io_stat(req) && req != &req->q->flush_rq) {
                unsigned long duration = jiffies - req->start_time;
                const int rw = rq_data_dir(req);
                struct hd_struct *part;
                int cpu;
  
                cpu = part_stat_lock();
 -              part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
 +              part = req->part;
  
                part_stat_inc(cpu, part, ios[rw]);
                part_stat_add(cpu, part, ticks[rw], duration);
@@@ -2523,9 -2505,7 +2531,7 @@@ EXPORT_SYMBOL_GPL(blk_rq_unprep_clone)
  static void __blk_rq_prep_clone(struct request *dst, struct request *src)
  {
        dst->cpu = src->cpu;
-       dst->cmd_flags = (rq_data_dir(src) | REQ_NOMERGE);
-       if (src->cmd_flags & REQ_DISCARD)
-               dst->cmd_flags |= REQ_DISCARD;
+       dst->cmd_flags = (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE;
        dst->cmd_type = src->cmd_type;
        dst->__sector = blk_rq_pos(src);
        dst->__data_len = blk_rq_bytes(src);
@@@ -2605,13 -2585,6 +2611,13 @@@ int kblockd_schedule_work(struct reques
  }
  EXPORT_SYMBOL(kblockd_schedule_work);
  
 +int kblockd_schedule_delayed_work(struct request_queue *q,
 +                      struct delayed_work *dwork, unsigned long delay)
 +{
 +      return queue_delayed_work(kblockd_workqueue, dwork, delay);
 +}
 +EXPORT_SYMBOL(kblockd_schedule_delayed_work);
 +
  int __init blk_dev_init(void)
  {
        BUILD_BUG_ON(__REQ_NR_BITS > 8 *
diff --combined block/blk-settings.c
index 315b88c8cbbbbc5e6cefd00361d988a4c0b58a8a,9b18afcfe9257145ed9ec7abeb1298929f854ec4..701859fb9647c31a505f0218800744e3e6d0a775
@@@ -111,7 -111,6 +111,7 @@@ EXPORT_SYMBOL_GPL(blk_queue_lld_busy)
  void blk_set_default_limits(struct queue_limits *lim)
  {
        lim->max_segments = BLK_MAX_SEGMENTS;
 +      lim->max_integrity_segments = 0;
        lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
        lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
        lim->max_sectors = BLK_DEF_MAX_SECTORS;
@@@ -214,7 -213,7 +214,7 @@@ void blk_queue_bounce_limit(struct requ
         */
        if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT))
                dma = 1;
 -      q->limits.bounce_pfn = max_low_pfn;
 +      q->limits.bounce_pfn = max(max_low_pfn, b_pfn);
  #else
        if (b_pfn < blk_max_low_pfn)
                dma = 1;
@@@ -344,7 -343,7 +344,7 @@@ EXPORT_SYMBOL(blk_queue_logical_block_s
   *   hardware can operate on without reverting to read-modify-write
   *   operations.
   */
 -void blk_queue_physical_block_size(struct request_queue *q, unsigned short size)
 +void blk_queue_physical_block_size(struct request_queue *q, unsigned int size)
  {
        q->limits.physical_block_size = size;
  
@@@ -456,6 -455,11 +456,6 @@@ void blk_queue_io_opt(struct request_qu
  }
  EXPORT_SYMBOL(blk_queue_io_opt);
  
 -/*
 - * Returns the minimum that is _not_ zero, unless both are zero.
 - */
 -#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
 -
  /**
   * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers
   * @t:        the stacking driver (top)
@@@ -510,8 -514,6 +510,8 @@@ int blk_stack_limits(struct queue_limit
                                            b->seg_boundary_mask);
  
        t->max_segments = min_not_zero(t->max_segments, b->max_segments);
 +      t->max_integrity_segments = min_not_zero(t->max_integrity_segments,
 +                                               b->max_integrity_segments);
  
        t->max_segment_size = min_not_zero(t->max_segment_size,
                                           b->max_segment_size);
@@@ -792,6 -794,26 +792,26 @@@ void blk_queue_update_dma_alignment(str
  }
  EXPORT_SYMBOL(blk_queue_update_dma_alignment);
  
+ /**
+  * blk_queue_flush - configure queue's cache flush capability
+  * @q:                the request queue for the device
+  * @flush:    0, REQ_FLUSH or REQ_FLUSH | REQ_FUA
+  *
+  * Tell block layer cache flush capability of @q.  If it supports
+  * flushing, REQ_FLUSH should be set.  If it supports bypassing
+  * write cache for individual writes, REQ_FUA should be set.
+  */
+ void blk_queue_flush(struct request_queue *q, unsigned int flush)
+ {
+       WARN_ON_ONCE(flush & ~(REQ_FLUSH | REQ_FUA));
+       if (WARN_ON_ONCE(!(flush & REQ_FLUSH) && (flush & REQ_FUA)))
+               flush &= ~REQ_FUA;
+       q->flush_flags = flush & (REQ_FLUSH | REQ_FUA);
+ }
+ EXPORT_SYMBOL_GPL(blk_queue_flush);
  static int __init blk_settings_init(void)
  {
        blk_max_low_pfn = max_low_pfn - 1;
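
The blk_queue_flush() helper documented above is how a driver now advertises its cache behaviour, in place of the removed blk_queue_ordered() calls (see the brd.c hunk below). A minimal, hypothetical driver-side sketch; the function name and parameters are invented for illustration:

#include <linux/blkdev.h>

/*
 * Hypothetical example: tell the block layer what the device can do.
 * A device with a volatile write cache that also supports forced unit
 * access advertises both flags.
 */
static void example_setup_cache_flags(struct request_queue *q,
                                      bool volatile_cache, bool supports_fua)
{
        unsigned int flush = 0;

        if (volatile_cache) {
                flush |= REQ_FLUSH;
                if (supports_fua)
                        flush |= REQ_FUA;
        }
        /* REQ_FUA without REQ_FLUSH trips the WARN_ON_ONCE above. */
        blk_queue_flush(q, flush);
}

A driver with no volatile cache passes 0 (or never calls this at all); the filtering added to __generic_make_request() above then strips REQ_FLUSH/REQ_FUA from incoming bios when q->flush_flags is zero.
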
diff --combined block/blk.h
index f864012ec300d632d34c96ae5d0a2e991873cf8c,faf94f2acb12d0b6d72f8ba6e172c4fbc48ad831..1e675e5ade02615ee5eba4293a6f6b07cbf3c9c1
@@@ -51,6 -51,8 +51,8 @@@ static inline void blk_clear_rq_complet
   */
  #define ELV_ON_HASH(rq)               (!hlist_unhashed(&(rq)->hash))
  
+ struct request *blk_do_flush(struct request_queue *q, struct request *rq);
  static inline struct request *__elv_next_request(struct request_queue *q)
  {
        struct request *rq;
        while (1) {
                while (!list_empty(&q->queue_head)) {
                        rq = list_entry_rq(q->queue_head.next);
-                       if (blk_do_ordered(q, &rq))
+                       if (!(rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) ||
+                           rq == &q->flush_rq)
+                               return rq;
+                       rq = blk_do_flush(q, rq);
+                       if (rq)
                                return rq;
                }
  
@@@ -110,6 -116,10 +116,6 @@@ void blk_queue_congestion_threshold(str
  
  int blk_dev_init(void);
  
 -void elv_quiesce_start(struct request_queue *q);
 -void elv_quiesce_end(struct request_queue *q);
 -
 -
  /*
   * Return the threshold (number of used requests) at which the queue is
   * considered to be congested.  It include a little hysteresis to keep the
@@@ -128,6 -138,14 +134,6 @@@ static inline int queue_congestion_off_
        return q->nr_congestion_off;
  }
  
 -#if defined(CONFIG_BLK_DEV_INTEGRITY)
 -
 -#define rq_for_each_integrity_segment(bvl, _rq, _iter)                \
 -      __rq_for_each_bio(_iter.bio, _rq)                       \
 -              bip_for_each_vec(bvl, _iter.bio->bi_integrity, _iter.i)
 -
 -#endif /* BLK_DEV_INTEGRITY */
 -
  static inline int blk_cpu_to_group(int cpu)
  {
        int group = NR_CPUS;
diff --combined block/ioctl.c
index 2c15fe0912c4c4287b0fbde0c08933d6ad27aa64,cb2b9099862be9c526972b3772235ec54b1fb4e2..d724ceb1d46535fee2fa3e65ca57cb09cb88997d
@@@ -62,7 -62,7 +62,7 @@@ static int blkpg_ioctl(struct block_dev
  
                        /* all seems OK */
                        part = add_partition(disk, partno, start, length,
 -                                           ADDPART_FLAG_NONE);
 +                                           ADDPART_FLAG_NONE, NULL);
                        mutex_unlock(&bdev->bd_mutex);
                        return IS_ERR(part) ? PTR_ERR(part) : 0;
                case BLKPG_DEL_PARTITION:
@@@ -116,7 -116,7 +116,7 @@@ static int blkdev_reread_part(struct bl
  static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
                             uint64_t len, int secure)
  {
-       unsigned long flags = BLKDEV_IFL_WAIT;
+       unsigned long flags = 0;
  
        if (start & 511)
                return -EINVAL;
        if (start + len > (bdev->bd_inode->i_size >> 9))
                return -EINVAL;
        if (secure)
-               flags |= BLKDEV_IFL_SECURE;
+               flags |= BLKDEV_DISCARD_SECURE;
        return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, flags);
  }
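
With the BLKDEV_IFL_* flags gone, waiting on discard completion is implicit and the only modifier left in this path is BLKDEV_DISCARD_SECURE. A hedged sketch of a caller under the new convention; the wrapper name is invented, and the range is given in 512-byte sectors as blkdev_issue_discard() expects:

#include <linux/blkdev.h>

/*
 * Illustrative only: discard a sector range, optionally as a secure
 * discard.  There is no wait flag any more; the call blocks until the
 * discard has been issued and completed.
 */
static int example_discard_sectors(struct block_device *bdev,
                                   sector_t sector, sector_t nr_sects,
                                   bool secure)
{
        unsigned long flags = secure ? BLKDEV_DISCARD_SECURE : 0;

        return blkdev_issue_discard(bdev, sector, nr_sects,
                                    GFP_KERNEL, flags);
}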
  
diff --combined drivers/block/brd.c
index 82bfd5bb4a973f316de41fb6fd05bd41226c322a,fa33f97722babc52ca0a9405100e5a8dfc899ca2..b7f51e4594f8660f0a54472d264d957dbe533312
@@@ -15,7 -15,7 +15,7 @@@
  #include <linux/blkdev.h>
  #include <linux/bio.h>
  #include <linux/highmem.h>
 -#include <linux/smp_lock.h>
 +#include <linux/mutex.h>
  #include <linux/radix-tree.h>
  #include <linux/buffer_head.h> /* invalidate_bh_lrus() */
  #include <linux/slab.h>
@@@ -55,7 -55,6 +55,7 @@@ struct brd_device 
  /*
   * Look up and return a brd's page for a given sector.
   */
 +static DEFINE_MUTEX(brd_mutex);
  static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
  {
        pgoff_t idx;
@@@ -403,7 -402,7 +403,7 @@@ static int brd_ioctl(struct block_devic
         * ram device BLKFLSBUF has special semantics, we want to actually
         * release and destroy the ramdisk data.
         */
 -      lock_kernel();
 +      mutex_lock(&brd_mutex);
        mutex_lock(&bdev->bd_mutex);
        error = -EBUSY;
        if (bdev->bd_openers <= 1) {
                error = 0;
        }
        mutex_unlock(&bdev->bd_mutex);
 -      unlock_kernel();
 +      mutex_unlock(&brd_mutex);
  
        return error;
  }
@@@ -483,7 -482,6 +483,6 @@@ static struct brd_device *brd_alloc(in
        if (!brd->brd_queue)
                goto out_free_dev;
        blk_queue_make_request(brd->brd_queue, brd_make_request);
-       blk_queue_ordered(brd->brd_queue, QUEUE_ORDERED_TAG);
        blk_queue_max_hw_sectors(brd->brd_queue, 1024);
        blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY);
  
diff --combined drivers/block/drbd/drbd_int.h
index c07c370c4c822d53614574b3256a2a922cb1e4c0,c2ef476f57119f87e58b0181539bd985de2b2f22..9bdcf4393c0aa9525c9c7918355cb9b985809a73
@@@ -337,25 -337,13 +337,25 @@@ static inline void bm_xfer_ctx_bit_to_w
   * NOTE that the payload starts at a long aligned offset,
   * regardless of 32 or 64 bit arch!
   */
 -struct p_header {
 +struct p_header80 {
        u32       magic;
        u16       command;
        u16       length;       /* bytes of data after this header */
        u8        payload[0];
  } __packed;
 -/* 8 bytes. packet FIXED for the next century! */
 +
 +/* Header for big packets, Used for data packets exceeding 64kB */
 +struct p_header95 {
 +      u16       magic;        /* use DRBD_MAGIC_BIG here */
 +      u16       command;
 +      u32       length;       /* Use only 24 bits of that. Ignore the highest 8 bit. */
 +      u8        payload[0];
 +} __packed;
 +
 +union p_header {
 +      struct p_header80 h80;
 +      struct p_header95 h95;
 +};
  
  /*
   * short commands, packets without payload, plain p_header:
   */
  
  /* these defines must not be changed without changing the protocol version */
 -#define DP_HARDBARRIER              1
 -#define DP_RW_SYNC          2
 +#define DP_HARDBARRIER              1 /* depricated */
 +#define DP_RW_SYNC          2 /* equals REQ_SYNC    */
  #define DP_MAY_SET_IN_SYNC    4
 +#define DP_UNPLUG             8 /* equals REQ_UNPLUG  */
 +#define DP_FUA               16 /* equals REQ_FUA     */
 +#define DP_FLUSH             32 /* equals REQ_FLUSH   */
 +#define DP_DISCARD           64 /* equals REQ_DISCARD */
  
  struct p_data {
 -      struct p_header head;
 +      union p_header head;
        u64         sector;    /* 64 bits sector number */
        u64         block_id;  /* to identify the request in protocol B&C */
        u32         seq_num;
   *   P_DATA_REQUEST, P_RS_DATA_REQUEST
   */
  struct p_block_ack {
 -      struct p_header head;
 +      struct p_header80 head;
        u64         sector;
        u64         block_id;
        u32         blksize;
  
  
  struct p_block_req {
 -      struct p_header head;
 +      struct p_header80 head;
        u64 sector;
        u64 block_id;
        u32 blksize;
   */
  
  struct p_handshake {
 -      struct p_header head;   /* 8 bytes */
 +      struct p_header80 head; /* 8 bytes */
        u32 protocol_min;
        u32 feature_flags;
        u32 protocol_max;
  /* 80 bytes, FIXED for the next century */
  
  struct p_barrier {
 -      struct p_header head;
 +      struct p_header80 head;
        u32 barrier;    /* barrier number _handle_ only */
        u32 pad;        /* to multiple of 8 Byte */
  } __packed;
  
  struct p_barrier_ack {
 -      struct p_header head;
 +      struct p_header80 head;
        u32 barrier;
        u32 set_size;
  } __packed;
  
  struct p_rs_param {
 -      struct p_header head;
 +      struct p_header80 head;
        u32 rate;
  
              /* Since protocol version 88 and higher. */
  } __packed;
  
  struct p_rs_param_89 {
 -      struct p_header head;
 +      struct p_header80 head;
        u32 rate;
          /* protocol version 89: */
        char verify_alg[SHARED_SECRET_MAX];
        char csums_alg[SHARED_SECRET_MAX];
  } __packed;
  
 +struct p_rs_param_95 {
 +      struct p_header80 head;
 +      u32 rate;
 +      char verify_alg[SHARED_SECRET_MAX];
 +      char csums_alg[SHARED_SECRET_MAX];
 +      u32 c_plan_ahead;
 +      u32 c_delay_target;
 +      u32 c_fill_target;
 +      u32 c_max_rate;
 +} __packed;
 +
  enum drbd_conn_flags {
        CF_WANT_LOSE = 1,
        CF_DRY_RUN = 2,
  };
  
  struct p_protocol {
 -      struct p_header head;
 +      struct p_header80 head;
        u32 protocol;
        u32 after_sb_0p;
        u32 after_sb_1p;
  } __packed;
  
  struct p_uuids {
 -      struct p_header head;
 +      struct p_header80 head;
        u64 uuid[UI_EXTENDED_SIZE];
  } __packed;
  
  struct p_rs_uuid {
 -      struct p_header head;
 +      struct p_header80 head;
        u64         uuid;
  } __packed;
  
  struct p_sizes {
 -      struct p_header head;
 +      struct p_header80 head;
        u64         d_size;  /* size of disk */
        u64         u_size;  /* user requested size */
        u64         c_size;  /* current exported size */
  } __packed;
  
  struct p_state {
 -      struct p_header head;
 +      struct p_header80 head;
        u32         state;
  } __packed;
  
  struct p_req_state {
 -      struct p_header head;
 +      struct p_header80 head;
        u32         mask;
        u32         val;
  } __packed;
  
  struct p_req_state_reply {
 -      struct p_header head;
 +      struct p_header80 head;
        u32         retcode;
  } __packed;
  
@@@ -544,7 -517,7 +544,7 @@@ struct p_drbd06_param 
  } __packed;
  
  struct p_discard {
 -      struct p_header head;
 +      struct p_header80 head;
        u64         block_id;
        u32         seq_num;
        u32         pad;
@@@ -560,7 -533,7 +560,7 @@@ enum drbd_bitmap_code 
  };
  
  struct p_compressed_bm {
 -      struct p_header head;
 +      struct p_header80 head;
        /* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code
         * (encoding & 0x80): polarity (set/unset) of first runlength
         * ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits
        u8 code[0];
  } __packed;
  
 -struct p_delay_probe {
 -      struct p_header head;
 -      u32     seq_num; /* sequence number to match the two probe packets */
 -      u32     offset;  /* usecs the probe got sent after the reference time point */
 +struct p_delay_probe93 {
 +      struct p_header80 head;
 +      u32     seq_num; /* sequence number to match the two probe packets */
 +      u32     offset;  /* usecs the probe got sent after the reference time point */
  } __packed;
  
  /* DCBP: Drbd Compressed Bitmap Packet ... */
@@@ -621,7 -594,7 +621,7 @@@ DCBP_set_pad_bits(struct p_compressed_b
   * so we need to use the fixed size 4KiB page size
   * most architechtures have used for a long time.
   */
 -#define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header))
 +#define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header80))
  #define BM_PACKET_WORDS (BM_PACKET_PAYLOAD_BYTES/sizeof(long))
  #define BM_PACKET_VLI_BYTES_MAX (4096 - sizeof(struct p_compressed_bm))
  #if (PAGE_SIZE < 4096)
  #endif
  
  union p_polymorph {
 -        struct p_header          header;
 +        union p_header           header;
          struct p_handshake       handshake;
          struct p_data            data;
          struct p_block_ack       block_ack;
          struct p_barrier         barrier;
          struct p_barrier_ack     barrier_ack;
          struct p_rs_param_89     rs_param_89;
 +        struct p_rs_param_95     rs_param_95;
          struct p_protocol        protocol;
          struct p_sizes           sizes;
          struct p_uuids           uuids;
          struct p_req_state       req_state;
          struct p_req_state_reply req_state_reply;
          struct p_block_req       block_req;
 +      struct p_delay_probe93   delay_probe93;
 +      struct p_rs_uuid         rs_uuid;
  } __packed;
  
  /**********************************************************************/
@@@ -727,7 -697,7 +727,7 @@@ struct drbd_tl_epoch 
        struct list_head requests; /* requests before */
        struct drbd_tl_epoch *next; /* pointer to the next barrier */
        unsigned int br_number;  /* the barriers identifier. */
 -      int n_req;      /* number of requests attached before this barrier */
 +      int n_writes;   /* number of requests attached before this barrier */
  };
  
  struct drbd_request;
@@@ -777,7 -747,7 +777,7 @@@ struct digest_info 
  struct drbd_epoch_entry {
        struct drbd_work w;
        struct hlist_node colision;
 -      struct drbd_epoch *epoch;
 +      struct drbd_epoch *epoch; /* for writes */
        struct drbd_conf *mdev;
        struct page *pages;
        atomic_t pending_bios;
        /* see comments on ee flag bits below */
        unsigned long flags;
        sector_t sector;
 -      u64 block_id;
 +      union {
 +              u64 block_id;
 +              struct digest_info *digest;
 +      };
  };
  
  /* ee flag bits.
@@@ -814,16 -781,12 +814,16 @@@ enum 
         * if any of those fail, we set this flag atomically
         * from the endio callback */
        __EE_WAS_ERROR,
 +
 +      /* This ee has a pointer to a digest instead of a block id */
 +      __EE_HAS_DIGEST,
  };
  #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
  #define EE_MAY_SET_IN_SYNC     (1<<__EE_MAY_SET_IN_SYNC)
  #define EE_IS_BARRIER          (1<<__EE_IS_BARRIER)
  #define       EE_RESUBMITTED         (1<<__EE_RESUBMITTED)
  #define EE_WAS_ERROR           (1<<__EE_WAS_ERROR)
 +#define EE_HAS_DIGEST          (1<<__EE_HAS_DIGEST)
  
  /* global flag bits */
  enum {
        SIGNAL_ASENDER,         /* whether asender wants to be interrupted */
        SEND_PING,              /* whether asender should send a ping asap */
  
 -      STOP_SYNC_TIMER,        /* tell timer to cancel itself */
        UNPLUG_QUEUED,          /* only relevant with kernel 2.4 */
        UNPLUG_REMOTE,          /* sending a "UnplugRemote" could help */
        MD_DIRTY,               /* current uuids and flags not yet on disk */
        BITMAP_IO,              /* suspend application io;
                                   once no more io in flight, start bitmap io */
        BITMAP_IO_QUEUED,       /* Started bitmap IO */
 +      GO_DISKLESS,            /* Disk failed, local_cnt reached zero, we are going diskless */
        RESYNC_AFTER_NEG,       /* Resync after online grow after the attach&negotiate finished. */
        NET_CONGESTED,          /* The data socket is congested */
  
                                 * the peer, if it changed there as well. */
        CONN_DRY_RUN,           /* Expect disconnect after resync handshake. */
        GOT_PING_ACK,           /* set when we receive a ping_ack packet, misc wait gets woken */
 +      NEW_CUR_UUID,           /* Create new current UUID when thawing IO */
 +      AL_SUSPENDED,           /* Activity logging is currently suspended. */
  };
  
  struct drbd_bitmap; /* opaque for drbd_conf */
  
  /* THINK maybe we actually want to use the default "event/%s" worker threads
   * or similar in linux 2.6, which uses per cpu data and threads.
 - *
 - * To be general, this might need a spin_lock member.
 - * For now, please use the mdev->req_lock to protect list_head,
 - * see drbd_queue_work below.
   */
  struct drbd_work_queue {
        struct list_head q;
@@@ -950,12 -915,6 +950,12 @@@ enum write_ordering_e 
        WO_bio_barrier
  };
  
 +struct fifo_buffer {
 +      int *values;
 +      unsigned int head_index;
 +      unsigned int size;
 +};
 +
  struct drbd_conf {
        /* things that are stored as / read from meta data on disk */
        unsigned long flags;
        unsigned int ko_count;
        struct drbd_work  resync_work,
                          unplug_work,
 +                        go_diskless,
                          md_sync_work;
        struct timer_list resync_timer;
        struct timer_list md_sync_timer;
 +#ifdef DRBD_DEBUG_MD_SYNC
 +      struct {
 +              unsigned int line;
 +              const char* func;
 +      } last_md_mark_dirty;
 +#endif
  
        /* Used after attach while negotiating new disk state. */
        union drbd_state new_state_tmp;
        union drbd_state state;
        wait_queue_head_t misc_wait;
        wait_queue_head_t state_wait;  /* upon each state change. */
 +      wait_queue_head_t net_cnt_wait;
        unsigned int send_cnt;
        unsigned int recv_cnt;
        unsigned int read_cnt;
        unsigned long rs_start;
        /* cumulated time in PausedSyncX state [unit jiffies] */
        unsigned long rs_paused;
 +      /* skipped because csum was equal [unit BM_BLOCK_SIZE] */
 +      unsigned long rs_same_csum;
 +#define DRBD_SYNC_MARKS 8
 +#define DRBD_SYNC_MARK_STEP (3*HZ)
        /* block not up-to-date at mark [unit BM_BLOCK_SIZE] */
 -      unsigned long rs_mark_left;
 +      unsigned long rs_mark_left[DRBD_SYNC_MARKS];
        /* marks's time [unit jiffies] */
 -      unsigned long rs_mark_time;
 -      /* skipped because csum was equeal [unit BM_BLOCK_SIZE] */
 -      unsigned long rs_same_csum;
 +      unsigned long rs_mark_time[DRBD_SYNC_MARKS];
 +      /* current index into rs_mark_{left,time} */
 +      int rs_last_mark;
  
        /* where does the admin want us to start? (sector) */
        sector_t ov_start_sector;
        spinlock_t epoch_lock;
        unsigned int epochs;
        enum write_ordering_e write_ordering;
 -      struct list_head active_ee; /* IO in progress */
 -      struct list_head sync_ee;   /* IO in progress */
 +      struct list_head active_ee; /* IO in progress (P_DATA gets written to disk) */
 +      struct list_head sync_ee;   /* IO in progress (P_RS_DATA_REPLY gets written to disk) */
        struct list_head done_ee;   /* send ack */
 -      struct list_head read_ee;   /* IO in progress */
 +      struct list_head read_ee;   /* IO in progress (any read) */
        struct list_head net_ee;    /* zero-copy network send in progress */
        struct hlist_head *ee_hash; /* is proteced by req_lock! */
        unsigned int ee_hash_s;
        int next_barrier_nr;
        struct hlist_head *app_reads_hash; /* is proteced by req_lock */
        struct list_head resync_reads;
 -      atomic_t pp_in_use;
 +      atomic_t pp_in_use;             /* allocated from page pool */
 +      atomic_t pp_in_use_by_net;      /* sendpage()d, still referenced by tcp */
        wait_queue_head_t ee_wait;
        struct page *md_io_page;        /* one page buffer for md_io */
        struct page *md_io_tmpp;        /* for logical_block_size != 512 */
        u64 ed_uuid; /* UUID of the exposed data */
        struct mutex state_mutex;
        char congestion_reason;  /* Why we where congested... */
 +      atomic_t rs_sect_in; /* for incoming resync data rate, SyncTarget */
 +      atomic_t rs_sect_ev; /* for submitted resync data rate, both */
 +      int rs_last_sect_ev; /* counter to compare with */
 +      int rs_last_events;  /* counter of read or write "events" (unit sectors)
 +                            * on the lower level device when we last looked. */
 +      int c_sync_rate; /* current resync rate after syncer throttle magic */
 +      struct fifo_buffer rs_plan_s; /* correction values of resync planer */
 +      int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */
 +      int rs_planed;    /* resync sectors already planed */
  };
  
  static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
@@@ -1201,8 -1138,6 +1201,8 @@@ extern void drbd_free_resources(struct 
  extern void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
                       unsigned int set_size);
  extern void tl_clear(struct drbd_conf *mdev);
 +enum drbd_req_event;
 +extern void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what);
  extern void _tl_add_barrier(struct drbd_conf *, struct drbd_tl_epoch *);
  extern void drbd_free_sock(struct drbd_conf *mdev);
  extern int drbd_send(struct drbd_conf *mdev, struct socket *sock,
@@@ -1215,12 -1150,12 +1215,12 @@@ extern int drbd_send_sizes(struct drbd_
  extern int _drbd_send_state(struct drbd_conf *mdev);
  extern int drbd_send_state(struct drbd_conf *mdev);
  extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
 -                      enum drbd_packets cmd, struct p_header *h,
 +                      enum drbd_packets cmd, struct p_header80 *h,
                        size_t size, unsigned msg_flags);
  #define USE_DATA_SOCKET 1
  #define USE_META_SOCKET 0
  extern int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
 -                      enum drbd_packets cmd, struct p_header *h,
 +                      enum drbd_packets cmd, struct p_header80 *h,
                        size_t size);
  extern int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd,
                        char *data, size_t size);
@@@ -1232,7 -1167,7 +1232,7 @@@ extern int drbd_send_ack(struct drbd_co
  extern int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
                        struct p_block_req *rp);
  extern int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
 -                      struct p_data *dp);
 +                      struct p_data *dp, int data_size);
  extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
                            sector_t sector, int blksize, u64 block_id);
  extern int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
@@@ -1266,13 -1201,7 +1266,13 @@@ extern void drbd_uuid_set_bm(struct drb
  extern void drbd_md_set_flag(struct drbd_conf *mdev, int flags) __must_hold(local);
  extern void drbd_md_clear_flag(struct drbd_conf *mdev, int flags)__must_hold(local);
  extern int drbd_md_test_flag(struct drbd_backing_dev *, int);
 +#ifndef DRBD_DEBUG_MD_SYNC
  extern void drbd_md_mark_dirty(struct drbd_conf *mdev);
 +#else
 +#define drbd_md_mark_dirty(m) drbd_md_mark_dirty_(m, __LINE__ , __func__ )
 +extern void drbd_md_mark_dirty_(struct drbd_conf *mdev,
 +              unsigned int line, const char *func);
 +#endif
  extern void drbd_queue_bitmap_io(struct drbd_conf *mdev,
                                 int (*io_fn)(struct drbd_conf *),
                                 void (*done)(struct drbd_conf *, int),
  extern int drbd_bmio_set_n_write(struct drbd_conf *mdev);
  extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev);
  extern int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why);
 +extern void drbd_go_diskless(struct drbd_conf *mdev);
  
  
  /* Meta data layout
@@@ -1336,8 -1264,6 +1336,8 @@@ struct bm_extent 
   * Bit 1 ==> local node thinks this block needs to be synced.
   */
  
 +#define SLEEP_TIME (HZ/10)
 +
  #define BM_BLOCK_SHIFT  12                     /* 4k per bit */
  #define BM_BLOCK_SIZE  (1<<BM_BLOCK_SHIFT)
  /* (9+3) : 512 bytes @ 8 bits; representing 16M storage
  #endif
  
  /* Sector shift value for the "hash" functions of tl_hash and ee_hash tables.
 - * With a value of 6 all IO in one 32K block make it to the same slot of the
 + * With a value of 8 all IO in one 128K block make it to the same slot of the
   * hash table. */
 -#define HT_SHIFT 6
 +#define HT_SHIFT 8
  #define DRBD_MAX_SEGMENT_SIZE (1U<<(9+HT_SHIFT))
  
 +#define DRBD_MAX_SIZE_H80_PACKET (1 << 15) /* The old header only allows packets up to 32Kib data */
 +
  /* Number of elements in the app_reads_hash */
  #define APP_R_HSIZE 15
  
@@@ -1445,7 -1369,6 +1445,7 @@@ extern unsigned long drbd_bm_find_next(
  /* bm_find_next variants for use while you hold drbd_bm_lock() */
  extern unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo);
  extern unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo);
 +extern unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev);
  extern unsigned long drbd_bm_total_weight(struct drbd_conf *mdev);
  extern int drbd_bm_rs_done(struct drbd_conf *mdev);
  /* for receive_bitmap */
@@@ -1498,8 -1421,7 +1498,8 @@@ extern void resync_after_online_grow(st
  extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local);
  extern int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role,
                int force);
 -enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev);
 +extern enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev);
 +extern void drbd_try_outdate_peer_async(struct drbd_conf *mdev);
  extern int drbd_khelper(struct drbd_conf *mdev, char *cmd);
  
  /* drbd_worker.c */
@@@ -1545,12 -1467,10 +1545,12 @@@ extern int w_send_barrier(struct drbd_c
  extern int w_send_read_req(struct drbd_conf *, struct drbd_work *, int);
  extern int w_prev_work_done(struct drbd_conf *, struct drbd_work *, int);
  extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int);
 +extern int w_restart_disk_io(struct drbd_conf *, struct drbd_work *, int);
  
  extern void resync_timer_fn(unsigned long data);
  
  /* drbd_receiver.c */
 +extern int drbd_rs_should_slow_down(struct drbd_conf *mdev);
  extern int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
                const unsigned rw, const int fault_type);
  extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list);
@@@ -1559,10 -1479,7 +1559,10 @@@ extern struct drbd_epoch_entry *drbd_al
                                            sector_t sector,
                                            unsigned int data_size,
                                            gfp_t gfp_mask) __must_hold(local);
 -extern void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e);
 +extern void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
 +              int is_net);
 +#define drbd_free_ee(m,e)     drbd_free_some_ee(m, e, 0)
 +#define drbd_free_net_ee(m,e) drbd_free_some_ee(m, e, 1)
  extern void drbd_wait_ee_list_empty(struct drbd_conf *mdev,
                struct list_head *head);
  extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev,
  extern void drbd_set_recv_tcq(struct drbd_conf *mdev, int tcq_enabled);
  extern void _drbd_clear_done_ee(struct drbd_conf *mdev, struct list_head *to_be_freed);
  extern void drbd_flush_workqueue(struct drbd_conf *mdev);
 +extern void drbd_free_tl_hash(struct drbd_conf *mdev);
  
  /* yes, there is kernel_setsockopt, but only since 2.6.18. we don't need to
   * mess with get_fs/set_fs, we know we are KERNEL_DS always. */
@@@ -1684,8 -1600,6 +1684,8 @@@ void drbd_bcast_ee(struct drbd_conf *md
  #define susp_MASK 1
  #define user_isp_MASK 1
  #define aftr_isp_MASK 1
 +#define susp_nod_MASK 1
 +#define susp_fen_MASK 1
  
  #define NS(T, S) \
        ({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \
@@@ -1941,6 -1855,13 +1941,6 @@@ static inline sector_t drbd_md_ss__(str
        }
  }
  
 -static inline void
 -_drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w)
 -{
 -      list_add_tail(&w->list, &q->q);
 -      up(&q->s);
 -}
 -
  static inline void
  drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w)
  {
@@@ -1978,19 -1899,19 +1978,19 @@@ static inline void request_ping(struct 
  static inline int drbd_send_short_cmd(struct drbd_conf *mdev,
        enum drbd_packets cmd)
  {
 -      struct p_header h;
 +      struct p_header80 h;
        return drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, &h, sizeof(h));
  }
  
  static inline int drbd_send_ping(struct drbd_conf *mdev)
  {
 -      struct p_header h;
 +      struct p_header80 h;
        return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING, &h, sizeof(h));
  }
  
  static inline int drbd_send_ping_ack(struct drbd_conf *mdev)
  {
 -      struct p_header h;
 +      struct p_header80 h;
        return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING_ACK, &h, sizeof(h));
  }
  
@@@ -2092,7 -2013,7 +2092,7 @@@ static inline void inc_unacked(struct d
  static inline void put_net_conf(struct drbd_conf *mdev)
  {
        if (atomic_dec_and_test(&mdev->net_cnt))
 -              wake_up(&mdev->misc_wait);
 +              wake_up(&mdev->net_cnt_wait);
  }
  
  /**
@@@ -2123,14 -2044,10 +2123,14 @@@ static inline int get_net_conf(struct d
  
  static inline void put_ldev(struct drbd_conf *mdev)
  {
 +      int i = atomic_dec_return(&mdev->local_cnt);
        __release(local);
 -      if (atomic_dec_and_test(&mdev->local_cnt))
 +      D_ASSERT(i >= 0);
 +      if (i == 0) {
 +              if (mdev->state.disk == D_FAILED)
 +                      drbd_go_diskless(mdev);
                wake_up(&mdev->misc_wait);
 -      D_ASSERT(atomic_read(&mdev->local_cnt) >= 0);
 +      }
  }
  
  #ifndef __CHECKER__
@@@ -2262,16 -2179,11 +2262,16 @@@ static inline int drbd_state_is_stable(
        return 1;
  }
  
 +static inline int is_susp(union drbd_state s)
 +{
 +      return s.susp || s.susp_nod || s.susp_fen;
 +}
 +
  static inline int __inc_ap_bio_cond(struct drbd_conf *mdev)
  {
        int mxb = drbd_get_max_buffers(mdev);
  
 -      if (mdev->state.susp)
 +      if (is_susp(mdev->state))
                return 0;
        if (test_bit(SUSPEND_IO, &mdev->flags))
                return 0;
@@@ -2409,8 -2321,7 +2409,7 @@@ static inline void drbd_md_flush(struc
        if (test_bit(MD_NO_BARRIER, &mdev->flags))
                return;
  
-       r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL,
-                       BLKDEV_IFL_WAIT);
+       r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL);
        if (r) {
                set_bit(MD_NO_BARRIER, &mdev->flags);
                dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r);
diff --combined drivers/block/drbd/drbd_receiver.c
index 760ae0df92516994c368b3a138d62c3f8a91893b,df15e7f0e7b766e0a031c71a8f3bf7275a7cfc47..efd6169acf2f04bf758c68ceee419549351c7e64
@@@ -241,7 -241,7 +241,7 @@@ static void drbd_kick_lo_and_reclaim_ne
        spin_unlock_irq(&mdev->req_lock);
  
        list_for_each_entry_safe(e, t, &reclaimed, w.list)
 -              drbd_free_ee(mdev, e);
 +              drbd_free_net_ee(mdev, e);
  }
  
  /**
@@@ -298,11 -298,9 +298,11 @@@ static struct page *drbd_pp_alloc(struc
   * Is also used from inside an other spin_lock_irq(&mdev->req_lock);
   * Either links the page chain back to the global pool,
   * or returns all pages to the system. */
 -static void drbd_pp_free(struct drbd_conf *mdev, struct page *page)
 +static void drbd_pp_free(struct drbd_conf *mdev, struct page *page, int is_net)
  {
 +      atomic_t *a = is_net ? &mdev->pp_in_use_by_net : &mdev->pp_in_use;
        int i;
 +
        if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count)
                i = page_chain_free(page);
        else {
                drbd_pp_vacant += i;
                spin_unlock(&drbd_pp_lock);
        }
 -      atomic_sub(i, &mdev->pp_in_use);
 -      i = atomic_read(&mdev->pp_in_use);
 +      i = atomic_sub_return(i, a);
        if (i < 0)
 -              dev_warn(DEV, "ASSERTION FAILED: pp_in_use: %d < 0\n", i);
 +              dev_warn(DEV, "ASSERTION FAILED: %s: %d < 0\n",
 +                      is_net ? "pp_in_use_by_net" : "pp_in_use", i);
        wake_up(&drbd_pp_wait);
  }
  
@@@ -367,6 -365,7 +367,6 @@@ struct drbd_epoch_entry *drbd_alloc_ee(
        e->size = data_size;
        e->flags = 0;
        e->sector = sector;
 -      e->sector = sector;
        e->block_id = id;
  
        return e;
        return NULL;
  }
  
 -void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
 +void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, int is_net)
  {
 -      drbd_pp_free(mdev, e->pages);
 +      if (e->flags & EE_HAS_DIGEST)
 +              kfree(e->digest);
 +      drbd_pp_free(mdev, e->pages, is_net);
        D_ASSERT(atomic_read(&e->pending_bios) == 0);
        D_ASSERT(hlist_unhashed(&e->colision));
        mempool_free(e, drbd_ee_mempool);
@@@ -391,14 -388,13 +391,14 @@@ int drbd_release_ee(struct drbd_conf *m
        LIST_HEAD(work_list);
        struct drbd_epoch_entry *e, *t;
        int count = 0;
 +      int is_net = list == &mdev->net_ee;
  
        spin_lock_irq(&mdev->req_lock);
        list_splice_init(list, &work_list);
        spin_unlock_irq(&mdev->req_lock);
  
        list_for_each_entry_safe(e, t, &work_list, w.list) {
 -              drbd_free_ee(mdev, e);
 +              drbd_free_some_ee(mdev, e, is_net);
                count++;
        }
        return count;
@@@ -427,7 -423,7 +427,7 @@@ static int drbd_process_done_ee(struct 
        spin_unlock_irq(&mdev->req_lock);
  
        list_for_each_entry_safe(e, t, &reclaimed, w.list)
 -              drbd_free_ee(mdev, e);
 +              drbd_free_net_ee(mdev, e);
  
        /* possible callbacks here:
         * e_end_block, and e_end_resync_block, e_send_discard_ack.
@@@ -723,14 -719,14 +723,14 @@@ out
  static int drbd_send_fp(struct drbd_conf *mdev,
        struct socket *sock, enum drbd_packets cmd)
  {
 -      struct p_header *h = (struct p_header *) &mdev->data.sbuf.header;
 +      struct p_header80 *h = &mdev->data.sbuf.header.h80;
  
        return _drbd_send_cmd(mdev, sock, cmd, h, sizeof(*h), 0);
  }
  
  static enum drbd_packets drbd_recv_fp(struct drbd_conf *mdev, struct socket *sock)
  {
 -      struct p_header *h = (struct p_header *) &mdev->data.sbuf.header;
 +      struct p_header80 *h = &mdev->data.rbuf.header.h80;
        int rr;
  
        rr = drbd_recv_short(mdev, sock, h, sizeof(*h), 0);
@@@ -780,6 -776,9 +780,6 @@@ static int drbd_connect(struct drbd_con
  
        D_ASSERT(!mdev->data.socket);
  
 -      if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags))
 -              dev_err(DEV, "CREATE_BARRIER flag was set in drbd_connect - now cleared!\n");
 -
        if (drbd_request_state(mdev, NS(conn, C_WF_CONNECTION)) < SS_SUCCESS)
                return -2;
  
@@@ -928,11 -927,6 +928,11 @@@ retry
  
        drbd_thread_start(&mdev->asender);
  
 +      if (mdev->agreed_pro_version < 95 && get_ldev(mdev)) {
 +              drbd_setup_queue_param(mdev, DRBD_MAX_SIZE_H80_PACKET);
 +              put_ldev(mdev);
 +      }
 +
        if (!drbd_send_protocol(mdev))
                return -1;
        drbd_send_sync_param(mdev, &mdev->sync_conf);
@@@ -952,28 -946,22 +952,28 @@@ out_release_sockets
        return -1;
  }
  
 -static int drbd_recv_header(struct drbd_conf *mdev, struct p_header *h)
 +static int drbd_recv_header(struct drbd_conf *mdev, enum drbd_packets *cmd, unsigned int *packet_size)
  {
 +      union p_header *h = &mdev->data.rbuf.header;
        int r;
  
        r = drbd_recv(mdev, h, sizeof(*h));
 -
        if (unlikely(r != sizeof(*h))) {
                dev_err(DEV, "short read expecting header on sock: r=%d\n", r);
                return FALSE;
 -      };
 -      h->command = be16_to_cpu(h->command);
 -      h->length  = be16_to_cpu(h->length);
 -      if (unlikely(h->magic != BE_DRBD_MAGIC)) {
 -              dev_err(DEV, "magic?? on data m: 0x%lx c: %d l: %d\n",
 -                  (long)be32_to_cpu(h->magic),
 -                  h->command, h->length);
 +      }
 +
 +      if (likely(h->h80.magic == BE_DRBD_MAGIC)) {
 +              *cmd = be16_to_cpu(h->h80.command);
 +              *packet_size = be16_to_cpu(h->h80.length);
 +      } else if (h->h95.magic == BE_DRBD_MAGIC_BIG) {
 +              *cmd = be16_to_cpu(h->h95.command);
 +              *packet_size = be32_to_cpu(h->h95.length);
 +      } else {
 +              dev_err(DEV, "magic?? on data m: 0x%08x c: %d l: %d\n",
 +                  be32_to_cpu(h->h80.magic),
 +                  be16_to_cpu(h->h80.command),
 +                  be16_to_cpu(h->h80.length));
                return FALSE;
        }
        mdev->last_received = jiffies;
@@@ -987,7 -975,7 +987,7 @@@ static enum finish_epoch drbd_flush_aft
  
        if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) {
                rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL,
-                                       NULL, BLKDEV_IFL_WAIT);
+                                       NULL);
                if (rv) {
                        dev_err(DEV, "local disk flush failed with status %d\n", rv);
                        /* would rather check on EOPNOTSUPP, but that is not reliable.
@@@ -1280,12 -1268,17 +1280,12 @@@ int w_e_reissue(struct drbd_conf *mdev
        return 1;
  }
  
 -static int receive_Barrier(struct drbd_conf *mdev, struct p_header *h)
 +static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
  {
        int rv, issue_flush;
 -      struct p_barrier *p = (struct p_barrier *)h;
 +      struct p_barrier *p = &mdev->data.rbuf.barrier;
        struct drbd_epoch *epoch;
  
 -      ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
 -
 -      rv = drbd_recv(mdev, h->payload, h->length);
 -      ERR_IF(rv != h->length) return FALSE;
 -
        inc_unacked(mdev);
  
        if (mdev->net_conf->wire_protocol != DRBD_PROT_C)
@@@ -1464,7 -1457,7 +1464,7 @@@ static int drbd_drain_block(struct drbd
                data_size -= rr;
        }
        kunmap(page);
 -      drbd_pp_free(mdev, page);
 +      drbd_pp_free(mdev, page, 0);
        return rv;
  }
  
@@@ -1569,29 -1562,30 +1569,29 @@@ static int recv_resync_read(struct drbd
        list_add(&e->w.list, &mdev->sync_ee);
        spin_unlock_irq(&mdev->req_lock);
  
 +      atomic_add(data_size >> 9, &mdev->rs_sect_ev);
        if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0)
                return TRUE;
  
 +      /* drbd_submit_ee currently fails for one reason only:
 +       * not being able to allocate enough bios.
 +       * Is dropping the connection going to help? */
 +      spin_lock_irq(&mdev->req_lock);
 +      list_del(&e->w.list);
 +      spin_unlock_irq(&mdev->req_lock);
 +
        drbd_free_ee(mdev, e);
  fail:
        put_ldev(mdev);
        return FALSE;
  }
  
 -static int receive_DataReply(struct drbd_conf *mdev, struct p_header *h)
 +static int receive_DataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
  {
        struct drbd_request *req;
        sector_t sector;
 -      unsigned int header_size, data_size;
        int ok;
 -      struct p_data *p = (struct p_data *)h;
 -
 -      header_size = sizeof(*p) - sizeof(*h);
 -      data_size   = h->length  - header_size;
 -
 -      ERR_IF(data_size == 0) return FALSE;
 -
 -      if (drbd_recv(mdev, h->payload, header_size) != header_size)
 -              return FALSE;
 +      struct p_data *p = &mdev->data.rbuf.data;
  
        sector = be64_to_cpu(p->sector);
  
        return ok;
  }
  
 -static int receive_RSDataReply(struct drbd_conf *mdev, struct p_header *h)
 +static int receive_RSDataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
  {
        sector_t sector;
 -      unsigned int header_size, data_size;
        int ok;
 -      struct p_data *p = (struct p_data *)h;
 -
 -      header_size = sizeof(*p) - sizeof(*h);
 -      data_size   = h->length  - header_size;
 -
 -      ERR_IF(data_size == 0) return FALSE;
 -
 -      if (drbd_recv(mdev, h->payload, header_size) != header_size)
 -              return FALSE;
 +      struct p_data *p = &mdev->data.rbuf.data;
  
        sector = be64_to_cpu(p->sector);
        D_ASSERT(p->block_id == ID_SYNCER);
  
                ok = drbd_drain_block(mdev, data_size);
  
 -              drbd_send_ack_dp(mdev, P_NEG_ACK, p);
 +              drbd_send_ack_dp(mdev, P_NEG_ACK, p, data_size);
        }
  
 +      atomic_add(data_size >> 9, &mdev->rs_sect_in);
 +
        return ok;
  }
  
@@@ -1764,27 -1765,24 +1764,27 @@@ static int drbd_wait_peer_seq(struct dr
        return ret;
  }
  
 +static unsigned long write_flags_to_bio(struct drbd_conf *mdev, u32 dpf)
 +{
 +      if (mdev->agreed_pro_version >= 95)
 +              return  (dpf & DP_RW_SYNC ? REQ_SYNC : 0) |
 +                      (dpf & DP_UNPLUG ? REQ_UNPLUG : 0) |
 +                      (dpf & DP_FUA ? REQ_FUA : 0) |
 +                      (dpf & DP_FLUSH ? REQ_FUA : 0) |
 +                      (dpf & DP_DISCARD ? REQ_DISCARD : 0);
 +      else
 +              return dpf & DP_RW_SYNC ? (REQ_SYNC | REQ_UNPLUG) : 0;
 +}
 +
  /* mirrored write */
 -static int receive_Data(struct drbd_conf *mdev, struct p_header *h)
 +static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
  {
        sector_t sector;
        struct drbd_epoch_entry *e;
 -      struct p_data *p = (struct p_data *)h;
 -      int header_size, data_size;
 +      struct p_data *p = &mdev->data.rbuf.data;
        int rw = WRITE;
        u32 dp_flags;
  
 -      header_size = sizeof(*p) - sizeof(*h);
 -      data_size   = h->length  - header_size;
 -
 -      ERR_IF(data_size == 0) return FALSE;
 -
 -      if (drbd_recv(mdev, h->payload, header_size) != header_size)
 -              return FALSE;
 -
        if (!get_ldev(mdev)) {
                if (__ratelimit(&drbd_ratelimit_state))
                        dev_err(DEV, "Can not write mirrored data block "
                        mdev->peer_seq++;
                spin_unlock(&mdev->peer_seq_lock);
  
 -              drbd_send_ack_dp(mdev, P_NEG_ACK, p);
 +              drbd_send_ack_dp(mdev, P_NEG_ACK, p, data_size);
                atomic_inc(&mdev->current_epoch->epoch_size);
                return drbd_drain_block(mdev, data_size);
        }
        spin_unlock(&mdev->epoch_lock);
  
        dp_flags = be32_to_cpu(p->dp_flags);
 -      if (dp_flags & DP_HARDBARRIER) {
 -              dev_err(DEV, "ASSERT FAILED would have submitted barrier request\n");
 -              /* rw |= REQ_HARDBARRIER; */
 -      }
 -      if (dp_flags & DP_RW_SYNC)
 -              rw |= REQ_SYNC | REQ_UNPLUG;
 +      rw |= write_flags_to_bio(mdev, dp_flags);
 +
        if (dp_flags & DP_MAY_SET_IN_SYNC)
                e->flags |= EE_MAY_SET_IN_SYNC;
  
        if (drbd_submit_ee(mdev, e, rw, DRBD_FAULT_DT_WR) == 0)
                return TRUE;
  
 +      /* drbd_submit_ee currently fails for one reason only:
 +       * not being able to allocate enough bios.
 +       * Is dropping the connection going to help? */
 +      spin_lock_irq(&mdev->req_lock);
 +      list_del(&e->w.list);
 +      hlist_del_init(&e->colision);
 +      spin_unlock_irq(&mdev->req_lock);
 +      if (e->flags & EE_CALL_AL_COMPLETE_IO)
 +              drbd_al_complete_io(mdev, e->sector);
 +
  out_interrupted:
        /* yes, the epoch_size now is imbalanced.
         * but we drop the connection anyways, so we don't have a chance to
        return FALSE;
  }
  
 -static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
 +/* We may throttle resync, if the lower device seems to be busy,
 + * and current sync rate is above c_min_rate.
 + *
 + * To decide whether or not the lower device is busy, we use a scheme similar
 + * to MD RAID is_mddev_idle(): if the partition stats reveal a "significant"
 + * amount (more than 64 sectors) of activity that we cannot account for with
 + * our own resync activity, it obviously is "busy".
 + *
 + * The current sync rate used here uses only the most recent two step marks,
 + * to have a short time average so we can react faster.
 + */
 +int drbd_rs_should_slow_down(struct drbd_conf *mdev)
 +{
 +      struct gendisk *disk = mdev->ldev->backing_bdev->bd_contains->bd_disk;
 +      unsigned long db, dt, dbdt;
 +      int curr_events;
 +      int throttle = 0;
 +
 +      /* feature disabled? */
 +      if (mdev->sync_conf.c_min_rate == 0)
 +              return 0;
 +
 +      curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
 +                    (int)part_stat_read(&disk->part0, sectors[1]) -
 +                      atomic_read(&mdev->rs_sect_ev);
 +      if (!mdev->rs_last_events || curr_events - mdev->rs_last_events > 64) {
 +              unsigned long rs_left;
 +              int i;
 +
 +              mdev->rs_last_events = curr_events;
 +
 +              /* sync speed average over the last 2*DRBD_SYNC_MARK_STEP,
 +               * approx. */
 +              i = (mdev->rs_last_mark + DRBD_SYNC_MARKS-2) % DRBD_SYNC_MARKS;
 +              rs_left = drbd_bm_total_weight(mdev) - mdev->rs_failed;
 +
 +              dt = ((long)jiffies - (long)mdev->rs_mark_time[i]) / HZ;
 +              if (!dt)
 +                      dt++;
 +              db = mdev->rs_mark_left[i] - rs_left;
 +              dbdt = Bit2KB(db/dt);
 +
 +              if (dbdt > mdev->sync_conf.c_min_rate)
 +                      throttle = 1;
 +      }
 +      return throttle;
 +}
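
  [Editorial note: the units in drbd_rs_should_slow_down() above are easy to
  misread, so here is a small standalone sketch, not part of the commit, of the
  same throttle arithmetic with invented numbers. It assumes one bitmap bit
  covers 4 KiB (BM_BLOCK_SHIFT == 12), so Bit2KB(bits) is bits << 2, and that
  c_min_rate is expressed in KiB/s.]

  /* Standalone sketch (not kernel code) of the throttle decision above.
   * Assumption: one bitmap bit covers 4 KiB, so Bit2KB(bits) == bits << 2;
   * c_min_rate is assumed to be in KiB/s.  All numbers are made up. */
  #include <stdio.h>

  #define BM_BLOCK_SHIFT 12
  #define Bit2KB(bits) ((bits) << (BM_BLOCK_SHIFT - 10))

  int main(void)
  {
          unsigned long db = 25600;        /* bitmap bits cleared since mark i */
          unsigned long dt = 4;            /* seconds since mark i (>= 1)      */
          unsigned long c_min_rate = 4000; /* throttle threshold, KiB/s        */

          unsigned long dbdt = Bit2KB(db / dt);   /* current resync rate, KiB/s */

          printf("resync rate: %lu KiB/s, threshold: %lu KiB/s -> %s\n",
                 dbdt, c_min_rate,
                 dbdt > c_min_rate ? "throttle" : "don't throttle");
          return 0;
  }
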
 +
 +
 +static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int digest_size)
  {
        sector_t sector;
        const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
        struct drbd_epoch_entry *e;
        struct digest_info *di = NULL;
 -      int size, digest_size;
 +      int size, verb;
        unsigned int fault_type;
 -      struct p_block_req *p =
 -              (struct p_block_req *)h;
 -      const int brps = sizeof(*p)-sizeof(*h);
 -
 -      if (drbd_recv(mdev, h->payload, brps) != brps)
 -              return FALSE;
 +      struct p_block_req *p = &mdev->data.rbuf.block_req;
  
        sector = be64_to_cpu(p->sector);
        size   = be32_to_cpu(p->blksize);
        }
  
        if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) {
 -              if (__ratelimit(&drbd_ratelimit_state))
 +              verb = 1;
 +              switch (cmd) {
 +              case P_DATA_REQUEST:
 +                      drbd_send_ack_rp(mdev, P_NEG_DREPLY, p);
 +                      break;
 +              case P_RS_DATA_REQUEST:
 +              case P_CSUM_RS_REQUEST:
 +              case P_OV_REQUEST:
 +                      drbd_send_ack_rp(mdev, P_NEG_RS_DREPLY , p);
 +                      break;
 +              case P_OV_REPLY:
 +                      verb = 0;
 +                      dec_rs_pending(mdev);
 +                      drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size, ID_IN_SYNC);
 +                      break;
 +              default:
 +                      dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n",
 +                              cmdname(cmd));
 +              }
 +              if (verb && __ratelimit(&drbd_ratelimit_state))
                        dev_err(DEV, "Can not satisfy peer's read request, "
                            "no local data.\n");
 -              drbd_send_ack_rp(mdev, h->command == P_DATA_REQUEST ? P_NEG_DREPLY :
 -                               P_NEG_RS_DREPLY , p);
 -              return drbd_drain_block(mdev, h->length - brps);
 +
 +              /* drain possible payload */
 +              return drbd_drain_block(mdev, digest_size);
        }
  
        /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
                return FALSE;
        }
  
 -      switch (h->command) {
 +      switch (cmd) {
        case P_DATA_REQUEST:
                e->w.cb = w_e_end_data_req;
                fault_type = DRBD_FAULT_DT_RD;
 -              break;
 +              /* application IO, don't drbd_rs_begin_io */
 +              goto submit;
 +
        case P_RS_DATA_REQUEST:
                e->w.cb = w_e_end_rsdata_req;
                fault_type = DRBD_FAULT_RS_RD;
 -              /* Eventually this should become asynchronously. Currently it
 -               * blocks the whole receiver just to delay the reading of a
 -               * resync data block.
 -               * the drbd_work_queue mechanism is made for this...
 -               */
 -              if (!drbd_rs_begin_io(mdev, sector)) {
 -                      /* we have been interrupted,
 -                       * probably connection lost! */
 -                      D_ASSERT(signal_pending(current));
 -                      goto out_free_e;
 -              }
                break;
  
        case P_OV_REPLY:
        case P_CSUM_RS_REQUEST:
                fault_type = DRBD_FAULT_RS_RD;
 -              digest_size = h->length - brps ;
                di = kmalloc(sizeof(*di) + digest_size, GFP_NOIO);
                if (!di)
                        goto out_free_e;
                di->digest_size = digest_size;
                di->digest = (((char *)di)+sizeof(struct digest_info));
  
 +              e->digest = di;
 +              e->flags |= EE_HAS_DIGEST;
 +
                if (drbd_recv(mdev, di->digest, digest_size) != digest_size)
                        goto out_free_e;
  
 -              e->block_id = (u64)(unsigned long)di;
 -              if (h->command == P_CSUM_RS_REQUEST) {
 +              if (cmd == P_CSUM_RS_REQUEST) {
                        D_ASSERT(mdev->agreed_pro_version >= 89);
                        e->w.cb = w_e_end_csum_rs_req;
 -              } else if (h->command == P_OV_REPLY) {
 +              } else if (cmd == P_OV_REPLY) {
                        e->w.cb = w_e_end_ov_reply;
                        dec_rs_pending(mdev);
 -                      break;
 -              }
 -
 -              if (!drbd_rs_begin_io(mdev, sector)) {
 -                      /* we have been interrupted, probably connection lost! */
 -                      D_ASSERT(signal_pending(current));
 -                      goto out_free_e;
 +                      /* drbd_rs_begin_io done when we sent this request,
 +                       * but accounting still needs to be done. */
 +                      goto submit_for_resync;
                }
                break;
  
        case P_OV_REQUEST:
 -              if (mdev->state.conn >= C_CONNECTED &&
 -                  mdev->state.conn != C_VERIFY_T)
 -                      dev_warn(DEV, "ASSERT FAILED: got P_OV_REQUEST while being %s\n",
 -                              drbd_conn_str(mdev->state.conn));
                if (mdev->ov_start_sector == ~(sector_t)0 &&
                    mdev->agreed_pro_version >= 90) {
                        mdev->ov_start_sector = sector;
                }
                e->w.cb = w_e_end_ov_req;
                fault_type = DRBD_FAULT_RS_RD;
 -              /* Eventually this should become asynchronous. Currently it
 -               * blocks the whole receiver just to delay the reading of a
 -               * resync data block.
 -               * the drbd_work_queue mechanism is made for this...
 -               */
 -              if (!drbd_rs_begin_io(mdev, sector)) {
 -                      /* we have been interrupted,
 -                       * probably connection lost! */
 -                      D_ASSERT(signal_pending(current));
 -                      goto out_free_e;
 -              }
                break;
  
 -
        default:
                dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n",
 -                  cmdname(h->command));
 +                  cmdname(cmd));
                fault_type = DRBD_FAULT_MAX;
 +              goto out_free_e;
        }
  
 -      spin_lock_irq(&mdev->req_lock);
 -      list_add(&e->w.list, &mdev->read_ee);
 -      spin_unlock_irq(&mdev->req_lock);
 +      /* Throttle, drbd_rs_begin_io and submit should become asynchronous
 +       * wrt the receiver, but it is not as straightforward as it may seem.
 +       * Various places in the resync start and stop logic assume resync
 +       * requests are processed in order, requeuing this on the worker thread
 +       * introduces a bunch of new code for synchronization between threads.
 +       *
 +       * Unlimited throttling before drbd_rs_begin_io may stall the resync
 +       * "forever", throttling after drbd_rs_begin_io will lock that extent
 +       * for application writes for the same time.  For now, just throttle
 +       * here, where the rest of the code expects the receiver to sleep for
 +       * a while, anyways.
 +       */
 +
 +      /* Throttle before drbd_rs_begin_io, as that locks out application IO;
 +       * this defers syncer requests for some time, before letting at least
 +       * one request through.  The resync controller on the receiving side
 +       * will adapt to the incoming rate accordingly.
 +       *
 +       * We cannot throttle here if remote is Primary/SyncTarget:
 +       * we would also throttle its application reads.
 +       * In that case, throttling is done on the SyncTarget only.
 +       */
 +      if (mdev->state.peer != R_PRIMARY && drbd_rs_should_slow_down(mdev))
 +              msleep(100);
 +      if (drbd_rs_begin_io(mdev, e->sector))
 +              goto out_free_e;
  
 +submit_for_resync:
 +      atomic_add(size >> 9, &mdev->rs_sect_ev);
 +
 +submit:
        inc_unacked(mdev);
 +      spin_lock_irq(&mdev->req_lock);
 +      list_add_tail(&e->w.list, &mdev->read_ee);
 +      spin_unlock_irq(&mdev->req_lock);
  
        if (drbd_submit_ee(mdev, e, READ, fault_type) == 0)
                return TRUE;
  
 +      /* drbd_submit_ee currently fails for one reason only:
 +       * not being able to allocate enough bios.
 +       * Is dropping the connection going to help? */
 +      spin_lock_irq(&mdev->req_lock);
 +      list_del(&e->w.list);
 +      spin_unlock_irq(&mdev->req_lock);
 +      /* no drbd_rs_complete_io(), we are dropping the connection anyways */
 +
  out_free_e:
 -      kfree(di);
        put_ldev(mdev);
        drbd_free_ee(mdev, e);
        return FALSE;
@@@ -2780,13 -2699,20 +2780,13 @@@ static int cmp_after_sb(enum drbd_after
        return 1;
  }
  
 -static int receive_protocol(struct drbd_conf *mdev, struct p_header *h)
 +static int receive_protocol(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
  {
 -      struct p_protocol *p = (struct p_protocol *)h;
 -      int header_size, data_size;
 +      struct p_protocol *p = &mdev->data.rbuf.protocol;
        int p_proto, p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
        int p_want_lose, p_two_primaries, cf;
        char p_integrity_alg[SHARED_SECRET_MAX] = "";
  
 -      header_size = sizeof(*p) - sizeof(*h);
 -      data_size   = h->length  - header_size;
 -
 -      if (drbd_recv(mdev, h->payload, header_size) != header_size)
 -              return FALSE;
 -
        p_proto         = be32_to_cpu(p->protocol);
        p_after_sb_0p   = be32_to_cpu(p->after_sb_0p);
        p_after_sb_1p   = be32_to_cpu(p->after_sb_1p);
@@@ -2879,46 -2805,39 +2879,46 @@@ struct crypto_hash *drbd_crypto_alloc_d
        return tfm;
  }
  
 -static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h)
 +static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int packet_size)
  {
        int ok = TRUE;
 -      struct p_rs_param_89 *p = (struct p_rs_param_89 *)h;
 +      struct p_rs_param_95 *p = &mdev->data.rbuf.rs_param_95;
        unsigned int header_size, data_size, exp_max_sz;
        struct crypto_hash *verify_tfm = NULL;
        struct crypto_hash *csums_tfm = NULL;
        const int apv = mdev->agreed_pro_version;
 +      int *rs_plan_s = NULL;
 +      int fifo_size = 0;
  
        exp_max_sz  = apv <= 87 ? sizeof(struct p_rs_param)
                    : apv == 88 ? sizeof(struct p_rs_param)
                                        + SHARED_SECRET_MAX
 -                  : /* 89 */    sizeof(struct p_rs_param_89);
 +                  : apv <= 94 ? sizeof(struct p_rs_param_89)
 +                  : /* apv >= 95 */ sizeof(struct p_rs_param_95);
  
 -      if (h->length > exp_max_sz) {
 +      if (packet_size > exp_max_sz) {
                dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n",
 -                  h->length, exp_max_sz);
 +                  packet_size, exp_max_sz);
                return FALSE;
        }
  
        if (apv <= 88) {
 -              header_size = sizeof(struct p_rs_param) - sizeof(*h);
 -              data_size   = h->length  - header_size;
 -      } else /* apv >= 89 */ {
 -              header_size = sizeof(struct p_rs_param_89) - sizeof(*h);
 -              data_size   = h->length  - header_size;
 +              header_size = sizeof(struct p_rs_param) - sizeof(struct p_header80);
 +              data_size   = packet_size  - header_size;
 +      } else if (apv <= 94) {
 +              header_size = sizeof(struct p_rs_param_89) - sizeof(struct p_header80);
 +              data_size   = packet_size  - header_size;
 +              D_ASSERT(data_size == 0);
 +      } else {
 +              header_size = sizeof(struct p_rs_param_95) - sizeof(struct p_header80);
 +              data_size   = packet_size  - header_size;
                D_ASSERT(data_size == 0);
        }
  
        /* initialize verify_alg and csums_alg */
        memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
  
 -      if (drbd_recv(mdev, h->payload, header_size) != header_size)
 +      if (drbd_recv(mdev, &p->head.payload, header_size) != header_size)
                return FALSE;
  
        mdev->sync_conf.rate      = be32_to_cpu(p->rate);
                        }
                }
  
 +              if (apv > 94) {
 +                      mdev->sync_conf.rate      = be32_to_cpu(p->rate);
 +                      mdev->sync_conf.c_plan_ahead = be32_to_cpu(p->c_plan_ahead);
 +                      mdev->sync_conf.c_delay_target = be32_to_cpu(p->c_delay_target);
 +                      mdev->sync_conf.c_fill_target = be32_to_cpu(p->c_fill_target);
 +                      mdev->sync_conf.c_max_rate = be32_to_cpu(p->c_max_rate);
 +
 +                      fifo_size = (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ;
 +                      if (fifo_size != mdev->rs_plan_s.size && fifo_size > 0) {
 +                              rs_plan_s   = kzalloc(sizeof(int) * fifo_size, GFP_KERNEL);
 +                              if (!rs_plan_s) {
 +                                      dev_err(DEV, "kmalloc of fifo_buffer failed");
 +                                      goto disconnect;
 +                              }
 +                      }
 +              }
  
                spin_lock(&mdev->peer_seq_lock);
                /* lock against drbd_nl_syncer_conf() */
                        mdev->csums_tfm = csums_tfm;
                        dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg);
                }
 +              if (fifo_size != mdev->rs_plan_s.size) {
 +                      kfree(mdev->rs_plan_s.values);
 +                      mdev->rs_plan_s.values = rs_plan_s;
 +                      mdev->rs_plan_s.size   = fifo_size;
 +                      mdev->rs_planed = 0;
 +              }
                spin_unlock(&mdev->peer_seq_lock);
        }
  
@@@ -3049,15 -2946,19 +3049,15 @@@ static void warn_if_differ_considerably
                     (unsigned long long)a, (unsigned long long)b);
  }
  
 -static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
 +static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
  {
 -      struct p_sizes *p = (struct p_sizes *)h;
 +      struct p_sizes *p = &mdev->data.rbuf.sizes;
        enum determine_dev_size dd = unchanged;
        unsigned int max_seg_s;
        sector_t p_size, p_usize, my_usize;
        int ldsc = 0; /* local disk size changed */
        enum dds_flags ddsf;
  
 -      ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
 -      if (drbd_recv(mdev, h->payload, h->length) != h->length)
 -              return FALSE;
 -
        p_size = be64_to_cpu(p->d_size);
        p_usize = be64_to_cpu(p->u_size);
  
         * we still need to figure out whether we accept that. */
        mdev->p_size = p_size;
  
 -#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
        if (get_ldev(mdev)) {
                warn_if_differ_considerably(mdev, "lower level device sizes",
                           p_size, drbd_get_max_capacity(mdev->ldev));
  
                if (mdev->agreed_pro_version < 94)
                        max_seg_s = be32_to_cpu(p->max_segment_size);
 +              else if (mdev->agreed_pro_version == 94)
 +                      max_seg_s = DRBD_MAX_SIZE_H80_PACKET;
                else /* drbd 8.3.8 onwards */
                        max_seg_s = DRBD_MAX_SEGMENT_SIZE;
  
        return TRUE;
  }
  
 -static int receive_uuids(struct drbd_conf *mdev, struct p_header *h)
 +static int receive_uuids(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
  {
 -      struct p_uuids *p = (struct p_uuids *)h;
 +      struct p_uuids *p = &mdev->data.rbuf.uuids;
        u64 *p_uuid;
        int i;
  
 -      ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
 -      if (drbd_recv(mdev, h->payload, h->length) != h->length)
 -              return FALSE;
 -
        p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO);
  
        for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++)
                        drbd_md_sync(mdev);
                }
                put_ldev(mdev);
 +      } else if (mdev->state.disk < D_INCONSISTENT &&
 +                 mdev->state.role == R_PRIMARY) {
 +              /* I am a diskless primary, the peer just created a new current UUID
 +                 for me. */
 +              drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
        }
  
        /* Before we test for the disk state, we should wait until an eventually
@@@ -3251,12 -3150,16 +3251,12 @@@ static union drbd_state convert_state(u
        return ms;
  }
  
 -static int receive_req_state(struct drbd_conf *mdev, struct p_header *h)
 +static int receive_req_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
  {
 -      struct p_req_state *p = (struct p_req_state *)h;
 +      struct p_req_state *p = &mdev->data.rbuf.req_state;
        union drbd_state mask, val;
        int rv;
  
 -      ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
 -      if (drbd_recv(mdev, h->payload, h->length) != h->length)
 -              return FALSE;
 -
        mask.i = be32_to_cpu(p->mask);
        val.i = be32_to_cpu(p->val);
  
        return TRUE;
  }
  
 -static int receive_state(struct drbd_conf *mdev, struct p_header *h)
 +static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
  {
 -      struct p_state *p = (struct p_state *)h;
 -      enum drbd_conns nconn, oconn;
 -      union drbd_state ns, peer_state;
 +      struct p_state *p = &mdev->data.rbuf.state;
 +      union drbd_state os, ns, peer_state;
        enum drbd_disk_state real_peer_disk;
 +      enum chg_state_flags cs_flags;
        int rv;
  
 -      ERR_IF(h->length != (sizeof(*p)-sizeof(*h)))
 -              return FALSE;
 -
 -      if (drbd_recv(mdev, h->payload, h->length) != h->length)
 -              return FALSE;
 -
        peer_state.i = be32_to_cpu(p->state);
  
        real_peer_disk = peer_state.disk;
  
        spin_lock_irq(&mdev->req_lock);
   retry:
 -      oconn = nconn = mdev->state.conn;
 +      os = ns = mdev->state;
        spin_unlock_irq(&mdev->req_lock);
  
 -      if (nconn == C_WF_REPORT_PARAMS)
 -              nconn = C_CONNECTED;
 +      /* peer says his disk is uptodate, while we think it is inconsistent,
 +       * and this happens while we think we have a sync going on. */
 +      if (os.pdsk == D_INCONSISTENT && real_peer_disk == D_UP_TO_DATE &&
 +          os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) {
 +              /* If we are (becoming) SyncSource, but peer is still in sync
 +               * preparation, ignore its uptodate-ness to avoid flapping, it
 +               * will change to inconsistent once the peer reaches active
 +               * syncing states.
 +               * It may have changed syncer-paused flags, however, so we
 +               * cannot ignore this completely. */
 +              if (peer_state.conn > C_CONNECTED &&
 +                  peer_state.conn < C_SYNC_SOURCE)
 +                      real_peer_disk = D_INCONSISTENT;
 +
 +              /* if peer_state changes to connected at the same time,
 +               * it explicitly notifies us that it finished resync.
 +               * Maybe we should finish it up, too? */
 +              else if (os.conn >= C_SYNC_SOURCE &&
 +                       peer_state.conn == C_CONNECTED) {
 +                      if (drbd_bm_total_weight(mdev) <= mdev->rs_failed)
 +                              drbd_resync_finished(mdev);
 +                      return TRUE;
 +              }
 +      }
 +
 +      /* peer says his disk is inconsistent, while we think it is uptodate,
 +       * and this happens while the peer still thinks we have a sync going on,
 +       * but we think we are already done with the sync.
 +       * We ignore this to avoid flapping pdsk.
 +       * This should not happen, if the peer is a recent version of drbd. */
 +      if (os.pdsk == D_UP_TO_DATE && real_peer_disk == D_INCONSISTENT &&
 +          os.conn == C_CONNECTED && peer_state.conn > C_SYNC_SOURCE)
 +              real_peer_disk = D_UP_TO_DATE;
 +
 +      if (ns.conn == C_WF_REPORT_PARAMS)
 +              ns.conn = C_CONNECTED;
  
        if (mdev->p_uuid && peer_state.disk >= D_NEGOTIATING &&
            get_ldev_if_state(mdev, D_NEGOTIATING)) {
                int cr; /* consider resync */
  
                /* if we established a new connection */
 -              cr  = (oconn < C_CONNECTED);
 +              cr  = (os.conn < C_CONNECTED);
                /* if we had an established connection
                 * and one of the nodes newly attaches a disk */
 -              cr |= (oconn == C_CONNECTED &&
 +              cr |= (os.conn == C_CONNECTED &&
                       (peer_state.disk == D_NEGOTIATING ||
 -                      mdev->state.disk == D_NEGOTIATING));
 +                      os.disk == D_NEGOTIATING));
                /* if we have both been inconsistent, and the peer has been
                 * forced to be UpToDate with --overwrite-data */
                cr |= test_bit(CONSIDER_RESYNC, &mdev->flags);
                /* if we had been plain connected, and the admin requested to
                 * start a sync by "invalidate" or "invalidate-remote" */
 -              cr |= (oconn == C_CONNECTED &&
 +              cr |= (os.conn == C_CONNECTED &&
                                (peer_state.conn >= C_STARTING_SYNC_S &&
                                 peer_state.conn <= C_WF_BITMAP_T));
  
                if (cr)
 -                      nconn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk);
 +                      ns.conn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk);
  
                put_ldev(mdev);
 -              if (nconn == C_MASK) {
 -                      nconn = C_CONNECTED;
 +              if (ns.conn == C_MASK) {
 +                      ns.conn = C_CONNECTED;
                        if (mdev->state.disk == D_NEGOTIATING) {
                                drbd_force_state(mdev, NS(disk, D_DISKLESS));
                        } else if (peer_state.disk == D_NEGOTIATING) {
                        } else {
                                if (test_and_clear_bit(CONN_DRY_RUN, &mdev->flags))
                                        return FALSE;
 -                              D_ASSERT(oconn == C_WF_REPORT_PARAMS);
 +                              D_ASSERT(os.conn == C_WF_REPORT_PARAMS);
                                drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
                                return FALSE;
                        }
        }
  
        spin_lock_irq(&mdev->req_lock);
 -      if (mdev->state.conn != oconn)
 +      if (mdev->state.i != os.i)
                goto retry;
        clear_bit(CONSIDER_RESYNC, &mdev->flags);
 -      ns.i = mdev->state.i;
 -      ns.conn = nconn;
        ns.peer = peer_state.role;
        ns.pdsk = real_peer_disk;
        ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
 -      if ((nconn == C_CONNECTED || nconn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
 +      if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
                ns.disk = mdev->new_state_tmp.disk;
 -
 -      rv = _drbd_set_state(mdev, ns, CS_VERBOSE | CS_HARD, NULL);
 +      cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD);
 +      if (ns.pdsk == D_CONSISTENT && is_susp(ns) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED &&
 +          test_bit(NEW_CUR_UUID, &mdev->flags)) {
 +              /* Do not allow tl_restart(resend) for a rebooted peer. We can only allow this
 +                 for temporary network outages! */
 +              spin_unlock_irq(&mdev->req_lock);
 +              dev_err(DEV, "Aborting Connect, can not thaw IO with an only Consistent peer\n");
 +              tl_clear(mdev);
 +              drbd_uuid_new_current(mdev);
 +              clear_bit(NEW_CUR_UUID, &mdev->flags);
 +              drbd_force_state(mdev, NS2(conn, C_PROTOCOL_ERROR, susp, 0));
 +              return FALSE;
 +      }
 +      rv = _drbd_set_state(mdev, ns, cs_flags, NULL);
        ns = mdev->state;
        spin_unlock_irq(&mdev->req_lock);
  
                return FALSE;
        }
  
 -      if (oconn > C_WF_REPORT_PARAMS) {
 -              if (nconn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
 +      if (os.conn > C_WF_REPORT_PARAMS) {
 +              if (ns.conn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
                    peer_state.disk != D_NEGOTIATING ) {
                        /* we want resync, peer has not yet decided to sync... */
                        /* Nowadays only used when forcing a node into primary role and
        return TRUE;
  }
  
 -static int receive_sync_uuid(struct drbd_conf *mdev, struct p_header *h)
 +static int receive_sync_uuid(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
  {
 -      struct p_rs_uuid *p = (struct p_rs_uuid *)h;
 +      struct p_rs_uuid *p = &mdev->data.rbuf.rs_uuid;
  
        wait_event(mdev->misc_wait,
                   mdev->state.conn == C_WF_SYNC_UUID ||
  
        /* D_ASSERT( mdev->state.conn == C_WF_SYNC_UUID ); */
  
 -      ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
 -      if (drbd_recv(mdev, h->payload, h->length) != h->length)
 -              return FALSE;
 -
        /* Here the _drbd_uuid_ functions are right, current should
           _not_ be rotated into the history */
        if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
  enum receive_bitmap_ret { OK, DONE, FAILED };
  
  static enum receive_bitmap_ret
 -receive_bitmap_plain(struct drbd_conf *mdev, struct p_header *h,
 -      unsigned long *buffer, struct bm_xfer_ctx *c)
 +receive_bitmap_plain(struct drbd_conf *mdev, unsigned int data_size,
 +                   unsigned long *buffer, struct bm_xfer_ctx *c)
  {
        unsigned num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
        unsigned want = num_words * sizeof(long);
  
 -      if (want != h->length) {
 -              dev_err(DEV, "%s:want (%u) != h->length (%u)\n", __func__, want, h->length);
 +      if (want != data_size) {
 +              dev_err(DEV, "%s:want (%u) != data_size (%u)\n", __func__, want, data_size);
                return FAILED;
        }
        if (want == 0)
@@@ -3491,7 -3360,7 +3491,7 @@@ recv_bm_rle_bits(struct drbd_conf *mdev
        u64 tmp;
        unsigned long s = c->bit_offset;
        unsigned long e;
 -      int len = p->head.length - (sizeof(*p) - sizeof(p->head));
 +      int len = be16_to_cpu(p->head.length) - (sizeof(*p) - sizeof(p->head));
        int toggle = DCBP_get_start(p);
        int have;
        int bits;
@@@ -3560,7 -3429,7 +3560,7 @@@ void INFO_bm_xfer_stats(struct drbd_con
                const char *direction, struct bm_xfer_ctx *c)
  {
        /* what would it take to transfer it "plaintext" */
 -      unsigned plain = sizeof(struct p_header) *
 +      unsigned plain = sizeof(struct p_header80) *
                ((c->bm_words+BM_PACKET_WORDS-1)/BM_PACKET_WORDS+1)
                + c->bm_words * sizeof(long);
        unsigned total = c->bytes[0] + c->bytes[1];
     in order to be agnostic to the 32 vs 64 bits issue.
  
     returns 0 on failure, 1 if we successfully received it. */
 -static int receive_bitmap(struct drbd_conf *mdev, struct p_header *h)
 +static int receive_bitmap(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
  {
        struct bm_xfer_ctx c;
        void *buffer;
        enum receive_bitmap_ret ret;
        int ok = FALSE;
 +      struct p_header80 *h = &mdev->data.rbuf.header.h80;
  
        wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
  
        };
  
        do {
 -              if (h->command == P_BITMAP) {
 -                      ret = receive_bitmap_plain(mdev, h, buffer, &c);
 -              } else if (h->command == P_COMPRESSED_BITMAP) {
 +              if (cmd == P_BITMAP) {
 +                      ret = receive_bitmap_plain(mdev, data_size, buffer, &c);
 +              } else if (cmd == P_COMPRESSED_BITMAP) {
                        /* MAYBE: sanity check that we speak proto >= 90,
                         * and the feature is enabled! */
                        struct p_compressed_bm *p;
  
 -                      if (h->length > BM_PACKET_PAYLOAD_BYTES) {
 +                      if (data_size > BM_PACKET_PAYLOAD_BYTES) {
                                dev_err(DEV, "ReportCBitmap packet too large\n");
                                goto out;
                        }
                        /* use the page buff */
                        p = buffer;
                        memcpy(p, h, sizeof(*h));
 -                      if (drbd_recv(mdev, p->head.payload, h->length) != h->length)
 +                      if (drbd_recv(mdev, p->head.payload, data_size) != data_size)
                                goto out;
 -                      if (p->head.length <= (sizeof(*p) - sizeof(p->head))) {
 -                              dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", p->head.length);
 +                      if (data_size <= (sizeof(*p) - sizeof(p->head))) {
 +                              dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", data_size);
                                return FAILED;
                        }
                        ret = decode_bitmap_c(mdev, p, &c);
                } else {
 -                      dev_warn(DEV, "receive_bitmap: h->command neither ReportBitMap nor ReportCBitMap (is 0x%x)", h->command);
 +                      dev_warn(DEV, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", cmd);
                        goto out;
                }
  
 -              c.packets[h->command == P_BITMAP]++;
 -              c.bytes[h->command == P_BITMAP] += sizeof(struct p_header) + h->length;
 +              c.packets[cmd == P_BITMAP]++;
 +              c.bytes[cmd == P_BITMAP] += sizeof(struct p_header80) + data_size;
  
                if (ret != OK)
                        break;
  
 -              if (!drbd_recv_header(mdev, h))
 +              if (!drbd_recv_header(mdev, &cmd, &data_size))
                        goto out;
        } while (ret == OK);
        if (ret == FAILED)
        return ok;
  }
  
 -static int receive_skip_(struct drbd_conf *mdev, struct p_header *h, int silent)
 +static int receive_skip(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
  {
        /* TODO zero copy sink :) */
        static char sink[128];
        int size, want, r;
  
 -      if (!silent)
 -              dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n",
 -                   h->command, h->length);
 +      dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n",
 +               cmd, data_size);
  
 -      size = h->length;
 +      size = data_size;
        while (size > 0) {
                want = min_t(int, size, sizeof(sink));
                r = drbd_recv(mdev, sink, want);
        return size == 0;
  }
  
 -static int receive_skip(struct drbd_conf *mdev, struct p_header *h)
 -{
 -      return receive_skip_(mdev, h, 0);
 -}
 -
 -static int receive_skip_silent(struct drbd_conf *mdev, struct p_header *h)
 -{
 -      return receive_skip_(mdev, h, 1);
 -}
 -
 -static int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header *h)
 +static int receive_UnplugRemote(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
  {
        if (mdev->state.disk >= D_INCONSISTENT)
                drbd_kick_lo(mdev);
        return TRUE;
  }
  
 -typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, struct p_header *);
 -
 -static drbd_cmd_handler_f drbd_default_handler[] = {
 -      [P_DATA]            = receive_Data,
 -      [P_DATA_REPLY]      = receive_DataReply,
 -      [P_RS_DATA_REPLY]   = receive_RSDataReply,
 -      [P_BARRIER]         = receive_Barrier,
 -      [P_BITMAP]          = receive_bitmap,
 -      [P_COMPRESSED_BITMAP]    = receive_bitmap,
 -      [P_UNPLUG_REMOTE]   = receive_UnplugRemote,
 -      [P_DATA_REQUEST]    = receive_DataRequest,
 -      [P_RS_DATA_REQUEST] = receive_DataRequest,
 -      [P_SYNC_PARAM]      = receive_SyncParam,
 -      [P_SYNC_PARAM89]           = receive_SyncParam,
 -      [P_PROTOCOL]        = receive_protocol,
 -      [P_UUIDS]           = receive_uuids,
 -      [P_SIZES]           = receive_sizes,
 -      [P_STATE]           = receive_state,
 -      [P_STATE_CHG_REQ]   = receive_req_state,
 -      [P_SYNC_UUID]       = receive_sync_uuid,
 -      [P_OV_REQUEST]      = receive_DataRequest,
 -      [P_OV_REPLY]        = receive_DataRequest,
 -      [P_CSUM_RS_REQUEST]    = receive_DataRequest,
 -      [P_DELAY_PROBE]     = receive_skip_silent,
 +typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, enum drbd_packets cmd, unsigned int to_receive);
 +
 +struct data_cmd {
 +      int expect_payload;
 +      size_t pkt_size;
 +      drbd_cmd_handler_f function;
 +};
 +
 +static struct data_cmd drbd_cmd_handler[] = {
 +      [P_DATA]            = { 1, sizeof(struct p_data), receive_Data },
 +      [P_DATA_REPLY]      = { 1, sizeof(struct p_data), receive_DataReply },
 +      [P_RS_DATA_REPLY]   = { 1, sizeof(struct p_data), receive_RSDataReply } ,
 +      [P_BARRIER]         = { 0, sizeof(struct p_barrier), receive_Barrier } ,
 +      [P_BITMAP]          = { 1, sizeof(struct p_header80), receive_bitmap } ,
 +      [P_COMPRESSED_BITMAP] = { 1, sizeof(struct p_header80), receive_bitmap } ,
 +      [P_UNPLUG_REMOTE]   = { 0, sizeof(struct p_header80), receive_UnplugRemote },
 +      [P_DATA_REQUEST]    = { 0, sizeof(struct p_block_req), receive_DataRequest },
 +      [P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
 +      [P_SYNC_PARAM]      = { 1, sizeof(struct p_header80), receive_SyncParam },
 +      [P_SYNC_PARAM89]    = { 1, sizeof(struct p_header80), receive_SyncParam },
 +      [P_PROTOCOL]        = { 1, sizeof(struct p_protocol), receive_protocol },
 +      [P_UUIDS]           = { 0, sizeof(struct p_uuids), receive_uuids },
 +      [P_SIZES]           = { 0, sizeof(struct p_sizes), receive_sizes },
 +      [P_STATE]           = { 0, sizeof(struct p_state), receive_state },
 +      [P_STATE_CHG_REQ]   = { 0, sizeof(struct p_req_state), receive_req_state },
 +      [P_SYNC_UUID]       = { 0, sizeof(struct p_rs_uuid), receive_sync_uuid },
 +      [P_OV_REQUEST]      = { 0, sizeof(struct p_block_req), receive_DataRequest },
 +      [P_OV_REPLY]        = { 1, sizeof(struct p_block_req), receive_DataRequest },
 +      [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest },
 +      [P_DELAY_PROBE]     = { 0, sizeof(struct p_delay_probe93), receive_skip },
        /* anything missing from this table is in
         * the asender_tbl, see get_asender_cmd */
 -      [P_MAX_CMD]         = NULL,
 +      [P_MAX_CMD]         = { 0, 0, NULL },
  };
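
  [Editorial note: a simplified, self-contained model, not the kernel code, of
  the dispatch-table pattern that drbd_cmd_handler[] above and drbdd() below
  implement: each command carries its fixed packet size and whether a trailing
  payload is allowed, so the receive loop can validate sizes generically before
  calling the handler. All names, sizes, and commands here are invented.]

  #include <stdio.h>
  #include <stddef.h>

  enum cmd { CMD_PING, CMD_DATA, CMD_MAX };

  typedef int (*handler_f)(unsigned int payload_size);

  struct cmd_entry {
          int expect_payload;      /* may a payload follow the sub-header? */
          size_t pkt_size;         /* fixed size: header + sub-header      */
          handler_f function;
  };

  static int handle_ping(unsigned int payload_size)
  {
          printf("ping, payload %u\n", payload_size);
          return 1;
  }

  static int handle_data(unsigned int payload_size)
  {
          printf("data, payload %u\n", payload_size);
          return 1;
  }

  static const struct cmd_entry table[CMD_MAX] = {
          [CMD_PING] = { 0, 8,  handle_ping },
          [CMD_DATA] = { 1, 32, handle_data },
  };

  /* In this model, an unknown command, a packet shorter than its sub-header,
   * or an unexpected payload are all rejected, loosely mirroring drbdd(). */
  static int dispatch(enum cmd cmd, size_t packet_size)
  {
          const size_t header_size = 8;   /* invented on-wire header size */
          size_t shs;

          if (cmd >= CMD_MAX || !table[cmd].function)
                  return 0;                         /* unknown packet type */
          shs = table[cmd].pkt_size - header_size;  /* sub-header to read  */
          if (packet_size < shs)
                  return 0;                         /* short packet        */
          if (packet_size - shs > 0 && !table[cmd].expect_payload)
                  return 0;                         /* no payload expected */
          return table[cmd].function((unsigned int)(packet_size - shs));
  }

  int main(void)
  {
          dispatch(CMD_PING, 0);    /* ok: no sub-header, no payload          */
          dispatch(CMD_DATA, 4120); /* ok: 24-byte sub-header + 4096 payload  */
          return 0;
  }
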
  
 -static drbd_cmd_handler_f *drbd_cmd_handler = drbd_default_handler;
 -static drbd_cmd_handler_f *drbd_opt_cmd_handler;
 +/* All handler functions that expect a sub-header get that sub-header in
 +   mdev->data.rbuf.header.head.payload.
 +
 +   Usually the callback can find the usual p_header in
 +   mdev->data.rbuf.header.head, but it may not rely on that, since there is
 +   also p_header95.
 + */
  
  static void drbdd(struct drbd_conf *mdev)
  {
 -      drbd_cmd_handler_f handler;
 -      struct p_header *header = &mdev->data.rbuf.header;
 +      union p_header *header = &mdev->data.rbuf.header;
 +      unsigned int packet_size;
 +      enum drbd_packets cmd;
 +      size_t shs; /* sub header size */
 +      int rv;
  
        while (get_t_state(&mdev->receiver) == Running) {
                drbd_thread_current_set_cpu(mdev);
 -              if (!drbd_recv_header(mdev, header)) {
 -                      drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
 -                      break;
 -              }
 +              if (!drbd_recv_header(mdev, &cmd, &packet_size))
 +                      goto err_out;
  
 -              if (header->command < P_MAX_CMD)
 -                      handler = drbd_cmd_handler[header->command];
 -              else if (P_MAY_IGNORE < header->command
 -                   && header->command < P_MAX_OPT_CMD)
 -                      handler = drbd_opt_cmd_handler[header->command-P_MAY_IGNORE];
 -              else if (header->command > P_MAX_OPT_CMD)
 -                      handler = receive_skip;
 -              else
 -                      handler = NULL;
 +              if (unlikely(cmd >= P_MAX_CMD || !drbd_cmd_handler[cmd].function)) {
 +                      dev_err(DEV, "unknown packet type %d, l: %d!\n", cmd, packet_size);
 +                      goto err_out;
 +              }
  
 -              if (unlikely(!handler)) {
 -                      dev_err(DEV, "unknown packet type %d, l: %d!\n",
 -                          header->command, header->length);
 -                      drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
 -                      break;
 +              shs = drbd_cmd_handler[cmd].pkt_size - sizeof(union p_header);
 +              rv = drbd_recv(mdev, &header->h80.payload, shs);
 +              if (unlikely(rv != shs)) {
 +                      dev_err(DEV, "short read while reading sub header: rv=%d\n", rv);
 +                      goto err_out;
                }
 -              if (unlikely(!handler(mdev, header))) {
 -                      dev_err(DEV, "error receiving %s, l: %d!\n",
 -                          cmdname(header->command), header->length);
 -                      drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
 -                      break;
 +
 +              if (packet_size - shs > 0 && !drbd_cmd_handler[cmd].expect_payload) {
 +                      dev_err(DEV, "No payload expected %s l:%d\n", cmdname(cmd), packet_size);
 +                      goto err_out;
                }
 -      }
 -}
  
 -static void drbd_fail_pending_reads(struct drbd_conf *mdev)
 -{
 -      struct hlist_head *slot;
 -      struct hlist_node *pos;
 -      struct hlist_node *tmp;
 -      struct drbd_request *req;
 -      int i;
 +              rv = drbd_cmd_handler[cmd].function(mdev, cmd, packet_size - shs);
  
 -      /*
 -       * Application READ requests
 -       */
 -      spin_lock_irq(&mdev->req_lock);
 -      for (i = 0; i < APP_R_HSIZE; i++) {
 -              slot = mdev->app_reads_hash+i;
 -              hlist_for_each_entry_safe(req, pos, tmp, slot, colision) {
 -                      /* it may (but should not any longer!)
 -                       * be on the work queue; if that assert triggers,
 -                       * we need to also grab the
 -                       * spin_lock_irq(&mdev->data.work.q_lock);
 -                       * and list_del_init here. */
 -                      D_ASSERT(list_empty(&req->w.list));
 -                      /* It would be nice to complete outside of spinlock.
 -                       * But this is easier for now. */
 -                      _req_mod(req, connection_lost_while_pending);
 +              if (unlikely(!rv)) {
 +                      dev_err(DEV, "error receiving %s, l: %d!\n",
 +                          cmdname(cmd), packet_size);
 +                      goto err_out;
                }
        }
 -      for (i = 0; i < APP_R_HSIZE; i++)
 -              if (!hlist_empty(mdev->app_reads_hash+i))
 -                      dev_warn(DEV, "ASSERT FAILED: app_reads_hash[%d].first: "
 -                              "%p, should be NULL\n", i, mdev->app_reads_hash[i].first);
  
 -      memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
 -      spin_unlock_irq(&mdev->req_lock);
 +      if (0) {
 +      err_out:
 +              drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
 +      }
 +      /* If we leave here, we probably want to update at least the
 +       * "Connected" indicator on stable storage. Do so explicitly here. */
 +      drbd_md_sync(mdev);
  }
  
  void drbd_flush_workqueue(struct drbd_conf *mdev)
        wait_for_completion(&barr.done);
  }
  
 +void drbd_free_tl_hash(struct drbd_conf *mdev)
 +{
 +      struct hlist_head *h;
 +
 +      spin_lock_irq(&mdev->req_lock);
 +
 +      if (!mdev->tl_hash || mdev->state.conn != C_STANDALONE) {
 +              spin_unlock_irq(&mdev->req_lock);
 +              return;
 +      }
 +      /* paranoia code */
 +      for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++)
 +              if (h->first)
 +                      dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n",
 +                              (int)(h - mdev->ee_hash), h->first);
 +      kfree(mdev->ee_hash);
 +      mdev->ee_hash = NULL;
 +      mdev->ee_hash_s = 0;
 +
 +      /* paranoia code */
 +      for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++)
 +              if (h->first)
 +                      dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n",
 +                              (int)(h - mdev->tl_hash), h->first);
 +      kfree(mdev->tl_hash);
 +      mdev->tl_hash = NULL;
 +      mdev->tl_hash_s = 0;
 +      spin_unlock_irq(&mdev->req_lock);
 +}
 +
  static void drbd_disconnect(struct drbd_conf *mdev)
  {
        enum drbd_fencing_p fp;
        drbd_thread_stop(&mdev->asender);
        drbd_free_sock(mdev);
  
 +      /* wait for current activity to cease. */
        spin_lock_irq(&mdev->req_lock);
        _drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
        _drbd_wait_ee_list_empty(mdev, &mdev->sync_ee);
  
        /* make sure syncer is stopped and w_resume_next_sg queued */
        del_timer_sync(&mdev->resync_timer);
 -      set_bit(STOP_SYNC_TIMER, &mdev->flags);
        resync_timer_fn((unsigned long)mdev);
  
        /* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier,
        kfree(mdev->p_uuid);
        mdev->p_uuid = NULL;
  
 -      if (!mdev->state.susp)
 +      if (!is_susp(mdev->state))
                tl_clear(mdev);
  
 -      drbd_fail_pending_reads(mdev);
 -
        dev_info(DEV, "Connection closed\n");
  
        drbd_md_sync(mdev);
                put_ldev(mdev);
        }
  
 -      if (mdev->state.role == R_PRIMARY) {
 -              if (fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN) {
 -                      enum drbd_disk_state nps = drbd_try_outdate_peer(mdev);
 -                      drbd_request_state(mdev, NS(pdsk, nps));
 -              }
 -      }
 +      if (mdev->state.role == R_PRIMARY && fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN)
 +              drbd_try_outdate_peer_async(mdev);
  
        spin_lock_irq(&mdev->req_lock);
        os = mdev->state;
        spin_unlock_irq(&mdev->req_lock);
  
        if (os.conn == C_DISCONNECTING) {
 -              struct hlist_head *h;
 -              wait_event(mdev->misc_wait, atomic_read(&mdev->net_cnt) == 0);
 +              wait_event(mdev->net_cnt_wait, atomic_read(&mdev->net_cnt) == 0);
  
 -              /* we must not free the tl_hash
 -               * while application io is still on the fly */
 -              wait_event(mdev->misc_wait, atomic_read(&mdev->ap_bio_cnt) == 0);
 -
 -              spin_lock_irq(&mdev->req_lock);
 -              /* paranoia code */
 -              for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++)
 -                      if (h->first)
 -                              dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n",
 -                                              (int)(h - mdev->ee_hash), h->first);
 -              kfree(mdev->ee_hash);
 -              mdev->ee_hash = NULL;
 -              mdev->ee_hash_s = 0;
 -
 -              /* paranoia code */
 -              for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++)
 -                      if (h->first)
 -                              dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n",
 -                                              (int)(h - mdev->tl_hash), h->first);
 -              kfree(mdev->tl_hash);
 -              mdev->tl_hash = NULL;
 -              mdev->tl_hash_s = 0;
 -              spin_unlock_irq(&mdev->req_lock);
 +              if (!is_susp(mdev->state)) {
 +                      /* we must not free the tl_hash
 +                       * while application io is still on the fly */
 +                      wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
 +                      drbd_free_tl_hash(mdev);
 +              }
  
                crypto_free_hash(mdev->cram_hmac_tfm);
                mdev->cram_hmac_tfm = NULL;
        i = drbd_release_ee(mdev, &mdev->net_ee);
        if (i)
                dev_info(DEV, "net_ee not empty, killed %u entries\n", i);
 +      i = atomic_read(&mdev->pp_in_use_by_net);
 +      if (i)
 +              dev_info(DEV, "pp_in_use_by_net = %d, expected 0\n", i);
        i = atomic_read(&mdev->pp_in_use);
        if (i)
                dev_info(DEV, "pp_in_use = %d, expected 0\n", i);
@@@ -4004,7 -3888,7 +4004,7 @@@ static int drbd_send_handshake(struct d
        p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
        p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
        ok = _drbd_send_cmd( mdev, mdev->data.socket, P_HAND_SHAKE,
 -                           (struct p_header *)p, sizeof(*p), 0 );
 +                           (struct p_header80 *)p, sizeof(*p), 0 );
        mutex_unlock(&mdev->data.mutex);
        return ok;
  }
@@@ -4020,28 -3904,27 +4020,28 @@@ static int drbd_do_handshake(struct drb
  {
        /* ASSERT current == mdev->receiver ... */
        struct p_handshake *p = &mdev->data.rbuf.handshake;
 -      const int expect = sizeof(struct p_handshake)
 -                        -sizeof(struct p_header);
 +      const int expect = sizeof(struct p_handshake) - sizeof(struct p_header80);
 +      unsigned int length;
 +      enum drbd_packets cmd;
        int rv;
  
        rv = drbd_send_handshake(mdev);
        if (!rv)
                return 0;
  
 -      rv = drbd_recv_header(mdev, &p->head);
 +      rv = drbd_recv_header(mdev, &cmd, &length);
        if (!rv)
                return 0;
  
 -      if (p->head.command != P_HAND_SHAKE) {
 +      if (cmd != P_HAND_SHAKE) {
                dev_err(DEV, "expected HandShake packet, received: %s (0x%04x)\n",
 -                   cmdname(p->head.command), p->head.command);
 +                   cmdname(cmd), cmd);
                return -1;
        }
  
 -      if (p->head.length != expect) {
 +      if (length != expect) {
                dev_err(DEV, "expected HandShake length: %u, received: %u\n",
 -                   expect, p->head.length);
 +                   expect, length);
                return -1;
        }
  
@@@ -4099,11 -3982,10 +4099,11 @@@ static int drbd_do_auth(struct drbd_con
        char *response = NULL;
        char *right_response = NULL;
        char *peers_ch = NULL;
 -      struct p_header p;
        unsigned int key_len = strlen(mdev->net_conf->shared_secret);
        unsigned int resp_size;
        struct hash_desc desc;
 +      enum drbd_packets cmd;
 +      unsigned int length;
        int rv;
  
        desc.tfm = mdev->cram_hmac_tfm;
        if (!rv)
                goto fail;
  
 -      rv = drbd_recv_header(mdev, &p);
 +      rv = drbd_recv_header(mdev, &cmd, &length);
        if (!rv)
                goto fail;
  
 -      if (p.command != P_AUTH_CHALLENGE) {
 +      if (cmd != P_AUTH_CHALLENGE) {
                dev_err(DEV, "expected AuthChallenge packet, received: %s (0x%04x)\n",
 -                  cmdname(p.command), p.command);
 +                  cmdname(cmd), cmd);
                rv = 0;
                goto fail;
        }
  
 -      if (p.length > CHALLENGE_LEN*2) {
 +      if (length > CHALLENGE_LEN * 2) {
                dev_err(DEV, "expected AuthChallenge payload too big.\n");
                rv = -1;
                goto fail;
        }
  
 -      peers_ch = kmalloc(p.length, GFP_NOIO);
 +      peers_ch = kmalloc(length, GFP_NOIO);
        if (peers_ch == NULL) {
                dev_err(DEV, "kmalloc of peers_ch failed\n");
                rv = -1;
                goto fail;
        }
  
 -      rv = drbd_recv(mdev, peers_ch, p.length);
 +      rv = drbd_recv(mdev, peers_ch, length);
  
 -      if (rv != p.length) {
 +      if (rv != length) {
                dev_err(DEV, "short read AuthChallenge: l=%u\n", rv);
                rv = 0;
                goto fail;
        }
  
        sg_init_table(&sg, 1);
 -      sg_set_buf(&sg, peers_ch, p.length);
 +      sg_set_buf(&sg, peers_ch, length);
  
        rv = crypto_hash_digest(&desc, &sg, sg.length, response);
        if (rv) {
        if (!rv)
                goto fail;
  
 -      rv = drbd_recv_header(mdev, &p);
 +      rv = drbd_recv_header(mdev, &cmd, &length);
        if (!rv)
                goto fail;
  
 -      if (p.command != P_AUTH_RESPONSE) {
 +      if (cmd != P_AUTH_RESPONSE) {
                dev_err(DEV, "expected AuthResponse packet, received: %s (0x%04x)\n",
 -                  cmdname(p.command), p.command);
 +                      cmdname(cmd), cmd);
                rv = 0;
                goto fail;
        }
  
 -      if (p.length != resp_size) {
 +      if (length != resp_size) {
                dev_err(DEV, "expected AuthResponse payload of wrong size\n");
                rv = 0;
                goto fail;
@@@ -4273,7 -4155,7 +4273,7 @@@ int drbdd_init(struct drbd_thread *thi
  
  /* ********* acknowledge sender ******** */
  
 -static int got_RqSReply(struct drbd_conf *mdev, struct p_header *h)
 +static int got_RqSReply(struct drbd_conf *mdev, struct p_header80 *h)
  {
        struct p_req_state_reply *p = (struct p_req_state_reply *)h;
  
        return TRUE;
  }
  
 -static int got_Ping(struct drbd_conf *mdev, struct p_header *h)
 +static int got_Ping(struct drbd_conf *mdev, struct p_header80 *h)
  {
        return drbd_send_ping_ack(mdev);
  
  }
  
 -static int got_PingAck(struct drbd_conf *mdev, struct p_header *h)
 +static int got_PingAck(struct drbd_conf *mdev, struct p_header80 *h)
  {
        /* restore idle timeout */
        mdev->meta.socket->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;
        return TRUE;
  }
  
 -static int got_IsInSync(struct drbd_conf *mdev, struct p_header *h)
 +static int got_IsInSync(struct drbd_conf *mdev, struct p_header80 *h)
  {
        struct p_block_ack *p = (struct p_block_ack *)h;
        sector_t sector = be64_to_cpu(p->sector);
  
        update_peer_seq(mdev, be32_to_cpu(p->seq_num));
  
 -      drbd_rs_complete_io(mdev, sector);
 -      drbd_set_in_sync(mdev, sector, blksize);
 -      /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
 -      mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
 +      if (get_ldev(mdev)) {
 +              drbd_rs_complete_io(mdev, sector);
 +              drbd_set_in_sync(mdev, sector, blksize);
 +              /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
 +              mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
 +              put_ldev(mdev);
 +      }
        dec_rs_pending(mdev);
 +      atomic_add(blksize >> 9, &mdev->rs_sect_in);
  
        return TRUE;
  }
@@@ -4381,7 -4259,7 +4381,7 @@@ static int validate_req_change_req_stat
        return TRUE;
  }
  
 -static int got_BlockAck(struct drbd_conf *mdev, struct p_header *h)
 +static int got_BlockAck(struct drbd_conf *mdev, struct p_header80 *h)
  {
        struct p_block_ack *p = (struct p_block_ack *)h;
        sector_t sector = be64_to_cpu(p->sector);
                _ack_id_to_req, __func__ , what);
  }
  
 -static int got_NegAck(struct drbd_conf *mdev, struct p_header *h)
 +static int got_NegAck(struct drbd_conf *mdev, struct p_header80 *h)
  {
        struct p_block_ack *p = (struct p_block_ack *)h;
        sector_t sector = be64_to_cpu(p->sector);
                _ack_id_to_req, __func__ , neg_acked);
  }
  
 -static int got_NegDReply(struct drbd_conf *mdev, struct p_header *h)
 +static int got_NegDReply(struct drbd_conf *mdev, struct p_header80 *h)
  {
        struct p_block_ack *p = (struct p_block_ack *)h;
        sector_t sector = be64_to_cpu(p->sector);
                _ar_id_to_req, __func__ , neg_acked);
  }
  
 -static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header *h)
 +static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header80 *h)
  {
        sector_t sector;
        int size;
        return TRUE;
  }
  
 -static int got_BarrierAck(struct drbd_conf *mdev, struct p_header *h)
 +static int got_BarrierAck(struct drbd_conf *mdev, struct p_header80 *h)
  {
        struct p_barrier_ack *p = (struct p_barrier_ack *)h;
  
        return TRUE;
  }
  
 -static int got_OVResult(struct drbd_conf *mdev, struct p_header *h)
 +static int got_OVResult(struct drbd_conf *mdev, struct p_header80 *h)
  {
        struct p_block_ack *p = (struct p_block_ack *)h;
        struct drbd_work *w;
        else
                ov_oos_print(mdev);
  
 +      if (!get_ldev(mdev))
 +              return TRUE;
 +
        drbd_rs_complete_io(mdev, sector);
        dec_rs_pending(mdev);
  
                        drbd_resync_finished(mdev);
                }
        }
 +      put_ldev(mdev);
        return TRUE;
  }
  
 -static int got_something_to_ignore_m(struct drbd_conf *mdev, struct p_header *h)
 +static int got_skip(struct drbd_conf *mdev, struct p_header80 *h)
  {
 -      /* IGNORE */
        return TRUE;
  }
  
  struct asender_cmd {
        size_t pkt_size;
 -      int (*process)(struct drbd_conf *mdev, struct p_header *h);
 +      int (*process)(struct drbd_conf *mdev, struct p_header80 *h);
  };
  
  static struct asender_cmd *get_asender_cmd(int cmd)
                /* anything missing from this table is in
                 * the drbd_cmd_handler (drbd_default_handler) table,
                 * see the beginning of drbdd() */
 -      [P_PING]            = { sizeof(struct p_header), got_Ping },
 -      [P_PING_ACK]        = { sizeof(struct p_header), got_PingAck },
 +      [P_PING]            = { sizeof(struct p_header80), got_Ping },
 +      [P_PING_ACK]        = { sizeof(struct p_header80), got_PingAck },
        [P_RECV_ACK]        = { sizeof(struct p_block_ack), got_BlockAck },
        [P_WRITE_ACK]       = { sizeof(struct p_block_ack), got_BlockAck },
        [P_RS_WRITE_ACK]    = { sizeof(struct p_block_ack), got_BlockAck },
        [P_BARRIER_ACK]     = { sizeof(struct p_barrier_ack), got_BarrierAck },
        [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
        [P_RS_IS_IN_SYNC]   = { sizeof(struct p_block_ack), got_IsInSync },
 -      [P_DELAY_PROBE]     = { sizeof(struct p_delay_probe), got_something_to_ignore_m },
 +      [P_DELAY_PROBE]     = { sizeof(struct p_delay_probe93), got_skip },
        [P_MAX_CMD]         = { 0, NULL },
        };
        if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL)
  int drbd_asender(struct drbd_thread *thi)
  {
        struct drbd_conf *mdev = thi->mdev;
 -      struct p_header *h = &mdev->meta.rbuf.header;
 +      struct p_header80 *h = &mdev->meta.rbuf.header.h80;
        struct asender_cmd *cmd = NULL;
  
        int rv, len;
        void *buf    = h;
        int received = 0;
 -      int expect   = sizeof(struct p_header);
 +      int expect   = sizeof(struct p_header80);
        int empty;
  
        sprintf(current->comm, "drbd%d_asender", mdev_to_minor(mdev));
                while (1) {
                        clear_bit(SIGNAL_ASENDER, &mdev->flags);
                        flush_signals(current);
 -                      if (!drbd_process_done_ee(mdev)) {
 -                              dev_err(DEV, "process_done_ee() = NOT_OK\n");
 +                      if (!drbd_process_done_ee(mdev))
                                goto reconnect;
 -                      }
                        /* to avoid race with newly queued ACKs */
                        set_bit(SIGNAL_ASENDER, &mdev->flags);
                        spin_lock_irq(&mdev->req_lock);
  
                if (received == expect && cmd == NULL) {
                        if (unlikely(h->magic != BE_DRBD_MAGIC)) {
 -                              dev_err(DEV, "magic?? on meta m: 0x%lx c: %d l: %d\n",
 -                                  (long)be32_to_cpu(h->magic),
 -                                  h->command, h->length);
 +                              dev_err(DEV, "magic?? on meta m: 0x%08x c: %d l: %d\n",
 +                                  be32_to_cpu(h->magic),
 +                                  be16_to_cpu(h->command),
 +                                  be16_to_cpu(h->length));
                                goto reconnect;
                        }
                        cmd = get_asender_cmd(be16_to_cpu(h->command));
                        len = be16_to_cpu(h->length);
                        if (unlikely(cmd == NULL)) {
 -                              dev_err(DEV, "unknown command?? on meta m: 0x%lx c: %d l: %d\n",
 -                                  (long)be32_to_cpu(h->magic),
 -                                  h->command, h->length);
 +                              dev_err(DEV, "unknown command?? on meta m: 0x%08x c: %d l: %d\n",
 +                                  be32_to_cpu(h->magic),
 +                                  be16_to_cpu(h->command),
 +                                  be16_to_cpu(h->length));
                                goto disconnect;
                        }
                        expect = cmd->pkt_size;
 -                      ERR_IF(len != expect-sizeof(struct p_header))
 +                      ERR_IF(len != expect-sizeof(struct p_header80))
                                goto reconnect;
                }
                if (received == expect) {
  
                        buf      = h;
                        received = 0;
 -                      expect   = sizeof(struct p_header);
 +                      expect   = sizeof(struct p_header80);
                        cmd      = NULL;
                }
        }
        if (0) {
  reconnect:
                drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
 +              drbd_md_sync(mdev);
        }
        if (0) {
  disconnect:
                drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
 +              drbd_md_sync(mdev);
        }
        clear_bit(SIGNAL_ASENDER, &mdev->flags);
  
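The asender hunks above keep DRBD's table-driven packet handling while switching the header type to struct p_header80. Below is a minimal userspace sketch of that pattern (read a fixed header, check the magic, look up the expected size for the command, then dispatch); the magic value, command numbers, and payload sizes are invented for illustration and are not DRBD's wire constants.

/*
 * Userspace sketch of the dispatch-table pattern used by the asender
 * loop: read a fixed-size header, validate the magic, look up the
 * expected packet size for the command, then call the handler.  The
 * magic, commands and sizes below are invented for illustration.
 */
#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>		/* ntohl()/ntohs() stand in for be*_to_cpu() */

#define FAKE_MAGIC 0x83740267u	/* hypothetical, not BE_DRBD_MAGIC */

enum { CMD_PING, CMD_PING_ACK, CMD_BLOCK_ACK, CMD_MAX };

struct wire_header {		/* plays the role of struct p_header80 */
	uint32_t magic;
	uint16_t command;
	uint16_t length;	/* payload bytes following the header */
} __attribute__((packed));

struct cmd_entry {
	size_t pkt_size;			/* header + payload */
	int (*process)(const void *payload);	/* returns 1 on success */
};

static int got_ping(const void *p)      { (void)p; puts("ping");      return 1; }
static int got_ping_ack(const void *p)  { (void)p; puts("ping ack");  return 1; }
static int got_block_ack(const void *p) { (void)p; puts("block ack"); return 1; }

static const struct cmd_entry cmd_table[CMD_MAX] = {
	[CMD_PING]      = { sizeof(struct wire_header),      got_ping },
	[CMD_PING_ACK]  = { sizeof(struct wire_header),      got_ping_ack },
	[CMD_BLOCK_ACK] = { sizeof(struct wire_header) + 24, got_block_ack },
};

static int dispatch(const struct wire_header *h, const void *payload)
{
	unsigned int cmd = ntohs(h->command);
	unsigned int len = ntohs(h->length);

	if (ntohl(h->magic) != FAKE_MAGIC) {
		fprintf(stderr, "bad magic\n");
		return 0;
	}
	if (cmd >= CMD_MAX || !cmd_table[cmd].process) {
		fprintf(stderr, "unknown command %u\n", cmd);
		return 0;
	}
	/* mirrors the ERR_IF(len != expect - sizeof(struct p_header80)) check */
	if (len != cmd_table[cmd].pkt_size - sizeof(struct wire_header)) {
		fprintf(stderr, "unexpected payload length %u\n", len);
		return 0;
	}
	return cmd_table[cmd].process(payload);
}

int main(void)
{
	struct wire_header h = {
		.magic   = htonl(FAKE_MAGIC),
		.command = htons(CMD_PING),
		.length  = htons(0),
	};

	return dispatch(&h, NULL) ? 0 : 1;
}
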
diff --combined drivers/block/loop.c
index de3083b0a4f5de1f246ffdfa469fcb31840e35ee,5d27bc6596de01f179251b951ec292cfd181a917..6c48b3545f84583d0e32f1e87308c39a42b2133c
  #include <linux/compat.h>
  #include <linux/suspend.h>
  #include <linux/freezer.h>
 -#include <linux/smp_lock.h>
 +#include <linux/mutex.h>
  #include <linux/writeback.h>
  #include <linux/buffer_head.h>                /* for invalidate_bdev() */
  #include <linux/completion.h>
  #include <linux/highmem.h>
  #include <linux/kthread.h>
  #include <linux/splice.h>
 +#include <linux/sysfs.h>
  
  #include <asm/uaccess.h>
  
 +static DEFINE_MUTEX(loop_mutex);
  static LIST_HEAD(loop_devices);
  static DEFINE_MUTEX(loop_devices_mutex);
  
@@@ -479,17 -477,17 +479,17 @@@ static int do_bio_filebacked(struct loo
        pos = ((loff_t) bio->bi_sector << 9) + lo->lo_offset;
  
        if (bio_rw(bio) == WRITE) {
-               bool barrier = !!(bio->bi_rw & REQ_HARDBARRIER);
                struct file *file = lo->lo_backing_file;
  
-               if (barrier) {
-                       if (unlikely(!file->f_op->fsync)) {
-                               ret = -EOPNOTSUPP;
-                               goto out;
-                       }
+               /* REQ_HARDBARRIER is deprecated */
+               if (bio->bi_rw & REQ_HARDBARRIER) {
+                       ret = -EOPNOTSUPP;
+                       goto out;
+               }
  
+               if (bio->bi_rw & REQ_FLUSH) {
                        ret = vfs_fsync(file, 0);
-                       if (unlikely(ret)) {
+                       if (unlikely(ret && ret != -EINVAL)) {
                                ret = -EIO;
                                goto out;
                        }
  
                ret = lo_send(lo, bio, pos);
  
-               if (barrier && !ret) {
+               if ((bio->bi_rw & REQ_FUA) && !ret) {
                        ret = vfs_fsync(file, 0);
-                       if (unlikely(ret))
+                       if (unlikely(ret && ret != -EINVAL))
                                ret = -EIO;
                }
        } else
@@@ -739,103 -737,6 +739,103 @@@ static inline int is_loop_device(struc
        return i && S_ISBLK(i->i_mode) && MAJOR(i->i_rdev) == LOOP_MAJOR;
  }
  
 +/* loop sysfs attributes */
 +
 +static ssize_t loop_attr_show(struct device *dev, char *page,
 +                            ssize_t (*callback)(struct loop_device *, char *))
 +{
 +      struct loop_device *l, *lo = NULL;
 +
 +      mutex_lock(&loop_devices_mutex);
 +      list_for_each_entry(l, &loop_devices, lo_list)
 +              if (disk_to_dev(l->lo_disk) == dev) {
 +                      lo = l;
 +                      break;
 +              }
 +      mutex_unlock(&loop_devices_mutex);
 +
 +      return lo ? callback(lo, page) : -EIO;
 +}
 +
 +#define LOOP_ATTR_RO(_name)                                           \
 +static ssize_t loop_attr_##_name##_show(struct loop_device *, char *);        \
 +static ssize_t loop_attr_do_show_##_name(struct device *d,            \
 +                              struct device_attribute *attr, char *b) \
 +{                                                                     \
 +      return loop_attr_show(d, b, loop_attr_##_name##_show);          \
 +}                                                                     \
 +static struct device_attribute loop_attr_##_name =                    \
 +      __ATTR(_name, S_IRUGO, loop_attr_do_show_##_name, NULL);
 +
 +static ssize_t loop_attr_backing_file_show(struct loop_device *lo, char *buf)
 +{
 +      ssize_t ret;
 +      char *p = NULL;
 +
 +      mutex_lock(&lo->lo_ctl_mutex);
 +      if (lo->lo_backing_file)
 +              p = d_path(&lo->lo_backing_file->f_path, buf, PAGE_SIZE - 1);
 +      mutex_unlock(&lo->lo_ctl_mutex);
 +
 +      if (IS_ERR_OR_NULL(p))
 +              ret = PTR_ERR(p);
 +      else {
 +              ret = strlen(p);
 +              memmove(buf, p, ret);
 +              buf[ret++] = '\n';
 +              buf[ret] = 0;
 +      }
 +
 +      return ret;
 +}
 +
 +static ssize_t loop_attr_offset_show(struct loop_device *lo, char *buf)
 +{
 +      return sprintf(buf, "%llu\n", (unsigned long long)lo->lo_offset);
 +}
 +
 +static ssize_t loop_attr_sizelimit_show(struct loop_device *lo, char *buf)
 +{
 +      return sprintf(buf, "%llu\n", (unsigned long long)lo->lo_sizelimit);
 +}
 +
 +static ssize_t loop_attr_autoclear_show(struct loop_device *lo, char *buf)
 +{
 +      int autoclear = (lo->lo_flags & LO_FLAGS_AUTOCLEAR);
 +
 +      return sprintf(buf, "%s\n", autoclear ? "1" : "0");
 +}
 +
 +LOOP_ATTR_RO(backing_file);
 +LOOP_ATTR_RO(offset);
 +LOOP_ATTR_RO(sizelimit);
 +LOOP_ATTR_RO(autoclear);
 +
 +static struct attribute *loop_attrs[] = {
 +      &loop_attr_backing_file.attr,
 +      &loop_attr_offset.attr,
 +      &loop_attr_sizelimit.attr,
 +      &loop_attr_autoclear.attr,
 +      NULL,
 +};
 +
 +static struct attribute_group loop_attribute_group = {
 +      .name = "loop",
 +      .attrs= loop_attrs,
 +};
 +
 +static int loop_sysfs_init(struct loop_device *lo)
 +{
 +      return sysfs_create_group(&disk_to_dev(lo->lo_disk)->kobj,
 +                                &loop_attribute_group);
 +}
 +
 +static void loop_sysfs_exit(struct loop_device *lo)
 +{
 +      sysfs_remove_group(&disk_to_dev(lo->lo_disk)->kobj,
 +                         &loop_attribute_group);
 +}
 +
  static int loop_set_fd(struct loop_device *lo, fmode_t mode,
                       struct block_device *bdev, unsigned int arg)
  {
        lo->lo_queue->unplug_fn = loop_unplug;
  
        if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
-               blk_queue_ordered(lo->lo_queue, QUEUE_ORDERED_DRAIN);
+               blk_queue_flush(lo->lo_queue, REQ_FLUSH);
  
        set_capacity(lo->lo_disk, size);
        bd_set_size(bdev, size << 9);
 +      loop_sysfs_init(lo);
        /* let user-space know about the new size */
        kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
  
        return 0;
  
  out_clr:
 +      loop_sysfs_exit(lo);
        lo->lo_thread = NULL;
        lo->lo_device = NULL;
        lo->lo_backing_file = NULL;
@@@ -1051,7 -950,6 +1051,7 @@@ static int loop_clr_fd(struct loop_devi
        set_capacity(lo->lo_disk, 0);
        if (bdev) {
                bd_set_size(bdev, 0);
 +              loop_sysfs_exit(lo);
                /* let user-space know about this change */
                kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
        }
@@@ -1511,11 -1409,11 +1511,11 @@@ static int lo_open(struct block_device 
  {
        struct loop_device *lo = bdev->bd_disk->private_data;
  
 -      lock_kernel();
 +      mutex_lock(&loop_mutex);
        mutex_lock(&lo->lo_ctl_mutex);
        lo->lo_refcnt++;
        mutex_unlock(&lo->lo_ctl_mutex);
 -      unlock_kernel();
 +      mutex_unlock(&loop_mutex);
  
        return 0;
  }
@@@ -1525,7 -1423,7 +1525,7 @@@ static int lo_release(struct gendisk *d
        struct loop_device *lo = disk->private_data;
        int err;
  
 -      lock_kernel();
 +      mutex_lock(&loop_mutex);
        mutex_lock(&lo->lo_ctl_mutex);
  
        if (--lo->lo_refcnt)
  out:
        mutex_unlock(&lo->lo_ctl_mutex);
  out_unlocked:
 -      lock_kernel();
 +      mutex_unlock(&loop_mutex);
        return 0;
  }
  
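For the file-backed loop device, the hunks above translate the new flags directly into fsync() calls on the backing file: REQ_FLUSH means sync before the write, REQ_FUA means sync again afterwards, and the deprecated REQ_HARDBARRIER is rejected outright. A userspace sketch of that mapping, using stand-in flag values rather than the kernel's:

/*
 * REQ_FLUSH -> fsync() the backing file before the write,
 * REQ_FUA   -> fsync() again afterwards,
 * REQ_HARDBARRIER -> reject.  The X_* values are local stand-ins.
 */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#define X_REQ_FLUSH		(1u << 0)
#define X_REQ_FUA		(1u << 1)
#define X_REQ_HARDBARRIER	(1u << 2)

static int backed_write(int fd, const void *buf, size_t len, off_t pos,
			unsigned int flags)
{
	if (flags & X_REQ_HARDBARRIER)
		return -EOPNOTSUPP;		/* barriers are gone */

	if ((flags & X_REQ_FLUSH) && fsync(fd))	/* pre-flush the cache */
		return -EIO;

	if (pwrite(fd, buf, len, pos) != (ssize_t)len)
		return -EIO;

	if ((flags & X_REQ_FUA) && fsync(fd))	/* make this write durable */
		return -EIO;

	return 0;
}

int main(void)
{
	int fd = open("backing.img", O_RDWR | O_CREAT, 0644);

	if (fd < 0)
		return 1;
	if (backed_write(fd, "data", 4, 0, X_REQ_FLUSH | X_REQ_FUA))
		fprintf(stderr, "write failed\n");
	close(fd);
	return 0;
}
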
diff --combined drivers/block/pktcdvd.c
index ef58fccadad3ada40ad07f6891b2cd3de0234c17,1b5cfcccd6543d5c445d6aa5f224f554ec701cd3..19b3568e9326303543c8ace464361637aadc6ef8
@@@ -57,6 -57,7 +57,6 @@@
  #include <linux/seq_file.h>
  #include <linux/miscdevice.h>
  #include <linux/freezer.h>
 -#include <linux/smp_lock.h>
  #include <linux/mutex.h>
  #include <linux/slab.h>
  #include <scsi/scsi_cmnd.h>
@@@ -85,7 -86,6 +85,7 @@@
  
  #define ZONE(sector, pd) (((sector) + (pd)->offset) & ~((pd)->settings.size - 1))
  
 +static DEFINE_MUTEX(pktcdvd_mutex);
  static struct pktcdvd_device *pkt_devs[MAX_WRITERS];
  static struct proc_dir_entry *pkt_proc;
  static int pktdev_major;
@@@ -753,7 -753,6 +753,6 @@@ static int pkt_generic_packet(struct pk
  
        rq->timeout = 60*HZ;
        rq->cmd_type = REQ_TYPE_BLOCK_PC;
-       rq->cmd_flags |= REQ_HARDBARRIER;
        if (cgc->quiet)
                rq->cmd_flags |= REQ_QUIET;
  
@@@ -2383,7 -2382,7 +2382,7 @@@ static int pkt_open(struct block_devic
  
        VPRINTK(DRIVER_NAME": entering open\n");
  
 -      lock_kernel();
 +      mutex_lock(&pktcdvd_mutex);
        mutex_lock(&ctl_mutex);
        pd = pkt_find_dev_from_minor(MINOR(bdev->bd_dev));
        if (!pd) {
        }
  
        mutex_unlock(&ctl_mutex);
 -      unlock_kernel();
 +      mutex_unlock(&pktcdvd_mutex);
        return 0;
  
  out_dec:
  out:
        VPRINTK(DRIVER_NAME": failed open (%d)\n", ret);
        mutex_unlock(&ctl_mutex);
 -      unlock_kernel();
 +      mutex_unlock(&pktcdvd_mutex);
        return ret;
  }
  
@@@ -2428,7 -2427,7 +2427,7 @@@ static int pkt_close(struct gendisk *di
        struct pktcdvd_device *pd = disk->private_data;
        int ret = 0;
  
 -      lock_kernel();
 +      mutex_lock(&pktcdvd_mutex);
        mutex_lock(&ctl_mutex);
        pd->refcnt--;
        BUG_ON(pd->refcnt < 0);
                pkt_release_dev(pd, flush);
        }
        mutex_unlock(&ctl_mutex);
 -      unlock_kernel();
 +      mutex_unlock(&pktcdvd_mutex);
        return ret;
  }
  
@@@ -2773,7 -2772,7 +2772,7 @@@ static int pkt_ioctl(struct block_devic
        VPRINTK("pkt_ioctl: cmd %x, dev %d:%d\n", cmd,
                MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
  
 -      lock_kernel();
 +      mutex_lock(&pktcdvd_mutex);
        switch (cmd) {
        case CDROMEJECT:
                /*
                VPRINTK(DRIVER_NAME": Unknown ioctl for %s (%x)\n", pd->name, cmd);
                ret = -ENOTTY;
        }
 -      unlock_kernel();
 +      mutex_unlock(&pktcdvd_mutex);
  
        return ret;
  }
@@@ -3046,7 -3045,6 +3045,7 @@@ static const struct file_operations pkt
        .compat_ioctl   = pkt_ctl_compat_ioctl,
  #endif
        .owner          = THIS_MODULE,
 +      .llseek         = no_llseek,
  };
  
  static struct miscdevice pkt_misc = {
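Both loop and pktcdvd also drop lock_kernel()/unlock_kernel() in favour of a driver-local mutex around open/release. A small pthread analogue of that pattern, with illustrative names:

/*
 * pthread analogue of replacing the Big Kernel Lock with a per-driver
 * mutex around open/release.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t pkt_mutex = PTHREAD_MUTEX_INITIALIZER;
static int refcnt;

static int dev_open(void)
{
	pthread_mutex_lock(&pkt_mutex);		/* was: lock_kernel() */
	refcnt++;
	pthread_mutex_unlock(&pkt_mutex);	/* was: unlock_kernel() */
	return 0;
}

static void dev_release(void)
{
	pthread_mutex_lock(&pkt_mutex);
	if (--refcnt == 0)
		puts("last closer: tear down device state");
	pthread_mutex_unlock(&pkt_mutex);
}

int main(void)
{
	dev_open();
	dev_release();
	return 0;
}
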
diff --combined drivers/block/ps3disk.c
index 03688c2da319c007f4923c4ffd989e4f9666b755,4911f9e57bc70e11e782460bd9604721e4f80810..8e1ce2e2916a72cdeff49a8fa46f2179d04691dd
@@@ -113,7 -113,7 +113,7 @@@ static void ps3disk_scatter_gather(stru
                        memcpy(buf, dev->bounce_buf+offset, size);
                offset += size;
                flush_kernel_dcache_page(bvec->bv_page);
 -              bvec_kunmap_irq(bvec, &flags);
 +              bvec_kunmap_irq(buf, &flags);
                i++;
        }
  }
@@@ -468,7 -468,7 +468,7 @@@ static int __devinit ps3disk_probe(stru
        blk_queue_dma_alignment(queue, dev->blk_size-1);
        blk_queue_logical_block_size(queue, dev->blk_size);
  
-       blk_queue_ordered(queue, QUEUE_ORDERED_DRAIN_FLUSH);
+       blk_queue_flush(queue, REQ_FLUSH);
  
        blk_queue_max_segments(queue, -1);
        blk_queue_max_segment_size(queue, dev->bounce_size);
diff --combined drivers/block/virtio_blk.c
index 8320490226b78145f95c7e7801a15080419adc19,831e75caea3d07f7c0269618f5238b3f2aaf4cab..6ecf89cdf006a3f6605d15fb062f134a2d3a0621
@@@ -2,6 -2,7 +2,6 @@@
  #include <linux/spinlock.h>
  #include <linux/slab.h>
  #include <linux/blkdev.h>
 -#include <linux/smp_lock.h>
  #include <linux/hdreg.h>
  #include <linux/virtio.h>
  #include <linux/virtio_blk.h>
@@@ -127,9 -128,6 +127,6 @@@ static bool do_req(struct request_queu
                }
        }
  
-       if (vbr->req->cmd_flags & REQ_HARDBARRIER)
-               vbr->out_hdr.type |= VIRTIO_BLK_T_BARRIER;
        sg_set_buf(&vblk->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr));
  
        /*
@@@ -221,8 -219,8 +218,8 @@@ static int virtblk_get_id(struct gendis
        return err;
  }
  
 -static int virtblk_locked_ioctl(struct block_device *bdev, fmode_t mode,
 -                       unsigned cmd, unsigned long data)
 +static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
 +                           unsigned int cmd, unsigned long data)
  {
        struct gendisk *disk = bdev->bd_disk;
        struct virtio_blk *vblk = disk->private_data;
                              (void __user *)data);
  }
  
 -static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
 -                           unsigned int cmd, unsigned long param)
 -{
 -      int ret;
 -
 -      lock_kernel();
 -      ret = virtblk_locked_ioctl(bdev, mode, cmd, param);
 -      unlock_kernel();
 -
 -      return ret;
 -}
 -
  /* We provide getgeo only to please some old bootloader/partitioning tools */
  static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo)
  {
@@@ -379,31 -389,9 +376,9 @@@ static int __devinit virtblk_probe(stru
        vblk->disk->driverfs_dev = &vdev->dev;
        index++;
  
-       if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH)) {
-               /*
-                * If the FLUSH feature is supported we do have support for
-                * flushing a volatile write cache on the host.  Use that
-                * to implement write barrier support.
-                */
-               blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH);
-       } else if (virtio_has_feature(vdev, VIRTIO_BLK_F_BARRIER)) {
-               /*
-                * If the BARRIER feature is supported the host expects us
-                * to order request by tags.  This implies there is not
-                * volatile write cache on the host, and that the host
-                * never re-orders outstanding I/O.  This feature is not
-                * useful for real life scenarious and deprecated.
-                */
-               blk_queue_ordered(q, QUEUE_ORDERED_TAG);
-       } else {
-               /*
-                * If the FLUSH feature is not supported we must assume that
-                * the host does not perform any kind of volatile write
-                * caching. We still need to drain the queue to provider
-                * proper barrier semantics.
-                */
-               blk_queue_ordered(q, QUEUE_ORDERED_DRAIN);
-       }
+       /* configure queue flush support */
+       if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH))
+               blk_queue_flush(q, REQ_FLUSH);
  
        /* If disk is read-only in the host, the guest should obey */
        if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO))
@@@ -522,9 -510,9 +497,9 @@@ static const struct virtio_device_id id
  };
  
  static unsigned int features[] = {
-       VIRTIO_BLK_F_BARRIER, VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX,
-       VIRTIO_BLK_F_GEOMETRY, VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE,
-       VIRTIO_BLK_F_SCSI, VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY
+       VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
+       VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, VIRTIO_BLK_F_SCSI,
+       VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY
  };
  
  /*
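virtio-blk no longer picks between barrier/ordered modes; it only checks whether the device advertises the FLUSH feature (a volatile write cache on the host) and, if so, enables flush support on the queue. A compact sketch with stand-in feature and flag values:

/*
 * Only the FLUSH feature matters now, and it maps directly to enabling
 * flush support on the queue.  Feature bit, flag and queue type are
 * stand-ins, not the virtio or block-layer definitions.
 */
#include <stdio.h>

#define X_FEAT_FLUSH	(1u << 9)	/* not VIRTIO_BLK_F_FLUSH */
#define X_REQ_FLUSH	(1u << 0)	/* not the kernel's REQ_FLUSH */

struct queue { unsigned int flush_flags; };

/* stand-in for blk_queue_flush(): record what the device can honour */
static void queue_set_flush(struct queue *q, unsigned int flags)
{
	q->flush_flags = flags;
}

static void configure_flush(struct queue *q, unsigned int features)
{
	if (features & X_FEAT_FLUSH)	/* host has a volatile write cache */
		queue_set_flush(q, X_REQ_FLUSH);
	else
		queue_set_flush(q, 0);	/* assume write-through: nothing to flush */
}

int main(void)
{
	struct queue q = { 0 };

	configure_flush(&q, X_FEAT_FLUSH);
	printf("flush %s\n", q.flush_flags ? "enabled" : "disabled");
	return 0;
}
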
diff --combined drivers/block/xen-blkfront.c
index 3ff06f475eef47bc841b5c0877527ce9d47ea32b,739b4c1416eab3b4aac7cc2389bec131b4a09752..4b33a18c32e0c91959442deab466741b7e245ddf
@@@ -41,7 -41,7 +41,7 @@@
  #include <linux/cdrom.h>
  #include <linux/module.h>
  #include <linux/slab.h>
 -#include <linux/smp_lock.h>
 +#include <linux/mutex.h>
  #include <linux/scatterlist.h>
  
  #include <xen/xen.h>
@@@ -69,7 -69,6 +69,7 @@@ struct blk_shadow 
        unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST];
  };
  
 +static DEFINE_MUTEX(blkfront_mutex);
  static const struct block_device_operations xlvbd_block_fops;
  
  #define BLK_RING_SIZE __RING_SIZE((struct blkif_sring *)0, PAGE_SIZE)
@@@ -96,7 -95,7 +96,7 @@@ struct blkfront_inf
        struct gnttab_free_callback callback;
        struct blk_shadow shadow[BLK_RING_SIZE];
        unsigned long shadow_free;
-       int feature_barrier;
+       unsigned int feature_flush;
        int is_ready;
  };
  
@@@ -419,26 -418,12 +419,12 @@@ static int xlvbd_init_blk_queue(struct 
  }
  
  
- static int xlvbd_barrier(struct blkfront_info *info)
+ static void xlvbd_flush(struct blkfront_info *info)
  {
-       int err;
-       const char *barrier;
-       switch (info->feature_barrier) {
-       case QUEUE_ORDERED_DRAIN:       barrier = "enabled (drain)"; break;
-       case QUEUE_ORDERED_TAG:         barrier = "enabled (tag)"; break;
-       case QUEUE_ORDERED_NONE:        barrier = "disabled"; break;
-       default:                        return -EINVAL;
-       }
-       err = blk_queue_ordered(info->rq, info->feature_barrier);
-       if (err)
-               return err;
+       blk_queue_flush(info->rq, info->feature_flush);
        printk(KERN_INFO "blkfront: %s: barriers %s\n",
-              info->gd->disk_name, barrier);
-       return 0;
+              info->gd->disk_name,
+              info->feature_flush ? "enabled" : "disabled");
  }
  
  
@@@ -517,7 -502,7 +503,7 @@@ static int xlvbd_alloc_gendisk(blkif_se
        info->rq = gd->queue;
        info->gd = gd;
  
-       xlvbd_barrier(info);
+       xlvbd_flush(info);
  
        if (vdisk_info & VDISK_READONLY)
                set_disk_ro(gd, 1);
@@@ -663,8 -648,8 +649,8 @@@ static irqreturn_t blkif_interrupt(int 
                                printk(KERN_WARNING "blkfront: %s: write barrier op failed\n",
                                       info->gd->disk_name);
                                error = -EOPNOTSUPP;
-                               info->feature_barrier = QUEUE_ORDERED_NONE;
-                               xlvbd_barrier(info);
+                               info->feature_flush = 0;
+                               xlvbd_flush(info);
                        }
                        /* fall through */
                case BLKIF_OP_READ:
@@@ -1077,20 -1062,20 +1063,20 @@@ static void blkfront_connect(struct blk
        /*
         * If there's no "feature-barrier" defined, then it means
         * we're dealing with a very old backend which writes
-        * synchronously; draining will do what needs to get done.
+        * synchronously; nothing to do.
         *
-        * If there are barriers, then we can do full queued writes
-        * with tagged barriers.
-        *
-        * If barriers are not supported, then there's no much we can
-        * do, so just set ordering to NONE.
+        * If there are barriers, then we use flush.
         */
-       if (err)
-               info->feature_barrier = QUEUE_ORDERED_DRAIN;
-       else if (barrier)
-               info->feature_barrier = QUEUE_ORDERED_TAG;
-       else
-               info->feature_barrier = QUEUE_ORDERED_NONE;
+       info->feature_flush = 0;
+       /*
+        * The driver doesn't properly handle empty flushes, so
+        * let's disable barrier support for now.
+        */
+ #if 0
+       if (!err && barrier)
+               info->feature_flush = REQ_FLUSH;
+ #endif
  
        err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size);
        if (err) {
@@@ -1202,7 -1187,7 +1188,7 @@@ static int blkif_open(struct block_devi
        struct blkfront_info *info;
        int err = 0;
  
 -      lock_kernel();
 +      mutex_lock(&blkfront_mutex);
  
        info = disk->private_data;
        if (!info) {
        mutex_unlock(&info->mutex);
  
  out:
 -      unlock_kernel();
 +      mutex_unlock(&blkfront_mutex);
        return err;
  }
  
@@@ -1230,7 -1215,7 +1216,7 @@@ static int blkif_release(struct gendis
        struct block_device *bdev;
        struct xenbus_device *xbdev;
  
 -      lock_kernel();
 +      mutex_lock(&blkfront_mutex);
  
        bdev = bdget_disk(disk, 0);
        bdput(bdev);
        }
  
  out:
 -      unlock_kernel();
 +      mutex_unlock(&blkfront_mutex);
        return 0;
  }
  
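blkfront collapses the old three-way feature_barrier setting into a single feature_flush value and, when the backend fails a flush write, clears it and reprograms the queue so no further flushes are issued. A sketch of that downgrade path, with stand-in types and error value:

/*
 * A backend that fails a flush write simply has the feature cleared and
 * the queue reprogrammed.
 */
#include <stdio.h>

#define X_REQ_FLUSH	(1u << 0)	/* stand-in flag */
#define X_EOPNOTSUPP	95		/* illustrative error value */

struct blkfront { unsigned int feature_flush; };

static void reconfigure_queue(struct blkfront *info)
{
	printf("blkfront: flush %s\n",
	       info->feature_flush ? "enabled" : "disabled");
}

/* called from completion handling when a flush write comes back */
static void complete_flush_write(struct blkfront *info, int status)
{
	if (status == -X_EOPNOTSUPP && info->feature_flush) {
		/* backend cannot honour it: stop asking */
		info->feature_flush = 0;
		reconfigure_queue(info);
	}
}

int main(void)
{
	struct blkfront info = { .feature_flush = X_REQ_FLUSH };

	reconfigure_queue(&info);
	complete_flush_write(&info, -X_EOPNOTSUPP);
	return 0;
}
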
diff --combined drivers/md/dm-snap.c
index f30f6e8d594e1cc90b52d6d73a4b110367ae2d6f,eed210152b75802930369d1953d6006aaedc993a..53cf79d8bcbc5aa24c7e004d588aff0e2b4676bc
@@@ -706,6 -706,8 +706,6 @@@ static int dm_add_exception(void *conte
        return 0;
  }
  
 -#define min_not_zero(l, r) (((l) == 0) ? (r) : (((r) == 0) ? (l) : min(l, r)))
 -
  /*
   * Return a minimum chunk size of all snapshots that have the specified origin.
   * Return zero if the origin has no snapshots.
@@@ -1585,7 -1587,7 +1585,7 @@@ static int snapshot_map(struct dm_targe
        chunk_t chunk;
        struct dm_snap_pending_exception *pe = NULL;
  
-       if (unlikely(bio_empty_barrier(bio))) {
+       if (bio->bi_rw & REQ_FLUSH) {
                bio->bi_bdev = s->cow->bdev;
                return DM_MAPIO_REMAPPED;
        }
@@@ -1689,7 -1691,7 +1689,7 @@@ static int snapshot_merge_map(struct dm
        int r = DM_MAPIO_REMAPPED;
        chunk_t chunk;
  
-       if (unlikely(bio_empty_barrier(bio))) {
+       if (bio->bi_rw & REQ_FLUSH) {
                if (!map_context->target_request_nr)
                        bio->bi_bdev = s->origin->bdev;
                else
@@@ -2133,7 -2135,7 +2133,7 @@@ static int origin_map(struct dm_target 
        struct dm_dev *dev = ti->private;
        bio->bi_bdev = dev->bdev;
  
-       if (unlikely(bio_empty_barrier(bio)))
+       if (bio->bi_rw & REQ_FLUSH)
                return DM_MAPIO_REMAPPED;
  
        /* Only tell snapshots if this is a write */
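In dm-snap the empty-barrier test becomes a plain REQ_FLUSH check: a flush carries no data, so it is redirected to the underlying device without consulting the exception store. A minimal sketch with made-up types:

/*
 * A FLUSH bio is routed straight to the underlying device and never
 * consults the exception store.
 */
#include <stdio.h>

#define X_REQ_FLUSH (1u << 0)

struct bio { unsigned int rw; unsigned long long sector; };
struct snap { const char *cow_dev; const char *origin_dev; };

static const char *snapshot_map_sketch(struct snap *s, struct bio *bio)
{
	if (bio->rw & X_REQ_FLUSH)
		return s->cow_dev;	/* bypass chunk/exception handling */

	/* data I/O would compute the chunk and look up exceptions here */
	return s->origin_dev;
}

int main(void)
{
	struct snap s = { "cow", "origin" };
	struct bio flush = { .rw = X_REQ_FLUSH };

	printf("flush routed to %s device\n", snapshot_map_sketch(&s, &flush));
	return 0;
}
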
diff --combined drivers/md/dm.c
index 7967eca5a2d5fdaf3a009dde0eee189437103dc9,f934e9878436300252483632b31693bb6e497ea8..7cb1352f7e7a5e2b4b5e400319b981406ec69005
@@@ -15,6 -15,7 +15,6 @@@
  #include <linux/blkpg.h>
  #include <linux/bio.h>
  #include <linux/buffer_head.h>
 -#include <linux/smp_lock.h>
  #include <linux/mempool.h>
  #include <linux/slab.h>
  #include <linux/idr.h>
@@@ -32,7 -33,6 +32,7 @@@
  #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
  #define DM_COOKIE_LENGTH 24
  
 +static DEFINE_MUTEX(dm_mutex);
  static const char *_name = DM_NAME;
  
  static unsigned int major = 0;
@@@ -110,7 -110,6 +110,6 @@@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo)
  #define DMF_FREEING 3
  #define DMF_DELETING 4
  #define DMF_NOFLUSH_SUSPENDING 5
- #define DMF_QUEUE_IO_TO_THREAD 6
  
  /*
   * Work processed by per-device workqueue.
@@@ -144,24 -143,9 +143,9 @@@ struct mapped_device 
        spinlock_t deferred_lock;
  
        /*
-        * An error from the barrier request currently being processed.
-        */
-       int barrier_error;
-       /*
-        * Protect barrier_error from concurrent endio processing
-        * in request-based dm.
-        */
-       spinlock_t barrier_error_lock;
-       /*
-        * Processing queue (flush/barriers)
+        * Processing queue (flush)
         */
        struct workqueue_struct *wq;
-       struct work_struct barrier_work;
-       /* A pointer to the currently processing pre/post flush request */
-       struct request *flush_request;
  
        /*
         * The current mapping.
        /* sysfs handle */
        struct kobject kobj;
  
-       /* zero-length barrier that will be cloned and submitted to targets */
-       struct bio barrier_bio;
+       /* zero-length flush that will be cloned and submitted to targets */
+       struct bio flush_bio;
  };
  
  /*
@@@ -344,7 -328,7 +328,7 @@@ static int dm_blk_open(struct block_dev
  {
        struct mapped_device *md;
  
 -      lock_kernel();
 +      mutex_lock(&dm_mutex);
        spin_lock(&_minor_lock);
  
        md = bdev->bd_disk->private_data;
  
  out:
        spin_unlock(&_minor_lock);
 -      unlock_kernel();
 +      mutex_unlock(&dm_mutex);
  
        return md ? 0 : -ENXIO;
  }
@@@ -371,10 -355,10 +355,10 @@@ static int dm_blk_close(struct gendisk 
  {
        struct mapped_device *md = disk->private_data;
  
 -      lock_kernel();
 +      mutex_lock(&dm_mutex);
        atomic_dec(&md->open_count);
        dm_put(md);
 -      unlock_kernel();
 +      mutex_unlock(&dm_mutex);
  
        return 0;
  }
@@@ -512,7 -496,7 +496,7 @@@ static void end_io_acct(struct dm_io *i
  
        /*
         * After this is decremented the bio must not be touched if it is
-        * a barrier.
+        * a flush.
         */
        dm_disk(md)->part0.in_flight[rw] = pending =
                atomic_dec_return(&md->pending[rw]);
   */
  static void queue_io(struct mapped_device *md, struct bio *bio)
  {
-       down_write(&md->io_lock);
+       unsigned long flags;
  
-       spin_lock_irq(&md->deferred_lock);
+       spin_lock_irqsave(&md->deferred_lock, flags);
        bio_list_add(&md->deferred, bio);
-       spin_unlock_irq(&md->deferred_lock);
-       if (!test_and_set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags))
-               queue_work(md->wq, &md->work);
-       up_write(&md->io_lock);
+       spin_unlock_irqrestore(&md->deferred_lock, flags);
+       queue_work(md->wq, &md->work);
  }
  
  /*
@@@ -625,11 -605,9 +605,9 @@@ static void dec_pending(struct dm_io *i
                         * Target requested pushing back the I/O.
                         */
                        spin_lock_irqsave(&md->deferred_lock, flags);
-                       if (__noflush_suspending(md)) {
-                               if (!(io->bio->bi_rw & REQ_HARDBARRIER))
-                                       bio_list_add_head(&md->deferred,
-                                                         io->bio);
-                       } else
+                       if (__noflush_suspending(md))
+                               bio_list_add_head(&md->deferred, io->bio);
+                       else
                                /* noflush suspend was interrupted. */
                                io->error = -EIO;
                        spin_unlock_irqrestore(&md->deferred_lock, flags);
  
                io_error = io->error;
                bio = io->bio;
+               end_io_acct(io);
+               free_io(md, io);
+               if (io_error == DM_ENDIO_REQUEUE)
+                       return;
  
-               if (bio->bi_rw & REQ_HARDBARRIER) {
+               if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) {
                        /*
-                        * There can be just one barrier request so we use
-                        * a per-device variable for error reporting.
-                        * Note that you can't touch the bio after end_io_acct
-                        *
-                        * We ignore -EOPNOTSUPP for empty flush reported by
-                        * underlying devices. We assume that if the device
-                        * doesn't support empty barriers, it doesn't need
-                        * cache flushing commands.
+                        * Preflush done for flush with data, reissue
+                        * without REQ_FLUSH.
                         */
-                       if (!md->barrier_error &&
-                           !(bio_empty_barrier(bio) && io_error == -EOPNOTSUPP))
-                               md->barrier_error = io_error;
-                       end_io_acct(io);
-                       free_io(md, io);
+                       bio->bi_rw &= ~REQ_FLUSH;
+                       queue_io(md, bio);
                } else {
-                       end_io_acct(io);
-                       free_io(md, io);
-                       if (io_error != DM_ENDIO_REQUEUE) {
-                               trace_block_bio_complete(md->queue, bio);
-                               bio_endio(bio, io_error);
-                       }
+                       /* done with normal IO or empty flush */
+                       trace_block_bio_complete(md->queue, bio);
+                       bio_endio(bio, io_error);
                }
        }
  }
@@@ -755,23 -724,6 +724,6 @@@ static void end_clone_bio(struct bio *c
        blk_update_request(tio->orig, 0, nr_bytes);
  }
  
- static void store_barrier_error(struct mapped_device *md, int error)
- {
-       unsigned long flags;
-       spin_lock_irqsave(&md->barrier_error_lock, flags);
-       /*
-        * Basically, the first error is taken, but:
-        *   -EOPNOTSUPP supersedes any I/O error.
-        *   Requeue request supersedes any I/O error but -EOPNOTSUPP.
-        */
-       if (!md->barrier_error || error == -EOPNOTSUPP ||
-           (md->barrier_error != -EOPNOTSUPP &&
-            error == DM_ENDIO_REQUEUE))
-               md->barrier_error = error;
-       spin_unlock_irqrestore(&md->barrier_error_lock, flags);
- }
  /*
   * Don't touch any member of the md after calling this function because
   * the md may be freed in dm_put() at the end of this function.
@@@ -809,13 -761,11 +761,11 @@@ static void free_rq_clone(struct reques
  static void dm_end_request(struct request *clone, int error)
  {
        int rw = rq_data_dir(clone);
-       int run_queue = 1;
-       bool is_barrier = clone->cmd_flags & REQ_HARDBARRIER;
        struct dm_rq_target_io *tio = clone->end_io_data;
        struct mapped_device *md = tio->md;
        struct request *rq = tio->orig;
  
-       if (rq->cmd_type == REQ_TYPE_BLOCK_PC && !is_barrier) {
+       if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
                rq->errors = clone->errors;
                rq->resid_len = clone->resid_len;
  
        }
  
        free_rq_clone(clone);
-       if (unlikely(is_barrier)) {
-               if (unlikely(error))
-                       store_barrier_error(md, error);
-               run_queue = 0;
-       } else
-               blk_end_request_all(rq, error);
-       rq_completed(md, rw, run_queue);
+       blk_end_request_all(rq, error);
+       rq_completed(md, rw, true);
  }
  
  static void dm_unprep_request(struct request *rq)
@@@ -862,16 -805,6 +805,6 @@@ void dm_requeue_unmapped_request(struc
        struct request_queue *q = rq->q;
        unsigned long flags;
  
-       if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) {
-               /*
-                * Barrier clones share an original request.
-                * Leave it to dm_end_request(), which handles this special
-                * case.
-                */
-               dm_end_request(clone, DM_ENDIO_REQUEUE);
-               return;
-       }
        dm_unprep_request(rq);
  
        spin_lock_irqsave(q->queue_lock, flags);
@@@ -961,19 -894,6 +894,6 @@@ static void dm_complete_request(struct 
        struct dm_rq_target_io *tio = clone->end_io_data;
        struct request *rq = tio->orig;
  
-       if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) {
-               /*
-                * Barrier clones share an original request.  So can't use
-                * softirq_done with the original.
-                * Pass the clone to dm_done() directly in this special case.
-                * It is safe (even if clone->q->queue_lock is held here)
-                * because there is no I/O dispatching during the completion
-                * of barrier clone.
-                */
-               dm_done(clone, error, true);
-               return;
-       }
        tio->error = error;
        rq->completion_data = clone;
        blk_complete_request(rq);
@@@ -990,17 -910,6 +910,6 @@@ void dm_kill_unmapped_request(struct re
        struct dm_rq_target_io *tio = clone->end_io_data;
        struct request *rq = tio->orig;
  
-       if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) {
-               /*
-                * Barrier clones share an original request.
-                * Leave it to dm_end_request(), which handles this special
-                * case.
-                */
-               BUG_ON(error > 0);
-               dm_end_request(clone, error);
-               return;
-       }
        rq->cmd_flags |= REQ_FAILED;
        dm_complete_request(clone, error);
  }
@@@ -1119,7 -1028,7 +1028,7 @@@ static void dm_bio_destructor(struct bi
  }
  
  /*
-  * Creates a little bio that is just does part of a bvec.
+  * Creates a little bio that just does part of a bvec.
   */
  static struct bio *split_bvec(struct bio *bio, sector_t sector,
                              unsigned short idx, unsigned int offset,
  
        clone->bi_sector = sector;
        clone->bi_bdev = bio->bi_bdev;
-       clone->bi_rw = bio->bi_rw & ~REQ_HARDBARRIER;
+       clone->bi_rw = bio->bi_rw;
        clone->bi_vcnt = 1;
        clone->bi_size = to_bytes(len);
        clone->bi_io_vec->bv_offset = offset;
@@@ -1161,7 -1070,6 +1070,6 @@@ static struct bio *clone_bio(struct bi
  
        clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
        __bio_clone(clone, bio);
-       clone->bi_rw &= ~REQ_HARDBARRIER;
        clone->bi_destructor = dm_bio_destructor;
        clone->bi_sector = sector;
        clone->bi_idx = idx;
@@@ -1225,16 -1133,15 +1133,15 @@@ static void __issue_target_requests(str
                __issue_target_request(ci, ti, request_nr, len);
  }
  
- static int __clone_and_map_empty_barrier(struct clone_info *ci)
+ static int __clone_and_map_empty_flush(struct clone_info *ci)
  {
        unsigned target_nr = 0;
        struct dm_target *ti;
  
+       BUG_ON(bio_has_data(ci->bio));
        while ((ti = dm_table_get_target(ci->map, target_nr++)))
                __issue_target_requests(ci, ti, ti->num_flush_requests, 0);
  
-       ci->sector_count = 0;
        return 0;
  }
  
@@@ -1289,9 -1196,6 +1196,6 @@@ static int __clone_and_map(struct clone
        sector_t len = 0, max;
        struct dm_target_io *tio;
  
-       if (unlikely(bio_empty_barrier(bio)))
-               return __clone_and_map_empty_barrier(ci);
        if (unlikely(bio->bi_rw & REQ_DISCARD))
                return __clone_and_map_discard(ci);
  
@@@ -1383,16 -1287,11 +1287,11 @@@ static void __split_and_process_bio(str
  
        ci.map = dm_get_live_table(md);
        if (unlikely(!ci.map)) {
-               if (!(bio->bi_rw & REQ_HARDBARRIER))
-                       bio_io_error(bio);
-               else
-                       if (!md->barrier_error)
-                               md->barrier_error = -EIO;
+               bio_io_error(bio);
                return;
        }
  
        ci.md = md;
-       ci.bio = bio;
        ci.io = alloc_io(md);
        ci.io->error = 0;
        atomic_set(&ci.io->io_count, 1);
        ci.io->md = md;
        spin_lock_init(&ci.io->endio_lock);
        ci.sector = bio->bi_sector;
-       ci.sector_count = bio_sectors(bio);
-       if (unlikely(bio_empty_barrier(bio)))
-               ci.sector_count = 1;
        ci.idx = bio->bi_idx;
  
        start_io_acct(ci.io);
-       while (ci.sector_count && !error)
-               error = __clone_and_map(&ci);
+       if (bio->bi_rw & REQ_FLUSH) {
+               ci.bio = &ci.md->flush_bio;
+               ci.sector_count = 0;
+               error = __clone_and_map_empty_flush(&ci);
+               /* dec_pending submits any data associated with flush */
+       } else {
+               ci.bio = bio;
+               ci.sector_count = bio_sectors(bio);
+               while (ci.sector_count && !error)
+                       error = __clone_and_map(&ci);
+       }
  
        /* drop the extra reference count */
        dec_pending(ci.io, error);
@@@ -1491,22 -1396,14 +1396,14 @@@ static int _dm_request(struct request_q
        part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio));
        part_stat_unlock();
  
-       /*
-        * If we're suspended or the thread is processing barriers
-        * we have to queue this io for later.
-        */
-       if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) ||
-           unlikely(bio->bi_rw & REQ_HARDBARRIER)) {
+       /* if we're suspended, we have to queue this io for later */
+       if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
                up_read(&md->io_lock);
  
-               if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) &&
-                   bio_rw(bio) == READA) {
+               if (bio_rw(bio) != READA)
+                       queue_io(md, bio);
+               else
                        bio_io_error(bio);
-                       return 0;
-               }
-               queue_io(md, bio);
                return 0;
        }
  
@@@ -1537,14 -1434,6 +1434,6 @@@ static int dm_request(struct request_qu
        return _dm_request(q, bio);
  }
  
- static bool dm_rq_is_flush_request(struct request *rq)
- {
-       if (rq->cmd_flags & REQ_FLUSH)
-               return true;
-       else
-               return false;
- }
  void dm_dispatch_request(struct request *rq)
  {
        int r;
@@@ -1592,22 -1481,15 +1481,15 @@@ static int setup_clone(struct request *
  {
        int r;
  
-       if (dm_rq_is_flush_request(rq)) {
-               blk_rq_init(NULL, clone);
-               clone->cmd_type = REQ_TYPE_FS;
-               clone->cmd_flags |= (REQ_HARDBARRIER | WRITE);
-       } else {
-               r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
-                                     dm_rq_bio_constructor, tio);
-               if (r)
-                       return r;
-               clone->cmd = rq->cmd;
-               clone->cmd_len = rq->cmd_len;
-               clone->sense = rq->sense;
-               clone->buffer = rq->buffer;
-       }
+       r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
+                             dm_rq_bio_constructor, tio);
+       if (r)
+               return r;
  
+       clone->cmd = rq->cmd;
+       clone->cmd_len = rq->cmd_len;
+       clone->sense = rq->sense;
+       clone->buffer = rq->buffer;
        clone->end_io = end_clone_request;
        clone->end_io_data = tio;
  
@@@ -1648,9 -1530,6 +1530,6 @@@ static int dm_prep_fn(struct request_qu
        struct mapped_device *md = q->queuedata;
        struct request *clone;
  
-       if (unlikely(dm_rq_is_flush_request(rq)))
-               return BLKPREP_OK;
        if (unlikely(rq->special)) {
                DMWARN("Already has something in rq->special.");
                return BLKPREP_KILL;
@@@ -1727,6 -1606,7 +1606,7 @@@ static void dm_request_fn(struct reques
        struct dm_table *map = dm_get_live_table(md);
        struct dm_target *ti;
        struct request *rq, *clone;
+       sector_t pos;
  
        /*
         * For suspend, check blk_queue_stopped() and increment
                if (!rq)
                        goto plug_and_out;
  
-               if (unlikely(dm_rq_is_flush_request(rq))) {
-                       BUG_ON(md->flush_request);
-                       md->flush_request = rq;
-                       blk_start_request(rq);
-                       queue_work(md->wq, &md->barrier_work);
-                       goto out;
-               }
+               /* always use block 0 to find the target for flushes for now */
+               pos = 0;
+               if (!(rq->cmd_flags & REQ_FLUSH))
+                       pos = blk_rq_pos(rq);
+               ti = dm_table_find_target(map, pos);
+               BUG_ON(!dm_target_is_valid(ti));
  
-               ti = dm_table_find_target(map, blk_rq_pos(rq));
                if (ti->type->busy && ti->type->busy(ti))
                        goto plug_and_out;
  
@@@ -1918,7 -1797,6 +1797,6 @@@ out
  static const struct block_device_operations dm_blk_dops;
  
  static void dm_wq_work(struct work_struct *work);
- static void dm_rq_barrier_work(struct work_struct *work);
  
  static void dm_init_md_queue(struct mapped_device *md)
  {
        blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
        md->queue->unplug_fn = dm_unplug_all;
        blk_queue_merge_bvec(md->queue, dm_merge_bvec);
+       blk_queue_flush(md->queue, REQ_FLUSH | REQ_FUA);
  }
  
  /*
@@@ -1972,7 -1851,6 +1851,6 @@@ static struct mapped_device *alloc_dev(
        mutex_init(&md->suspend_lock);
        mutex_init(&md->type_lock);
        spin_lock_init(&md->deferred_lock);
-       spin_lock_init(&md->barrier_error_lock);
        rwlock_init(&md->map_lock);
        atomic_set(&md->holders, 1);
        atomic_set(&md->open_count, 0);
        atomic_set(&md->pending[1], 0);
        init_waitqueue_head(&md->wait);
        INIT_WORK(&md->work, dm_wq_work);
-       INIT_WORK(&md->barrier_work, dm_rq_barrier_work);
        init_waitqueue_head(&md->eventq);
  
        md->disk->major = _major;
        if (!md->bdev)
                goto bad_bdev;
  
+       bio_init(&md->flush_bio);
+       md->flush_bio.bi_bdev = md->bdev;
+       md->flush_bio.bi_rw = WRITE_FLUSH;
        /* Populate the mapping, nobody knows we exist yet */
        spin_lock(&_minor_lock);
        old_md = idr_replace(&_minor_idr, md, minor);
@@@ -2245,7 -2126,6 +2126,6 @@@ static int dm_init_request_based_queue(
        blk_queue_softirq_done(md->queue, dm_softirq_done);
        blk_queue_prep_rq(md->queue, dm_prep_fn);
        blk_queue_lld_busy(md->queue, dm_lld_busy);
-       blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN_FLUSH);
  
        elv_register_queue(md->queue);
  
@@@ -2406,43 -2286,6 +2286,6 @@@ static int dm_wait_for_completion(struc
        return r;
  }
  
- static void dm_flush(struct mapped_device *md)
- {
-       dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
-       bio_init(&md->barrier_bio);
-       md->barrier_bio.bi_bdev = md->bdev;
-       md->barrier_bio.bi_rw = WRITE_BARRIER;
-       __split_and_process_bio(md, &md->barrier_bio);
-       dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
- }
- static void process_barrier(struct mapped_device *md, struct bio *bio)
- {
-       md->barrier_error = 0;
-       dm_flush(md);
-       if (!bio_empty_barrier(bio)) {
-               __split_and_process_bio(md, bio);
-               /*
-                * If the request isn't supported, don't waste time with
-                * the second flush.
-                */
-               if (md->barrier_error != -EOPNOTSUPP)
-                       dm_flush(md);
-       }
-       if (md->barrier_error != DM_ENDIO_REQUEUE)
-               bio_endio(bio, md->barrier_error);
-       else {
-               spin_lock_irq(&md->deferred_lock);
-               bio_list_add_head(&md->deferred, bio);
-               spin_unlock_irq(&md->deferred_lock);
-       }
- }
  /*
   * Process the deferred bios
   */
@@@ -2452,33 -2295,27 +2295,27 @@@ static void dm_wq_work(struct work_stru
                                                work);
        struct bio *c;
  
-       down_write(&md->io_lock);
+       down_read(&md->io_lock);
  
        while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
                spin_lock_irq(&md->deferred_lock);
                c = bio_list_pop(&md->deferred);
                spin_unlock_irq(&md->deferred_lock);
  
-               if (!c) {
-                       clear_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
+               if (!c)
                        break;
-               }
  
-               up_write(&md->io_lock);
+               up_read(&md->io_lock);
  
                if (dm_request_based(md))
                        generic_make_request(c);
-               else {
-                       if (c->bi_rw & REQ_HARDBARRIER)
-                               process_barrier(md, c);
-                       else
-                               __split_and_process_bio(md, c);
-               }
+               else
+                       __split_and_process_bio(md, c);
  
-               down_write(&md->io_lock);
+               down_read(&md->io_lock);
        }
  
-       up_write(&md->io_lock);
+       up_read(&md->io_lock);
  }
  
  static void dm_queue_flush(struct mapped_device *md)
        queue_work(md->wq, &md->work);
  }
  
- static void dm_rq_set_target_request_nr(struct request *clone, unsigned request_nr)
- {
-       struct dm_rq_target_io *tio = clone->end_io_data;
-       tio->info.target_request_nr = request_nr;
- }
- /* Issue barrier requests to targets and wait for their completion. */
- static int dm_rq_barrier(struct mapped_device *md)
- {
-       int i, j;
-       struct dm_table *map = dm_get_live_table(md);
-       unsigned num_targets = dm_table_get_num_targets(map);
-       struct dm_target *ti;
-       struct request *clone;
-       md->barrier_error = 0;
-       for (i = 0; i < num_targets; i++) {
-               ti = dm_table_get_target(map, i);
-               for (j = 0; j < ti->num_flush_requests; j++) {
-                       clone = clone_rq(md->flush_request, md, GFP_NOIO);
-                       dm_rq_set_target_request_nr(clone, j);
-                       atomic_inc(&md->pending[rq_data_dir(clone)]);
-                       map_request(ti, clone, md);
-               }
-       }
-       dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
-       dm_table_put(map);
-       return md->barrier_error;
- }
- static void dm_rq_barrier_work(struct work_struct *work)
- {
-       int error;
-       struct mapped_device *md = container_of(work, struct mapped_device,
-                                               barrier_work);
-       struct request_queue *q = md->queue;
-       struct request *rq;
-       unsigned long flags;
-       /*
-        * Hold the md reference here and leave it at the last part so that
-        * the md can't be deleted by device opener when the barrier request
-        * completes.
-        */
-       dm_get(md);
-       error = dm_rq_barrier(md);
-       rq = md->flush_request;
-       md->flush_request = NULL;
-       if (error == DM_ENDIO_REQUEUE) {
-               spin_lock_irqsave(q->queue_lock, flags);
-               blk_requeue_request(q, rq);
-               spin_unlock_irqrestore(q->queue_lock, flags);
-       } else
-               blk_end_request_all(rq, error);
-       blk_run_queue(q);
-       dm_put(md);
- }
  /*
   * Swap in a new table, returning the old one for the caller to destroy.
   */
@@@ -2677,23 -2447,17 +2447,17 @@@ int dm_suspend(struct mapped_device *md
         *
         * To get all processes out of __split_and_process_bio in dm_request,
         * we take the write lock. To prevent any process from reentering
-        * __split_and_process_bio from dm_request, we set
-        * DMF_QUEUE_IO_TO_THREAD.
-        *
-        * To quiesce the thread (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND
-        * and call flush_workqueue(md->wq). flush_workqueue will wait until
-        * dm_wq_work exits and DMF_BLOCK_IO_FOR_SUSPEND will prevent any
-        * further calls to __split_and_process_bio from dm_wq_work.
+        * __split_and_process_bio from dm_request and quiesce the thread
+        * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
+        * flush_workqueue(md->wq).
         */
        down_write(&md->io_lock);
        set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
-       set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
        up_write(&md->io_lock);
  
        /*
-        * Request-based dm uses md->wq for barrier (dm_rq_barrier_work) which
-        * can be kicked until md->queue is stopped.  So stop md->queue before
-        * flushing md->wq.
+        * Stop md->queue before flushing md->wq in case request-based
+        * dm defers requests to md->wq from md->queue.
         */
        if (dm_request_based(md))
                stop_queue(md->queue);
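The dm core replaces its barrier state machine with two-step flush handling: an empty preflush is cloned to every target first, and when it completes a flush-with-data bio is requeued with REQ_FLUSH cleared so the data proceeds as ordinary I/O. A condensed sketch of that completion decision, with stand-in types and queueing:

/*
 * When the empty preflush for a flush-with-data bio finishes, the bio
 * is requeued with the FLUSH bit cleared; normal I/O and empty flushes
 * complete directly.
 */
#include <stdio.h>

#define X_REQ_FLUSH (1u << 0)

struct bio { unsigned int rw; unsigned int size; };

static void queue_io(struct bio *bio)  { printf("requeue, rw=%#x\n", bio->rw); }
static void bio_done(struct bio *bio)  { (void)bio; puts("complete to caller"); }

static void flush_step_done(struct bio *bio)
{
	if ((bio->rw & X_REQ_FLUSH) && bio->size) {
		/* preflush finished: reissue the data part without FLUSH */
		bio->rw &= ~X_REQ_FLUSH;
		queue_io(bio);
	} else {
		/* ordinary I/O or an empty flush: done */
		bio_done(bio);
	}
}

int main(void)
{
	struct bio b = { .rw = X_REQ_FLUSH, .size = 4096 };

	flush_step_done(&b);	/* first pass: requeued without FLUSH */
	flush_step_done(&b);	/* second pass: completes */
	return 0;
}
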
diff --combined drivers/md/md.c
index dbf822df942a70e7fb701d80936800dca40c2cbf,ed075d19db376493b16d415dc10568d68b4a85d8..225815197a3d69fba134433ab269d0c9a255681b
@@@ -36,7 -36,7 +36,7 @@@
  #include <linux/blkdev.h>
  #include <linux/sysctl.h>
  #include <linux/seq_file.h>
 -#include <linux/smp_lock.h>
 +#include <linux/mutex.h>
  #include <linux/buffer_head.h> /* for invalidate_bdev */
  #include <linux/poll.h>
  #include <linux/ctype.h>
@@@ -57,7 -57,6 +57,7 @@@
  #define DEBUG 0
  #define dprintk(x...) ((void)(DEBUG && printk(x)))
  
 +static DEFINE_MUTEX(md_mutex);
  
  #ifndef MODULE
  static void autostart_arrays(int part);
@@@ -227,12 -226,12 +227,12 @@@ static int md_make_request(struct reque
                return 0;
        }
        rcu_read_lock();
-       if (mddev->suspended || mddev->barrier) {
+       if (mddev->suspended) {
                DEFINE_WAIT(__wait);
                for (;;) {
                        prepare_to_wait(&mddev->sb_wait, &__wait,
                                        TASK_UNINTERRUPTIBLE);
-                       if (!mddev->suspended && !mddev->barrier)
+                       if (!mddev->suspended)
                                break;
                        rcu_read_unlock();
                        schedule();
@@@ -283,40 -282,29 +283,29 @@@ EXPORT_SYMBOL_GPL(mddev_resume)
  
  int mddev_congested(mddev_t *mddev, int bits)
  {
-       if (mddev->barrier)
-               return 1;
        return mddev->suspended;
  }
  EXPORT_SYMBOL(mddev_congested);
  
  /*
-  * Generic barrier handling for md
+  * Generic flush handling for md
   */
  
- #define POST_REQUEST_BARRIER ((void*)1)
- static void md_end_barrier(struct bio *bio, int err)
+ static void md_end_flush(struct bio *bio, int err)
  {
        mdk_rdev_t *rdev = bio->bi_private;
        mddev_t *mddev = rdev->mddev;
-       if (err == -EOPNOTSUPP && mddev->barrier != POST_REQUEST_BARRIER)
-               set_bit(BIO_EOPNOTSUPP, &mddev->barrier->bi_flags);
  
        rdev_dec_pending(rdev, mddev);
  
        if (atomic_dec_and_test(&mddev->flush_pending)) {
-               if (mddev->barrier == POST_REQUEST_BARRIER) {
-                       /* This was a post-request barrier */
-                       mddev->barrier = NULL;
-                       wake_up(&mddev->sb_wait);
-               } else
-                       /* The pre-request barrier has finished */
-                       schedule_work(&mddev->barrier_work);
+               /* The pre-request flush has finished */
+               schedule_work(&mddev->flush_work);
        }
        bio_put(bio);
  }
  
- static void submit_barriers(mddev_t *mddev)
+ static void submit_flushes(mddev_t *mddev)
  {
        mdk_rdev_t *rdev;
  
                        atomic_inc(&rdev->nr_pending);
                        rcu_read_unlock();
                        bi = bio_alloc(GFP_KERNEL, 0);
-                       bi->bi_end_io = md_end_barrier;
+                       bi->bi_end_io = md_end_flush;
                        bi->bi_private = rdev;
                        bi->bi_bdev = rdev->bdev;
                        atomic_inc(&mddev->flush_pending);
-                       submit_bio(WRITE_BARRIER, bi);
+                       submit_bio(WRITE_FLUSH, bi);
                        rcu_read_lock();
                        rdev_dec_pending(rdev, mddev);
                }
        rcu_read_unlock();
  }
  
- static void md_submit_barrier(struct work_struct *ws)
+ static void md_submit_flush_data(struct work_struct *ws)
  {
-       mddev_t *mddev = container_of(ws, mddev_t, barrier_work);
-       struct bio *bio = mddev->barrier;
+       mddev_t *mddev = container_of(ws, mddev_t, flush_work);
+       struct bio *bio = mddev->flush_bio;
  
        atomic_set(&mddev->flush_pending, 1);
  
-       if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
-               bio_endio(bio, -EOPNOTSUPP);
-       else if (bio->bi_size == 0)
+       if (bio->bi_size == 0)
                /* an empty barrier - all done */
                bio_endio(bio, 0);
        else {
-               bio->bi_rw &= ~REQ_HARDBARRIER;
+               bio->bi_rw &= ~REQ_FLUSH;
                if (mddev->pers->make_request(mddev, bio))
                        generic_make_request(bio);
-               mddev->barrier = POST_REQUEST_BARRIER;
-               submit_barriers(mddev);
        }
        if (atomic_dec_and_test(&mddev->flush_pending)) {
-               mddev->barrier = NULL;
+               mddev->flush_bio = NULL;
                wake_up(&mddev->sb_wait);
        }
  }
  
- void md_barrier_request(mddev_t *mddev, struct bio *bio)
+ void md_flush_request(mddev_t *mddev, struct bio *bio)
  {
        spin_lock_irq(&mddev->write_lock);
        wait_event_lock_irq(mddev->sb_wait,
-                           !mddev->barrier,
+                           !mddev->flush_bio,
                            mddev->write_lock, /*nothing*/);
-       mddev->barrier = bio;
+       mddev->flush_bio = bio;
        spin_unlock_irq(&mddev->write_lock);
  
        atomic_set(&mddev->flush_pending, 1);
-       INIT_WORK(&mddev->barrier_work, md_submit_barrier);
+       INIT_WORK(&mddev->flush_work, md_submit_flush_data);
  
-       submit_barriers(mddev);
+       submit_flushes(mddev);
  
        if (atomic_dec_and_test(&mddev->flush_pending))
-               schedule_work(&mddev->barrier_work);
+               schedule_work(&mddev->flush_work);
  }
- EXPORT_SYMBOL(md_barrier_request);
+ EXPORT_SYMBOL(md_flush_request);
  
  /* Support for plugging.
   * This mirrors the plugging support in request_queue, but does not
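Note: with the barrier path gone, a personality is expected to hand any bio carrying REQ_FLUSH to md_flush_request() and let the helper above drive the per-device flushes. A minimal sketch of that hand-off, assuming a placeholder make_request function (the raid personalities converted in this series follow the same pattern):

static int example_make_request(mddev_t *mddev, struct bio *bio)
{
	/* Empty and data-carrying flush bios are both handed to md.c;
	 * md_flush_request() strips REQ_FLUSH and resubmits the data part
	 * once the per-device flushes have completed. */
	if (unlikely(bio->bi_rw & REQ_FLUSH)) {
		md_flush_request(mddev, bio);
		return 0;
	}
	/* ... regular read/write handling would go here ... */
	return 0;
}
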
@@@ -697,31 -681,6 +682,6 @@@ static void super_written(struct bio *b
        bio_put(bio);
  }
  
- static void super_written_barrier(struct bio *bio, int error)
- {
-       struct bio *bio2 = bio->bi_private;
-       mdk_rdev_t *rdev = bio2->bi_private;
-       mddev_t *mddev = rdev->mddev;
-       if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
-           error == -EOPNOTSUPP) {
-               unsigned long flags;
-               /* barriers don't appear to be supported :-( */
-               set_bit(BarriersNotsupp, &rdev->flags);
-               mddev->barriers_work = 0;
-               spin_lock_irqsave(&mddev->write_lock, flags);
-               bio2->bi_next = mddev->biolist;
-               mddev->biolist = bio2;
-               spin_unlock_irqrestore(&mddev->write_lock, flags);
-               wake_up(&mddev->sb_wait);
-               bio_put(bio);
-       } else {
-               bio_put(bio2);
-               bio->bi_private = rdev;
-               super_written(bio, error);
-       }
- }
  void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
                   sector_t sector, int size, struct page *page)
  {
         * and decrement it on completion, waking up sb_wait
         * if zero is reached.
         * If an error occurred, call md_error
-        *
-        * As we might need to resubmit the request if REQ_HARDBARRIER
-        * causes ENOTSUPP, we allocate a spare bio...
         */
        struct bio *bio = bio_alloc(GFP_NOIO, 1);
-       int rw = REQ_WRITE | REQ_SYNC | REQ_UNPLUG;
  
        bio->bi_bdev = rdev->bdev;
        bio->bi_sector = sector;
        bio_add_page(bio, page, size, 0);
        bio->bi_private = rdev;
        bio->bi_end_io = super_written;
-       bio->bi_rw = rw;
  
        atomic_inc(&mddev->pending_writes);
-       if (!test_bit(BarriersNotsupp, &rdev->flags)) {
-               struct bio *rbio;
-               rw |= REQ_HARDBARRIER;
-               rbio = bio_clone(bio, GFP_NOIO);
-               rbio->bi_private = bio;
-               rbio->bi_end_io = super_written_barrier;
-               submit_bio(rw, rbio);
-       } else
-               submit_bio(rw, bio);
+       submit_bio(REQ_WRITE | REQ_SYNC | REQ_UNPLUG | REQ_FLUSH | REQ_FUA,
+                  bio);
  }
  
  void md_super_wait(mddev_t *mddev)
  {
-       /* wait for all superblock writes that were scheduled to complete.
-        * if any had to be retried (due to BARRIER problems), retry them
-        */
+       /* wait for all superblock writes that were scheduled to complete */
        DEFINE_WAIT(wq);
        for(;;) {
                prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
                if (atomic_read(&mddev->pending_writes)==0)
                        break;
-               while (mddev->biolist) {
-                       struct bio *bio;
-                       spin_lock_irq(&mddev->write_lock);
-                       bio = mddev->biolist;
-                       mddev->biolist = bio->bi_next ;
-                       bio->bi_next = NULL;
-                       spin_unlock_irq(&mddev->write_lock);
-                       submit_bio(bio->bi_rw, bio);
-               }
                schedule();
        }
        finish_wait(&mddev->sb_wait, &wq);
@@@ -1071,7 -1007,6 +1008,6 @@@ static int super_90_validate(mddev_t *m
        clear_bit(Faulty, &rdev->flags);
        clear_bit(In_sync, &rdev->flags);
        clear_bit(WriteMostly, &rdev->flags);
-       clear_bit(BarriersNotsupp, &rdev->flags);
  
        if (mddev->raid_disks == 0) {
                mddev->major_version = 0;
@@@ -1486,7 -1421,6 +1422,6 @@@ static int super_1_validate(mddev_t *md
        clear_bit(Faulty, &rdev->flags);
        clear_bit(In_sync, &rdev->flags);
        clear_bit(WriteMostly, &rdev->flags);
-       clear_bit(BarriersNotsupp, &rdev->flags);
  
        if (mddev->raid_disks == 0) {
                mddev->major_version = 1;
@@@ -4505,7 -4439,6 +4440,6 @@@ int md_run(mddev_t *mddev
        /* may be over-ridden by personality */
        mddev->resync_max_sectors = mddev->dev_sectors;
  
-       mddev->barriers_work = 1;
        mddev->ok_start_degraded = start_dirty_degraded;
  
        if (start_readonly && mddev->ro == 0)
@@@ -4684,7 -4617,6 +4618,6 @@@ static void md_clean(mddev_t *mddev
        mddev->recovery = 0;
        mddev->in_sync = 0;
        mddev->degraded = 0;
-       mddev->barriers_work = 0;
        mddev->safemode = 0;
        mddev->bitmap_info.offset = 0;
        mddev->bitmap_info.default_offset = 0;
@@@ -5952,7 -5884,7 +5885,7 @@@ static int md_open(struct block_device 
        mddev_t *mddev = mddev_find(bdev->bd_dev);
        int err;
  
 -      lock_kernel();
 +      mutex_lock(&md_mutex);
        if (mddev->gendisk != bdev->bd_disk) {
                /* we are racing with mddev_put which is discarding this
                 * bd_disk.
                /* Wait until bdev->bd_disk is definitely gone */
                flush_scheduled_work();
                /* Then retry the open from the top */
 -              unlock_kernel();
 +              mutex_unlock(&md_mutex);
                return -ERESTARTSYS;
        }
        BUG_ON(mddev != bdev->bd_disk->private_data);
  
        check_disk_size_change(mddev->gendisk, bdev);
   out:
 -      unlock_kernel();
 +      mutex_unlock(&md_mutex);
        return err;
  }
  
@@@ -5984,10 -5916,10 +5917,10 @@@ static int md_release(struct gendisk *d
        mddev_t *mddev = disk->private_data;
  
        BUG_ON(!mddev);
 -      lock_kernel();
 +      mutex_lock(&md_mutex);
        atomic_dec(&mddev->openers);
        mddev_put(mddev);
 -      unlock_kernel();
 +      mutex_unlock(&md_mutex);
  
        return 0;
  }
diff --combined drivers/s390/block/dasd.c
index 38e6fa9a2012fc40cdb25ab3a0089e916d9e5c8d,9b106d83b0cddd20095135d6062fbc824f0ff2cd..aa95f1001761534d187eb06ceab7597faa24f51d
@@@ -21,6 -21,7 +21,6 @@@
  #include <linux/hdreg.h>
  #include <linux/async.h>
  #include <linux/mutex.h>
 -#include <linux/smp_lock.h>
  
  #include <asm/ccwdev.h>
  #include <asm/ebcdic.h>
@@@ -2196,7 -2197,6 +2196,6 @@@ static void dasd_setup_queue(struct das
         */
        blk_queue_max_segment_size(block->request_queue, PAGE_SIZE);
        blk_queue_segment_boundary(block->request_queue, PAGE_SIZE - 1);
-       blk_queue_ordered(block->request_queue, QUEUE_ORDERED_DRAIN);
  }
  
  /*
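Note: dasd can simply drop the blk_queue_ordered(..., QUEUE_ORDERED_DRAIN) call because drain-only ordering implied no volatile write cache, which is exactly what the new flush machinery assumes by default. A driver whose device does have a write-back cache would instead advertise it with blk_queue_flush(), declared in the include/linux/blkdev.h hunk further down this page; a minimal sketch with a placeholder setup helper:

static void example_setup_cache_flush(struct request_queue *q)
{
	/* Device has a volatile write-back cache and honours FUA writes.
	 * Passing only REQ_FLUSH would make the block layer emulate FUA
	 * with a post-flush instead. */
	blk_queue_flush(q, REQ_FLUSH | REQ_FUA);
}
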
@@@ -2235,6 -2235,7 +2234,6 @@@ static int dasd_open(struct block_devic
        if (!block)
                return -ENODEV;
  
 -      lock_kernel();
        base = block->base;
        atomic_inc(&block->open_count);
        if (test_bit(DASD_FLAG_OFFLINE, &base->flags)) {
                goto out;
        }
  
 -      unlock_kernel();
        return 0;
  
  out:
        module_put(base->discipline->owner);
  unlock:
        atomic_dec(&block->open_count);
 -      unlock_kernel();
        return rc;
  }
  
@@@ -2282,8 -2285,10 +2281,8 @@@ static int dasd_release(struct gendisk 
  {
        struct dasd_block *block = disk->private_data;
  
 -      lock_kernel();
        atomic_dec(&block->open_count);
        module_put(block->base->discipline->owner);
 -      unlock_kernel();
        return 0;
  }
  
diff --combined fs/gfs2/rgrp.c
index fb67f593f40856b213f03887642c44a3e4c7ccc5,38b3ea1abaccd6ba9afe45720198d4e156171781..bef3ab6cf5c1aeb2d0f28d7955e4e62c8cb34be2
@@@ -500,7 -500,7 +500,7 @@@ u64 gfs2_ri_total(struct gfs2_sbd *sdp
        for (rgrps = 0;; rgrps++) {
                loff_t pos = rgrps * sizeof(struct gfs2_rindex);
  
 -              if (pos + sizeof(struct gfs2_rindex) >= ip->i_disksize)
 +              if (pos + sizeof(struct gfs2_rindex) >= i_size_read(inode))
                        break;
                error = gfs2_internal_read(ip, &ra_state, buf, &pos,
                                           sizeof(struct gfs2_rindex));
@@@ -588,9 -588,7 +588,9 @@@ static int gfs2_ri_update(struct gfs2_i
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
        struct inode *inode = &ip->i_inode;
        struct file_ra_state ra_state;
 -      u64 rgrp_count = ip->i_disksize;
 +      u64 rgrp_count = i_size_read(inode);
 +      struct gfs2_rgrpd *rgd;
 +      unsigned int max_data = 0;
        int error;
  
        do_div(rgrp_count, sizeof(struct gfs2_rindex));
                }
        }
  
 +      list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
 +              if (rgd->rd_data > max_data)
 +                      max_data = rgd->rd_data;
 +      sdp->sd_max_rg_data = max_data;
        sdp->sd_rindex_uptodate = 1;
        return 0;
  }
@@@ -628,15 -622,13 +628,15 @@@ static int gfs2_ri_update_special(struc
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
        struct inode *inode = &ip->i_inode;
        struct file_ra_state ra_state;
 +      struct gfs2_rgrpd *rgd;
 +      unsigned int max_data = 0;
        int error;
  
        file_ra_state_init(&ra_state, inode->i_mapping);
        for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
                /* Ignore partials */
                if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) >
 -                  ip->i_disksize)
 +                  i_size_read(inode))
                        break;
                error = read_rindex_entry(ip, &ra_state);
                if (error) {
                        return error;
                }
        }
 +      list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
 +              if (rgd->rd_data > max_data)
 +                      max_data = rgd->rd_data;
 +      sdp->sd_max_rg_data = max_data;
  
        sdp->sd_rindex_uptodate = 1;
        return 0;
@@@ -866,8 -854,7 +866,7 @@@ static void gfs2_rgrp_send_discards(str
                                if ((start + nr_sects) != blk) {
                                        rv = blkdev_issue_discard(bdev, start,
                                                            nr_sects, GFP_NOFS,
-                                                           BLKDEV_IFL_WAIT |
-                                                           BLKDEV_IFL_BARRIER);
+                                                           0);
                                        if (rv)
                                                goto fail;
                                        nr_sects = 0;
@@@ -881,8 -868,7 +880,7 @@@ start_new_extent
                }
        }
        if (nr_sects) {
-               rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS,
-                                        BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
+               rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, 0);
                if (rv)
                        goto fail;
        }
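Note: with BLKDEV_IFL_WAIT and BLKDEV_IFL_BARRIER removed, blkdev_issue_discard() always waits for completion and the remaining flags argument only selects a secure discard (BLKDEV_DISCARD_SECURE, defined in the blkdev.h hunk below). A minimal sketch with a placeholder helper name:

static int example_secure_discard(struct block_device *bdev, sector_t start,
				  sector_t nr_sects)
{
	/* 0, as gfs2 passes above, is a plain discard; either way the call
	 * only returns once the discard has completed. */
	return blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS,
				    BLKDEV_DISCARD_SECURE);
}
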
@@@ -1200,8 -1186,7 +1198,8 @@@ out
   * Returns: errno
   */
  
 -int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
 +int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
 +                         char *file, unsigned int line)
  {
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
        struct gfs2_alloc *al = ip->i_alloc;
                return -EINVAL;
  
  try_again:
 -      /* We need to hold the rindex unless the inode we're using is
 -         the rindex itself, in which case it's already held. */
 -      if (ip != GFS2_I(sdp->sd_rindex))
 -              error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
 -      else if (!sdp->sd_rgrps) /* We may not have the rindex read in, so: */
 -              error = gfs2_ri_update_special(ip);
 +      if (hold_rindex) {
 +              /* We need to hold the rindex unless the inode we're using is
 +                 the rindex itself, in which case it's already held. */
 +              if (ip != GFS2_I(sdp->sd_rindex))
 +                      error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
 +              else if (!sdp->sd_rgrps) /* We may not have the rindex read
 +                                          in, so: */
 +                      error = gfs2_ri_update_special(ip);
 +      }
  
        if (error)
                return error;
           try to free it, and try the allocation again. */
        error = get_local_rgrp(ip, &unlinked, &last_unlinked);
        if (error) {
 -              if (ip != GFS2_I(sdp->sd_rindex))
 +              if (hold_rindex && ip != GFS2_I(sdp->sd_rindex))
                        gfs2_glock_dq_uninit(&al->al_ri_gh);
                if (error != -EAGAIN)
                        return error;
@@@ -1273,7 -1255,7 +1271,7 @@@ void gfs2_inplace_release(struct gfs2_i
        al->al_rgd = NULL;
        if (al->al_rgd_gh.gh_gl)
                gfs2_glock_dq_uninit(&al->al_rgd_gh);
 -      if (ip != GFS2_I(sdp->sd_rindex))
 +      if (ip != GFS2_I(sdp->sd_rindex) && al->al_ri_gh.gh_gl)
                gfs2_glock_dq_uninit(&al->al_ri_gh);
  }
  
@@@ -1512,19 -1494,11 +1510,19 @@@ int gfs2_alloc_block(struct gfs2_inode 
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
        struct buffer_head *dibh;
        struct gfs2_alloc *al = ip->i_alloc;
 -      struct gfs2_rgrpd *rgd = al->al_rgd;
 +      struct gfs2_rgrpd *rgd;
        u32 goal, blk;
        u64 block;
        int error;
  
 +      /* Only happens if there is a bug in gfs2, return something distinctive
 +       * to ensure that it is noticed.
 +       */
 +      if (al == NULL)
 +              return -ECANCELED;
 +
 +      rgd = al->al_rgd;
 +
        if (rgrp_contains_block(rgd, ip->i_goal))
                goal = ip->i_goal - rgd->rd_data0;
        else
diff --combined fs/jbd/commit.c
index 3f030e9efea6abfc5a835361b935295b30b5ef0a,484c5e5fa8af50c0440452de0f8ad23d3dfddbc6..85a6883c0aca265b898431eb2a2254632a8a8ef7
@@@ -137,34 -137,10 +137,10 @@@ static int journal_write_commit_record(
        JBUFFER_TRACE(descriptor, "write commit block");
        set_buffer_dirty(bh);
  
-       if (journal->j_flags & JFS_BARRIER) {
-               ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_BARRIER);
-               /*
-                * Is it possible for another commit to fail at roughly
-                * the same time as this one?  If so, we don't want to
-                * trust the barrier flag in the super, but instead want
-                * to remember if we sent a barrier request
-                */
-               if (ret == -EOPNOTSUPP) {
-                       char b[BDEVNAME_SIZE];
-                       printk(KERN_WARNING
-                               "JBD: barrier-based sync failed on %s - "
-                               "disabling barriers\n",
-                               bdevname(journal->j_dev, b));
-                       spin_lock(&journal->j_state_lock);
-                       journal->j_flags &= ~JFS_BARRIER;
-                       spin_unlock(&journal->j_state_lock);
-                       /* And try again, without the barrier */
-                       set_buffer_uptodate(bh);
-                       set_buffer_dirty(bh);
-                       ret = sync_dirty_buffer(bh);
-               }
-       } else {
+       if (journal->j_flags & JFS_BARRIER)
+               ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_FLUSH_FUA);
+       else
                ret = sync_dirty_buffer(bh);
-       }
  
        put_bh(bh);             /* One for getblk() */
        journal_put_journal_head(descriptor);
@@@ -318,7 -294,7 +294,7 @@@ void journal_commit_transaction(journal
        int first_tag = 0;
        int tag_flag;
        int i;
 -      int write_op = WRITE;
 +      int write_op = WRITE_SYNC;
  
        /*
         * First job: lock down the current transaction and wait for
diff --combined fs/jbd2/commit.c
index 80910f51d4b447c8f7adc4d5eb26053f0e6ce75f,cb43c605cfaa247ec71e35c3d4ddf051b3a80963..bc6be8bda1cc067d3230acfbe20847b45906ee34
@@@ -134,25 -134,11 +134,11 @@@ static int journal_submit_commit_record
  
        if (journal->j_flags & JBD2_BARRIER &&
            !JBD2_HAS_INCOMPAT_FEATURE(journal,
-                                      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
-               ret = submit_bh(WRITE_SYNC_PLUG | WRITE_BARRIER, bh);
-               if (ret == -EOPNOTSUPP) {
-                       printk(KERN_WARNING
-                              "JBD2: Disabling barriers on %s, "
-                              "not supported by device\n", journal->j_devname);
-                       write_lock(&journal->j_state_lock);
-                       journal->j_flags &= ~JBD2_BARRIER;
-                       write_unlock(&journal->j_state_lock);
-                       /* And try again, without the barrier */
-                       lock_buffer(bh);
-                       set_buffer_uptodate(bh);
-                       clear_buffer_dirty(bh);
-                       ret = submit_bh(WRITE_SYNC_PLUG, bh);
-               }
-       } else {
+                                      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
+               ret = submit_bh(WRITE_SYNC_PLUG | WRITE_FLUSH_FUA, bh);
+       else
                ret = submit_bh(WRITE_SYNC_PLUG, bh);
-       }
        *cbh = bh;
        return ret;
  }
@@@ -166,29 -152,8 +152,8 @@@ static int journal_wait_on_commit_recor
  {
        int ret = 0;
  
- retry:
        clear_buffer_dirty(bh);
        wait_on_buffer(bh);
-       if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
-               printk(KERN_WARNING
-                      "JBD2: %s: disabling barries on %s - not supported "
-                      "by device\n", __func__, journal->j_devname);
-               write_lock(&journal->j_state_lock);
-               journal->j_flags &= ~JBD2_BARRIER;
-               write_unlock(&journal->j_state_lock);
-               lock_buffer(bh);
-               clear_buffer_dirty(bh);
-               set_buffer_uptodate(bh);
-               bh->b_end_io = journal_end_buffer_io_sync;
-               ret = submit_bh(WRITE_SYNC_PLUG, bh);
-               if (ret) {
-                       unlock_buffer(bh);
-                       return ret;
-               }
-               goto retry;
-       }
  
        if (unlikely(!buffer_uptodate(bh)))
                ret = -EIO;
@@@ -360,7 -325,7 +325,7 @@@ void jbd2_journal_commit_transaction(jo
        int tag_bytes = journal_tag_bytes(journal);
        struct buffer_head *cbh = NULL; /* For transactional checksums */
        __u32 crc32_sum = ~0;
 -      int write_op = WRITE;
 +      int write_op = WRITE_SYNC;
  
        /*
         * First job: lock down the current transaction and wait for
@@@ -701,6 -666,16 +666,16 @@@ start_journal_io
                }
        }
  
+       err = journal_finish_inode_data_buffers(journal, commit_transaction);
+       if (err) {
+               printk(KERN_WARNING
+                       "JBD2: Detected IO errors while flushing file data "
+                      "on %s\n", journal->j_devname);
+               if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
+                       jbd2_journal_abort(journal, err);
+               err = 0;
+       }
        /* 
         * If the journal is not located on the file system device,
         * then we must flush the file system device before we issue
        if (commit_transaction->t_flushed_data_blocks &&
            (journal->j_fs_dev != journal->j_dev) &&
            (journal->j_flags & JBD2_BARRIER))
-               blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
-                       BLKDEV_IFL_WAIT);
+               blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
  
        /* Done it all: now write the commit record asynchronously. */
        if (JBD2_HAS_INCOMPAT_FEATURE(journal,
                                                 &cbh, crc32_sum);
                if (err)
                        __jbd2_journal_abort_hard(journal);
-               if (journal->j_flags & JBD2_BARRIER)
-                       blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL,
-                               BLKDEV_IFL_WAIT);
-       }
-       err = journal_finish_inode_data_buffers(journal, commit_transaction);
-       if (err) {
-               printk(KERN_WARNING
-                       "JBD2: Detected IO errors while flushing file data "
-                      "on %s\n", journal->j_devname);
-               if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
-                       jbd2_journal_abort(journal, err);
-               err = 0;
        }
  
        /* Lo and behold: we have just managed to send a transaction to
@@@ -845,6 -806,11 +806,11 @@@ wait_for_iobuf
        }
        if (!err && !is_journal_aborted(journal))
                err = journal_wait_on_commit_record(journal, cbh);
+       if (JBD2_HAS_INCOMPAT_FEATURE(journal,
+                                     JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
+           journal->j_flags & JBD2_BARRIER) {
+               blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL);
+       }
  
        if (err)
                jbd2_journal_abort(journal, err);
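Note: the explicit cache flushes above use the slimmed-down blkdev_issue_flush(); the BLKDEV_IFL_WAIT flag is gone and the call now always waits. A minimal sketch of a filesystem flushing its data device, with a placeholder helper name:

static int example_flush_data_device(struct super_block *sb)
{
	/* Third argument is the optional error-sector pointer; the call
	 * returns only after the flush has completed. */
	return blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
}
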
diff --combined fs/nilfs2/super.c
index 9f4913f78408a80e5c9b1253c19ac092adecb2b2,faa5078ff751c83a511e497fc1f4653b90bfd81d..f3b75206e9560888489856361cb9b0067167267e
@@@ -45,6 -45,7 +45,6 @@@
  #include <linux/parser.h>
  #include <linux/random.h>
  #include <linux/crc32.h>
 -#include <linux/smp_lock.h>
  #include <linux/vfs.h>
  #include <linux/writeback.h>
  #include <linux/kobject.h>
@@@ -177,17 -178,9 +177,9 @@@ static int nilfs_sync_super(struct nilf
  
   retry:
        set_buffer_dirty(nilfs->ns_sbh[0]);
        if (nilfs_test_opt(sbi, BARRIER)) {
                err = __sync_dirty_buffer(nilfs->ns_sbh[0],
-                                         WRITE_SYNC | WRITE_BARRIER);
-               if (err == -EOPNOTSUPP) {
-                       nilfs_warning(sbi->s_super, __func__,
-                                     "barrier-based sync failed. "
-                                     "disabling barriers\n");
-                       nilfs_clear_opt(sbi, BARRIER);
-                       goto retry;
-               }
+                                         WRITE_SYNC | WRITE_FLUSH_FUA);
        } else {
                err = sync_dirty_buffer(nilfs->ns_sbh[0]);
        }
@@@ -341,6 -334,8 +333,6 @@@ static void nilfs_put_super(struct supe
        struct nilfs_sb_info *sbi = NILFS_SB(sb);
        struct the_nilfs *nilfs = sbi->s_nilfs;
  
 -      lock_kernel();
 -
        nilfs_detach_segment_constructor(sbi);
  
        if (!(sb->s_flags & MS_RDONLY)) {
        sbi->s_super = NULL;
        sb->s_fs_info = NULL;
        nilfs_put_sbinfo(sbi);
 -
 -      unlock_kernel();
  }
  
  static int nilfs_sync_fs(struct super_block *sb, int wait)
@@@ -944,6 -941,8 +936,6 @@@ static int nilfs_remount(struct super_b
        struct nilfs_mount_options old_opts;
        int was_snapshot, err;
  
 -      lock_kernel();
 -
        down_write(&nilfs->ns_super_sem);
        old_sb_flags = sb->s_flags;
        old_opts.mount_opt = sbi->s_mount_opt;
        }
   out:
        up_write(&nilfs->ns_super_sem);
 -      unlock_kernel();
        return 0;
  
   restore_opts:
        sbi->s_mount_opt = old_opts.mount_opt;
        sbi->s_snapshot_cno = old_opts.snapshot_cno;
        up_write(&nilfs->ns_super_sem);
 -      unlock_kernel();
        return err;
  }
  
@@@ -1196,6 -1197,7 +1188,6 @@@ nilfs_get_sb(struct file_system_type *f
        put_nilfs(nilfs);
   failed:
        close_bdev_exclusive(sd.bdev, mode);
 -
        return err;
  
   cancel_new:
diff --combined include/linux/blk_types.h
index d36629620a4fb9143c0e37b0b1a16302afda0efc,36edadf5b41a658b54b3d22bbc4437f44f08203a..0437ab6bb54c0b2265c39bb961e88eda09d38739
@@@ -97,7 -97,6 +97,7 @@@ struct bio 
  #define BIO_NULL_MAPPED 9     /* contains invalid user pages */
  #define BIO_FS_INTEGRITY 10   /* fs owns integrity data, not block layer */
  #define BIO_QUIET     11      /* Make BIO Quiet */
 +#define BIO_MAPPED_INTEGRITY 12/* integrity metadata has been remapped */
  #define bio_flagged(bio, flag)        ((bio)->bi_flags & (1 << (flag)))
  
  /*
@@@ -131,8 -130,6 +131,8 @@@ enum rq_flag_bits 
        /* bio only flags */
        __REQ_UNPLUG,           /* unplug the immediately after submission */
        __REQ_RAHEAD,           /* read ahead, can fail anytime */
 +      __REQ_THROTTLED,        /* This bio has already been subjected to
 +                               * throttling rules. Don't do it again. */
  
        /* request only flags */
        __REQ_SORTED,           /* elevator knows about this request */
        __REQ_FAILED,           /* set if the request failed */
        __REQ_QUIET,            /* don't worry about errors */
        __REQ_PREEMPT,          /* set for "ide_preempt" requests */
-       __REQ_ORDERED_COLOR,    /* is before or after barrier */
        __REQ_ALLOCED,          /* request came from our alloc pool */
        __REQ_COPY_USER,        /* contains copies of user pages */
 -      __REQ_INTEGRITY,        /* integrity metadata has been remapped */
        __REQ_FLUSH,            /* request for cache flush */
        __REQ_IO_STAT,          /* account I/O stat */
        __REQ_MIXED_MERGE,      /* merge of different types, fail separately */
        (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
  #define REQ_COMMON_MASK \
        (REQ_WRITE | REQ_FAILFAST_MASK | REQ_HARDBARRIER | REQ_SYNC | \
-        REQ_META| REQ_DISCARD | REQ_NOIDLE)
+        REQ_META | REQ_DISCARD | REQ_NOIDLE | REQ_FLUSH | REQ_FUA)
+ #define REQ_CLONE_MASK                REQ_COMMON_MASK
  
  #define REQ_UNPLUG            (1 << __REQ_UNPLUG)
  #define REQ_RAHEAD            (1 << __REQ_RAHEAD)
 +#define REQ_THROTTLED         (1 << __REQ_THROTTLED)
  
  #define REQ_SORTED            (1 << __REQ_SORTED)
  #define REQ_SOFTBARRIER               (1 << __REQ_SOFTBARRIER)
  #define REQ_FAILED            (1 << __REQ_FAILED)
  #define REQ_QUIET             (1 << __REQ_QUIET)
  #define REQ_PREEMPT           (1 << __REQ_PREEMPT)
- #define REQ_ORDERED_COLOR     (1 << __REQ_ORDERED_COLOR)
  #define REQ_ALLOCED           (1 << __REQ_ALLOCED)
  #define REQ_COPY_USER         (1 << __REQ_COPY_USER)
 -#define REQ_INTEGRITY         (1 << __REQ_INTEGRITY)
  #define REQ_FLUSH             (1 << __REQ_FLUSH)
  #define REQ_IO_STAT           (1 << __REQ_IO_STAT)
  #define REQ_MIXED_MERGE               (1 << __REQ_MIXED_MERGE)
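Note: REQ_FLUSH and REQ_FUA replace REQ_HARDBARRIER as the cache-control flags, and their addition to REQ_COMMON_MASK above means they survive the bio-to-request and clone paths. A request-based driver that advertised them via blk_queue_flush() can then test them directly; a minimal sketch in which the two driver hooks are hypothetical, not kernel API:

/* Hypothetical driver hooks, not kernel API: */
extern void example_issue_cache_flush(struct request *rq);
extern void example_set_fua_bit(struct request *rq);

static void example_prep_request(struct request *rq)
{
	if (rq->cmd_flags & REQ_FLUSH)
		example_issue_cache_flush(rq);	/* empty cache-flush request */
	else if (rq->cmd_flags & REQ_FUA)
		example_set_fua_bit(rq);	/* data must reach media before completion */
}
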
diff --combined include/linux/blkdev.h
index 16f7f1be1acf2d88569955dba40e11d9308cce4b,accbd0e5c89360fba02b2fb32001f2d7de372dba..009b80e49f5361bb119e346e184c796d698dfc42
@@@ -115,7 -115,6 +115,7 @@@ struct request 
        void *elevator_private3;
  
        struct gendisk *rq_disk;
 +      struct hd_struct *part;
        unsigned long start_time;
  #ifdef CONFIG_BLK_CGROUP
        unsigned long long start_time_ns;
         * physical address coalescing is performed.
         */
        unsigned short nr_phys_segments;
 +#if defined(CONFIG_BLK_DEV_INTEGRITY)
 +      unsigned short nr_integrity_segments;
 +#endif
  
        unsigned short ioprio;
  
@@@ -247,7 -243,6 +247,7 @@@ struct queue_limits 
  
        unsigned short          logical_block_size;
        unsigned short          max_segments;
 +      unsigned short          max_integrity_segments;
  
        unsigned char           misaligned;
        unsigned char           discard_misaligned;
@@@ -360,23 -355,20 +360,25 @@@ struct request_queu
        struct blk_trace        *blk_trace;
  #endif
        /*
-        * reserved for flush operations
+        * for flush operations
         */
-       unsigned int            ordered, next_ordered, ordseq;
-       int                     orderr, ordcolor;
-       struct request          pre_flush_rq, bar_rq, post_flush_rq;
-       struct request          *orig_bar_rq;
+       unsigned int            flush_flags;
+       unsigned int            flush_seq;
+       int                     flush_err;
+       struct request          flush_rq;
+       struct request          *orig_flush_rq;
+       struct list_head        pending_flushes;
  
        struct mutex            sysfs_lock;
  
  #if defined(CONFIG_BLK_DEV_BSG)
        struct bsg_class_device bsg_dev;
  #endif
 +
 +#ifdef CONFIG_BLK_DEV_THROTTLING
 +      /* Throttle data */
 +      struct throtl_data *td;
 +#endif
  };
  
  #define QUEUE_FLAG_CLUSTER    0       /* cluster several segments into 1 */
@@@ -472,56 -464,6 +474,6 @@@ static inline void queue_flag_clear(uns
        __clear_bit(flag, &q->queue_flags);
  }
  
- enum {
-       /*
-        * Hardbarrier is supported with one of the following methods.
-        *
-        * NONE         : hardbarrier unsupported
-        * DRAIN        : ordering by draining is enough
-        * DRAIN_FLUSH  : ordering by draining w/ pre and post flushes
-        * DRAIN_FUA    : ordering by draining w/ pre flush and FUA write
-        * TAG          : ordering by tag is enough
-        * TAG_FLUSH    : ordering by tag w/ pre and post flushes
-        * TAG_FUA      : ordering by tag w/ pre flush and FUA write
-        */
-       QUEUE_ORDERED_BY_DRAIN          = 0x01,
-       QUEUE_ORDERED_BY_TAG            = 0x02,
-       QUEUE_ORDERED_DO_PREFLUSH       = 0x10,
-       QUEUE_ORDERED_DO_BAR            = 0x20,
-       QUEUE_ORDERED_DO_POSTFLUSH      = 0x40,
-       QUEUE_ORDERED_DO_FUA            = 0x80,
-       QUEUE_ORDERED_NONE              = 0x00,
-       QUEUE_ORDERED_DRAIN             = QUEUE_ORDERED_BY_DRAIN |
-                                         QUEUE_ORDERED_DO_BAR,
-       QUEUE_ORDERED_DRAIN_FLUSH       = QUEUE_ORDERED_DRAIN |
-                                         QUEUE_ORDERED_DO_PREFLUSH |
-                                         QUEUE_ORDERED_DO_POSTFLUSH,
-       QUEUE_ORDERED_DRAIN_FUA         = QUEUE_ORDERED_DRAIN |
-                                         QUEUE_ORDERED_DO_PREFLUSH |
-                                         QUEUE_ORDERED_DO_FUA,
-       QUEUE_ORDERED_TAG               = QUEUE_ORDERED_BY_TAG |
-                                         QUEUE_ORDERED_DO_BAR,
-       QUEUE_ORDERED_TAG_FLUSH         = QUEUE_ORDERED_TAG |
-                                         QUEUE_ORDERED_DO_PREFLUSH |
-                                         QUEUE_ORDERED_DO_POSTFLUSH,
-       QUEUE_ORDERED_TAG_FUA           = QUEUE_ORDERED_TAG |
-                                         QUEUE_ORDERED_DO_PREFLUSH |
-                                         QUEUE_ORDERED_DO_FUA,
-       /*
-        * Ordered operation sequence
-        */
-       QUEUE_ORDSEQ_STARTED    = 0x01, /* flushing in progress */
-       QUEUE_ORDSEQ_DRAIN      = 0x02, /* waiting for the queue to be drained */
-       QUEUE_ORDSEQ_PREFLUSH   = 0x04, /* pre-flushing in progress */
-       QUEUE_ORDSEQ_BAR        = 0x08, /* original barrier req in progress */
-       QUEUE_ORDSEQ_POSTFLUSH  = 0x10, /* post-flushing in progress */
-       QUEUE_ORDSEQ_DONE       = 0x20,
- };
  #define blk_queue_plugged(q)  test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
  #define blk_queue_tagged(q)   test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
  #define blk_queue_stopped(q)  test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
  #define blk_queue_nonrot(q)   test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags)
  #define blk_queue_io_stat(q)  test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags)
  #define blk_queue_add_random(q)       test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags)
- #define blk_queue_flushing(q) ((q)->ordseq)
  #define blk_queue_stackable(q)        \
        test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags)
  #define blk_queue_discard(q)  test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags)
@@@ -602,7 -543,8 +553,8 @@@ static inline void blk_clear_queue_full
   * it already be started by driver.
   */
  #define RQ_NOMERGE_FLAGS      \
-       (REQ_NOMERGE | REQ_STARTED | REQ_HARDBARRIER | REQ_SOFTBARRIER)
+       (REQ_NOMERGE | REQ_STARTED | REQ_HARDBARRIER | REQ_SOFTBARRIER | \
+        REQ_FLUSH | REQ_FUA)
  #define rq_mergeable(rq)      \
        (!((rq)->cmd_flags & RQ_NOMERGE_FLAGS) && \
         (((rq)->cmd_flags & REQ_DISCARD) || \
@@@ -861,7 -803,7 +813,7 @@@ extern void blk_queue_max_segment_size(
  extern void blk_queue_max_discard_sectors(struct request_queue *q,
                unsigned int max_discard_sectors);
  extern void blk_queue_logical_block_size(struct request_queue *, unsigned short);
 -extern void blk_queue_physical_block_size(struct request_queue *, unsigned short);
 +extern void blk_queue_physical_block_size(struct request_queue *, unsigned int);
  extern void blk_queue_alignment_offset(struct request_queue *q,
                                       unsigned int alignment);
  extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min);
@@@ -891,12 -833,8 +843,8 @@@ extern void blk_queue_update_dma_alignm
  extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *);
  extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *);
  extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
+ extern void blk_queue_flush(struct request_queue *q, unsigned int flush);
  extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
- extern int blk_queue_ordered(struct request_queue *, unsigned);
- extern bool blk_do_ordered(struct request_queue *, struct request **);
- extern unsigned blk_ordered_cur_seq(struct request_queue *);
- extern unsigned blk_ordered_req_seq(struct request *);
- extern bool blk_ordered_complete_seq(struct request_queue *, unsigned, int);
  
  extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *);
  extern void blk_dump_rq_flags(struct request *, char *);
@@@ -929,27 -867,20 +877,20 @@@ static inline struct request *blk_map_q
                return NULL;
        return bqt->tag_index[tag];
  }
- enum{
-       BLKDEV_WAIT,    /* wait for completion */
-       BLKDEV_BARRIER, /* issue request with barrier */
-       BLKDEV_SECURE,  /* secure discard */
- };
- #define BLKDEV_IFL_WAIT               (1 << BLKDEV_WAIT)
- #define BLKDEV_IFL_BARRIER    (1 << BLKDEV_BARRIER)
- #define BLKDEV_IFL_SECURE     (1 << BLKDEV_SECURE)
- extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *,
-                       unsigned long);
+ #define BLKDEV_DISCARD_SECURE  0x01    /* secure discard */
+ extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *);
  extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
                sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
  extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
-                       sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
- static inline int sb_issue_discard(struct super_block *sb,
-                                  sector_t block, sector_t nr_blocks)
+                       sector_t nr_sects, gfp_t gfp_mask);
+ static inline int sb_issue_discard(struct super_block *sb, sector_t block,
+               sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags)
  {
-       block <<= (sb->s_blocksize_bits - 9);
-       nr_blocks <<= (sb->s_blocksize_bits - 9);
-       return blkdev_issue_discard(sb->s_bdev, block, nr_blocks, GFP_NOFS,
-                                  BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
+       return blkdev_issue_discard(sb->s_bdev, block << (sb->s_blocksize_bits - 9),
+                                   nr_blocks << (sb->s_blocksize_bits - 9),
+                                   gfp_mask, flags);
  }
  
  extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm);
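Note: sb_issue_discard() no longer hard-codes GFP_NOFS and the old wait/barrier flags; callers now choose both. A minimal sketch of the filesystem side, with a placeholder helper name:

static int example_discard_blocks(struct super_block *sb, sector_t block,
				  sector_t nr_blocks)
{
	/* Callers now pass the gfp mask and flags; 0 means a plain,
	 * synchronous discard. */
	return sb_issue_discard(sb, block, nr_blocks, GFP_NOFS, 0);
}
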
@@@ -1014,7 -945,7 +955,7 @@@ static inline unsigned int queue_physic
        return q->limits.physical_block_size;
  }
  
 -static inline int bdev_physical_block_size(struct block_device *bdev)
 +static inline unsigned int bdev_physical_block_size(struct block_device *bdev)
  {
        return queue_physical_block_size(bdev_get_queue(bdev));
  }
@@@ -1103,11 -1034,11 +1044,11 @@@ static inline int queue_dma_alignment(s
        return q ? q->dma_alignment : 511;
  }
  
 -static inline int blk_rq_aligned(struct request_queue *q, void *addr,
 +static inline int blk_rq_aligned(struct request_queue *q, unsigned long addr,
                                 unsigned int len)
  {
        unsigned int alignment = queue_dma_alignment(q) | q->dma_pad_mask;
 -      return !((unsigned long)addr & alignment) && !(len & alignment);
 +      return !(addr & alignment) && !(len & alignment);
  }
  
  /* assumes size > 256 */
@@@ -1137,7 -1068,6 +1078,7 @@@ static inline void put_dev_sector(Secto
  
  struct work_struct;
  int kblockd_schedule_work(struct request_queue *q, struct work_struct *work);
 +int kblockd_schedule_delayed_work(struct request_queue *q, struct delayed_work *dwork, unsigned long delay);
  
  #ifdef CONFIG_BLK_CGROUP
  /*
@@@ -1181,24 -1111,6 +1122,24 @@@ static inline uint64_t rq_io_start_time
  }
  #endif
  
 +#ifdef CONFIG_BLK_DEV_THROTTLING
 +extern int blk_throtl_init(struct request_queue *q);
 +extern void blk_throtl_exit(struct request_queue *q);
 +extern int blk_throtl_bio(struct request_queue *q, struct bio **bio);
 +extern void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay);
 +extern void throtl_shutdown_timer_wq(struct request_queue *q);
 +#else /* CONFIG_BLK_DEV_THROTTLING */
 +static inline int blk_throtl_bio(struct request_queue *q, struct bio **bio)
 +{
 +      return 0;
 +}
 +
 +static inline int blk_throtl_init(struct request_queue *q) { return 0; }
 +static inline int blk_throtl_exit(struct request_queue *q) { return 0; }
 +static inline void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay) {}
 +static inline void throtl_shutdown_timer_wq(struct request_queue *q) {}
 +#endif /* CONFIG_BLK_DEV_THROTTLING */
 +
  #define MODULE_ALIAS_BLOCKDEV(major,minor) \
        MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor))
  #define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \
@@@ -1242,13 -1154,8 +1183,13 @@@ struct blk_integrity 
  extern int blk_integrity_register(struct gendisk *, struct blk_integrity *);
  extern void blk_integrity_unregister(struct gendisk *);
  extern int blk_integrity_compare(struct gendisk *, struct gendisk *);
 -extern int blk_rq_map_integrity_sg(struct request *, struct scatterlist *);
 -extern int blk_rq_count_integrity_sg(struct request *);
 +extern int blk_rq_map_integrity_sg(struct request_queue *, struct bio *,
 +                                 struct scatterlist *);
 +extern int blk_rq_count_integrity_sg(struct request_queue *, struct bio *);
 +extern int blk_integrity_merge_rq(struct request_queue *, struct request *,
 +                                struct request *);
 +extern int blk_integrity_merge_bio(struct request_queue *, struct request *,
 +                                 struct bio *);
  
  static inline
  struct blk_integrity *bdev_get_integrity(struct block_device *bdev)
@@@ -1269,32 -1176,16 +1210,32 @@@ static inline int blk_integrity_rq(stru
        return bio_integrity(rq->bio);
  }
  
 +static inline void blk_queue_max_integrity_segments(struct request_queue *q,
 +                                                  unsigned int segs)
 +{
 +      q->limits.max_integrity_segments = segs;
 +}
 +
 +static inline unsigned short
 +queue_max_integrity_segments(struct request_queue *q)
 +{
 +      return q->limits.max_integrity_segments;
 +}
 +
  #else /* CONFIG_BLK_DEV_INTEGRITY */
  
  #define blk_integrity_rq(rq)                  (0)
 -#define blk_rq_count_integrity_sg(a)          (0)
 -#define blk_rq_map_integrity_sg(a, b)         (0)
 +#define blk_rq_count_integrity_sg(a, b)               (0)
 +#define blk_rq_map_integrity_sg(a, b, c)      (0)
  #define bdev_get_integrity(a)                 (0)
  #define blk_get_integrity(a)                  (0)
  #define blk_integrity_compare(a, b)           (0)
  #define blk_integrity_register(a, b)          (0)
  #define blk_integrity_unregister(a)           do { } while (0);
 +#define blk_queue_max_integrity_segments(a, b)        do { } while (0);
 +#define queue_max_integrity_segments(a)               (0)
 +#define blk_integrity_merge_rq(a, b, c)               (0)
 +#define blk_integrity_merge_bio(a, b, c)      (0)
  
  #endif /* CONFIG_BLK_DEV_INTEGRITY */
  
diff --combined include/linux/fs.h
index 0a81b87ea15813902fbb64108fa8931ffc80cce0,34a1cbcb56154670272e0c2753580cb4a2699ee7..4f34ff6e55585b365db2c419c878e46c67d6d9b0
@@@ -135,12 -135,12 +135,12 @@@ struct inodes_stat_t 
   *                    immediately after submission. The write equivalent
   *                    of READ_SYNC.
   * WRITE_ODIRECT_PLUG Special case write for O_DIRECT only.
-  * WRITE_BARRIER      Like WRITE_SYNC, but tells the block layer that all
-  *                    previously submitted writes must be safely on storage
-  *                    before this one is started. Also guarantees that when
-  *                    this write is complete, it itself is also safely on
-  *                    storage. Prevents reordering of writes on both sides
-  *                    of this IO.
+  * WRITE_FLUSH                Like WRITE_SYNC but with preceding cache flush.
+  * WRITE_FUA          Like WRITE_SYNC but data is guaranteed to be on
+  *                    non-volatile media on completion.
+  * WRITE_FLUSH_FUA    Combination of WRITE_FLUSH and FUA. The IO is preceded
+  *                    by a cache flush and data is guaranteed to be on
+  *                    non-volatile media on completion.
   *
   */
  #define RW_MASK                       REQ_WRITE
  #define WRITE_SYNC            (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG)
  #define WRITE_ODIRECT_PLUG    (WRITE | REQ_SYNC)
  #define WRITE_META            (WRITE | REQ_META)
- #define WRITE_BARRIER         (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
-                                REQ_HARDBARRIER)
- /*
-  * These aren't really reads or writes, they pass down information about
-  * parts of device that are now unused by the file system.
-  */
- #define DISCARD_NOBARRIER     (WRITE | REQ_DISCARD)
- #define DISCARD_BARRIER               (WRITE | REQ_DISCARD | REQ_HARDBARRIER)
- #define DISCARD_SECURE                (DISCARD_NOBARRIER | REQ_SECURE)
+ #define WRITE_FLUSH           (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
+                                REQ_FLUSH)
+ #define WRITE_FUA             (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
+                                REQ_FUA)
+ #define WRITE_FLUSH_FUA               (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
+                                REQ_FLUSH | REQ_FUA)
  
  #define SEL_IN                1
  #define SEL_OUT               2
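Note: the jbd, jbd2 and nilfs2 hunks above converge on the WRITE_FLUSH_FUA shorthand defined here: a single submission preceded by a cache flush and whose own data is forced to non-volatile media. A minimal sketch of the buffer-head variant those filesystems use, with a placeholder function name:

static int example_write_commit_block(struct buffer_head *bh)
{
	set_buffer_dirty(bh);
	/* The preceding flush orders earlier journal writes; FUA makes the
	 * commit block itself durable before the call returns. */
	return __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_FLUSH_FUA);
}
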
@@@ -1093,6 -1089,10 +1089,6 @@@ struct file_lock 
  
  #include <linux/fcntl.h>
  
 -/* temporary stubs for BKL removal */
 -#define lock_flocks() lock_kernel()
 -#define unlock_flocks() unlock_kernel()
 -
  extern void send_sigio(struct fown_struct *fown, int fd, int band);
  
  #ifdef CONFIG_FILE_LOCKING
@@@ -1131,8 -1131,6 +1127,8 @@@ extern int vfs_setlease(struct file *, 
  extern int lease_modify(struct file_lock **, int);
  extern int lock_may_read(struct inode *, loff_t start, unsigned long count);
  extern int lock_may_write(struct inode *, loff_t start, unsigned long count);
 +extern void lock_flocks(void);
 +extern void unlock_flocks(void);
  #else /* !CONFIG_FILE_LOCKING */
  static inline int fcntl_getlk(struct file *file, struct flock __user *user)
  {
@@@ -1275,14 -1273,6 +1271,14 @@@ static inline int lock_may_write(struc
        return 1;
  }
  
 +static inline void lock_flocks(void)
 +{
 +}
 +
 +static inline void unlock_flocks(void)
 +{
 +}
 +
  #endif /* !CONFIG_FILE_LOCKING */
  
  
@@@ -1390,7 -1380,7 +1386,7 @@@ struct super_block 
         * Saved mount options for lazy filesystems using
         * generic_show_options()
         */
 -      char *s_options;
 +      char __rcu *s_options;
  };
  
  extern struct timespec current_fs_time(struct super_block *sb);
@@@ -2384,8 -2374,6 +2380,8 @@@ extern ssize_t simple_write_to_buffer(v
  
  extern int generic_file_fsync(struct file *, int);
  
 +extern int generic_check_addressable(unsigned, u64);
 +
  #ifdef CONFIG_MIGRATION
  extern int buffer_migrate_page(struct address_space *,
                                struct page *, struct page *);
@@@ -2462,7 -2450,6 +2458,7 @@@ static const struct file_operations __f
        .release = simple_attr_release,                                 \
        .read    = simple_attr_read,                                    \
        .write   = simple_attr_write,                                   \
 +      .llseek  = generic_file_llseek,                                 \
  };
  
  static inline void __attribute__((format(printf, 1, 2)))