#
obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
- blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \
+ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o
obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o
+obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
return;
cpu = part_stat_lock();
- part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
- if (!new_io)
+ if (!new_io) {
+ part = rq->part;
part_stat_inc(cpu, part, merges[rw]);
- else {
+ } else {
+ part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
part_round_stats(cpu, part);
part_inc_in_flight(part, rw);
+ rq->part = part;
}
part_stat_unlock();
rq->ref_count = 1;
rq->start_time = jiffies;
set_start_time_ns(rq);
+ rq->part = NULL;
}
EXPORT_SYMBOL(blk_rq_init);
{
struct request_queue *q = rq->q;
- if (&q->bar_rq != rq) {
+ if (&q->flush_rq != rq) {
if (error)
clear_bit(BIO_UPTODATE, &bio->bi_flags);
else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
if (bio->bi_size == 0)
bio_endio(bio, error);
} else {
-
/*
- * Okay, this is the barrier request in progress, just
- * record the error;
+ * Okay, this is the sequenced flush request in
+ * progress, just record the error;
*/
- if (error && !q->orderr)
- q->orderr = error;
+ if (error && !q->flush_err)
+ q->flush_err = error;
}
}
del_timer_sync(&q->unplug_timer);
del_timer_sync(&q->timeout);
cancel_work_sync(&q->unplug_work);
+ throtl_shutdown_timer_wq(q);
}
EXPORT_SYMBOL(blk_sync_queue);
if (q->elevator)
elevator_exit(q->elevator);
+ blk_throtl_exit(q);
+
blk_put_queue(q);
}
EXPORT_SYMBOL(blk_cleanup_queue);
return NULL;
}
+ if (blk_throtl_init(q)) {
+ kmem_cache_free(blk_requestq_cachep, q);
+ return NULL;
+ }
+
setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
laptop_mode_timer_fn, (unsigned long) q);
init_timer(&q->unplug_timer);
setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
INIT_LIST_HEAD(&q->timeout_list);
+ INIT_LIST_HEAD(&q->pending_flushes);
INIT_WORK(&q->unplug_work, blk_unplug_work);
kobject_init(&q->kobj, &blk_queue_ktype);
rl->starved[is_sync] = 0;
priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
- if (priv)
+ if (priv) {
rl->elvpriv++;
- if (blk_queue_io_stat(q))
- rw_flags |= REQ_IO_STAT;
+ /*
+ * Don't do stats for non-priv requests
+ */
+ if (blk_queue_io_stat(q))
+ rw_flags |= REQ_IO_STAT;
+ }
+
spin_unlock_irq(q->queue_lock);
rq = blk_alloc_request(q, rw_flags, priv, gfp_mask);
}
EXPORT_SYMBOL(blk_insert_request);
- /*
- * add-request adds a request to the linked list.
- * queue lock is held and interrupts disabled, as we muck with the
- * request queue list.
- */
- static inline void add_request(struct request_queue *q, struct request *req)
- {
- drive_stat_acct(req, 1);
-
- /*
- * elevator indicated where it wants this request to be
- * inserted at elevator_merge time
- */
- __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0);
- }
-
static void part_round_stats_single(int cpu, struct hd_struct *part,
unsigned long now)
{
const bool sync = !!(bio->bi_rw & REQ_SYNC);
const bool unplug = !!(bio->bi_rw & REQ_UNPLUG);
const unsigned long ff = bio->bi_rw & REQ_FAILFAST_MASK;
+ int where = ELEVATOR_INSERT_SORT;
int rw_flags;
- if ((bio->bi_rw & REQ_HARDBARRIER) &&
- (q->next_ordered == QUEUE_ORDERED_NONE)) {
+ /* REQ_HARDBARRIER is no more */
+ if (WARN_ONCE(bio->bi_rw & REQ_HARDBARRIER,
+ "block: HARDBARRIER is deprecated, use FLUSH/FUA instead\n")) {
bio_endio(bio, -EOPNOTSUPP);
return 0;
}
+
/*
* low level driver can indicate that it wants pages above a
* certain limit bounced to low memory (ie for highmem, or even
spin_lock_irq(q->queue_lock);
- if (unlikely((bio->bi_rw & REQ_HARDBARRIER)) || elv_queue_empty(q))
+ if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
+ where = ELEVATOR_INSERT_FRONT;
+ goto get_rq;
+ }
+
+ if (elv_queue_empty(q))
goto get_rq;
el_ret = elv_merge(q, &req, bio);
req->cpu = blk_cpu_to_group(smp_processor_id());
if (queue_should_plug(q) && elv_queue_empty(q))
blk_plug_device(q);
- add_request(q, req);
+
+ /* insert the request into the elevator */
+ drive_stat_acct(req, 1);
+ __elv_add_request(q, req, where, 0);
out:
if (unplug || !queue_should_plug(q))
__generic_unplug_device(q);
if (bio_check_eod(bio, nr_sectors))
goto end_io;
+ /*
+ * Filter flush bios early so that make_request based
+ * drivers without flush support don't have to worry
+ * about them.
+ */
+ if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
+ bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
+ if (!nr_sectors) {
+ err = 0;
+ goto end_io;
+ }
+ }
+
if ((bio->bi_rw & REQ_DISCARD) &&
(!blk_queue_discard(q) ||
((bio->bi_rw & REQ_SECURE) &&
goto end_io;
}
+ blk_throtl_bio(q, &bio);
+
+ /*
+ * If bio = NULL, bio has been throttled and will be submitted
+ * later.
+ */
+ if (!bio)
+ break;
+
trace_block_bio_queue(q, bio);
ret = q->make_request_fn(q, bio);
if (unlikely(block_dump)) {
char b[BDEVNAME_SIZE];
- printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n",
+ printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
current->comm, task_pid_nr(current),
(rw & WRITE) ? "WRITE" : "READ",
(unsigned long long)bio->bi_sector,
- bdevname(bio->bi_bdev, b));
+ bdevname(bio->bi_bdev, b),
+ count);
}
}
int cpu;
cpu = part_stat_lock();
- part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
+ part = req->part;
part_stat_add(cpu, part, sectors[rw], bytes >> 9);
part_stat_unlock();
}
static void blk_account_io_done(struct request *req)
{
/*
- * Account IO completion. bar_rq isn't accounted as a normal
- * IO on queueing nor completion. Accounting the containing
- * request is enough.
+ * Account IO completion. flush_rq isn't accounted as a
+ * normal IO on queueing nor completion. Accounting the
+ * containing request is enough.
*/
- if (blk_do_io_stat(req) && req != &req->q->bar_rq) {
+ if (blk_do_io_stat(req) && req != &req->q->flush_rq) {
unsigned long duration = jiffies - req->start_time;
const int rw = rq_data_dir(req);
struct hd_struct *part;
int cpu;
cpu = part_stat_lock();
- part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
+ part = req->part;
part_stat_inc(cpu, part, ios[rw]);
part_stat_add(cpu, part, ticks[rw], duration);
static void __blk_rq_prep_clone(struct request *dst, struct request *src)
{
dst->cpu = src->cpu;
- dst->cmd_flags = (rq_data_dir(src) | REQ_NOMERGE);
- if (src->cmd_flags & REQ_DISCARD)
- dst->cmd_flags |= REQ_DISCARD;
+ dst->cmd_flags = (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE;
dst->cmd_type = src->cmd_type;
dst->__sector = blk_rq_pos(src);
dst->__data_len = blk_rq_bytes(src);
}
EXPORT_SYMBOL(kblockd_schedule_work);
+int kblockd_schedule_delayed_work(struct request_queue *q,
+ struct delayed_work *dwork, unsigned long delay)
+{
+ return queue_delayed_work(kblockd_workqueue, dwork, delay);
+}
+EXPORT_SYMBOL(kblockd_schedule_delayed_work);
+
int __init blk_dev_init(void)
{
BUILD_BUG_ON(__REQ_NR_BITS > 8 *
void blk_set_default_limits(struct queue_limits *lim)
{
lim->max_segments = BLK_MAX_SEGMENTS;
+ lim->max_integrity_segments = 0;
lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
lim->max_sectors = BLK_DEF_MAX_SECTORS;
*/
if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT))
dma = 1;
- q->limits.bounce_pfn = max_low_pfn;
+ q->limits.bounce_pfn = max(max_low_pfn, b_pfn);
#else
if (b_pfn < blk_max_low_pfn)
dma = 1;
* hardware can operate on without reverting to read-modify-write
* operations.
*/
-void blk_queue_physical_block_size(struct request_queue *q, unsigned short size)
+void blk_queue_physical_block_size(struct request_queue *q, unsigned int size)
{
q->limits.physical_block_size = size;
}
EXPORT_SYMBOL(blk_queue_io_opt);
-/*
- * Returns the minimum that is _not_ zero, unless both are zero.
- */
-#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
-
/**
* blk_queue_stack_limits - inherit underlying queue limits for stacked drivers
* @t: the stacking driver (top)
b->seg_boundary_mask);
t->max_segments = min_not_zero(t->max_segments, b->max_segments);
+ t->max_integrity_segments = min_not_zero(t->max_integrity_segments,
+ b->max_integrity_segments);
t->max_segment_size = min_not_zero(t->max_segment_size,
b->max_segment_size);
}
EXPORT_SYMBOL(blk_queue_update_dma_alignment);
+ /**
+ * blk_queue_flush - configure queue's cache flush capability
+ * @q: the request queue for the device
+ * @flush: 0, REQ_FLUSH or REQ_FLUSH | REQ_FUA
+ *
+ * Tell block layer cache flush capability of @q. If it supports
+ * flushing, REQ_FLUSH should be set. If it supports bypassing
+ * write cache for individual writes, REQ_FUA should be set.
+ *
+ * Any bit in @flush other than REQ_FLUSH and REQ_FUA trips
+ * WARN_ON_ONCE() and is masked out before being stored. Passing
+ * REQ_FUA without REQ_FLUSH also trips WARN_ON_ONCE(); REQ_FUA is
+ * then cleared, so @q->flush_flags ends up as 0 in that case.
+ */
+ void blk_queue_flush(struct request_queue *q, unsigned int flush)
+ {
+ WARN_ON_ONCE(flush & ~(REQ_FLUSH | REQ_FUA));
+
+ if (WARN_ON_ONCE(!(flush & REQ_FLUSH) && (flush & REQ_FUA)))
+ flush &= ~REQ_FUA;
+
+ q->flush_flags = flush & (REQ_FLUSH | REQ_FUA);
+ }
+ EXPORT_SYMBOL_GPL(blk_queue_flush);
+
static int __init blk_settings_init(void)
{
blk_max_low_pfn = max_low_pfn - 1;
*/
#define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash))
+ struct request *blk_do_flush(struct request_queue *q, struct request *rq);
+
static inline struct request *__elv_next_request(struct request_queue *q)
{
struct request *rq;
while (1) {
while (!list_empty(&q->queue_head)) {
rq = list_entry_rq(q->queue_head.next);
- if (blk_do_ordered(q, &rq))
+ if (!(rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) ||
+ rq == &q->flush_rq)
+ return rq;
+ rq = blk_do_flush(q, rq);
+ if (rq)
return rq;
}
int blk_dev_init(void);
-void elv_quiesce_start(struct request_queue *q);
-void elv_quiesce_end(struct request_queue *q);
-
-
/*
* Return the threshold (number of used requests) at which the queue is
* considered to be congested. It include a little hysteresis to keep the
return q->nr_congestion_off;
}
-#if defined(CONFIG_BLK_DEV_INTEGRITY)
-
-#define rq_for_each_integrity_segment(bvl, _rq, _iter) \
- __rq_for_each_bio(_iter.bio, _rq) \
- bip_for_each_vec(bvl, _iter.bio->bi_integrity, _iter.i)
-
-#endif /* BLK_DEV_INTEGRITY */
-
static inline int blk_cpu_to_group(int cpu)
{
int group = NR_CPUS;
/* all seems OK */
part = add_partition(disk, partno, start, length,
- ADDPART_FLAG_NONE);
+ ADDPART_FLAG_NONE, NULL);
mutex_unlock(&bdev->bd_mutex);
return IS_ERR(part) ? PTR_ERR(part) : 0;
case BLKPG_DEL_PARTITION:
static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
uint64_t len, int secure)
{
- unsigned long flags = BLKDEV_IFL_WAIT;
+ unsigned long flags = 0;
if (start & 511)
return -EINVAL;
if (start + len > (bdev->bd_inode->i_size >> 9))
return -EINVAL;
if (secure)
- flags |= BLKDEV_IFL_SECURE;
+ flags |= BLKDEV_DISCARD_SECURE;
return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, flags);
}
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/highmem.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <linux/radix-tree.h>
#include <linux/buffer_head.h> /* invalidate_bh_lrus() */
#include <linux/slab.h>
/*
* Look up and return a brd's page for a given sector.
*/
+static DEFINE_MUTEX(brd_mutex);
static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
{
pgoff_t idx;
* ram device BLKFLSBUF has special semantics, we want to actually
* release and destroy the ramdisk data.
*/
- lock_kernel();
+ mutex_lock(&brd_mutex);
mutex_lock(&bdev->bd_mutex);
error = -EBUSY;
if (bdev->bd_openers <= 1) {
error = 0;
}
mutex_unlock(&bdev->bd_mutex);
- unlock_kernel();
+ mutex_unlock(&brd_mutex);
return error;
}
if (!brd->brd_queue)
goto out_free_dev;
blk_queue_make_request(brd->brd_queue, brd_make_request);
- blk_queue_ordered(brd->brd_queue, QUEUE_ORDERED_TAG);
blk_queue_max_hw_sectors(brd->brd_queue, 1024);
blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY);
* NOTE that the payload starts at a long aligned offset,
* regardless of 32 or 64 bit arch!
*/
-struct p_header {
+struct p_header80 {
u32 magic;
u16 command;
u16 length; /* bytes of data after this header */
u8 payload[0];
} __packed;
-/* 8 bytes. packet FIXED for the next century! */
+
+/* Header for big packets, used for data packets exceeding 64kB */
+struct p_header95 {
+ u16 magic; /* use DRBD_MAGIC_BIG here */
+ u16 command;
+ u32 length; /* Use only 24 bits of that. Ignore the highest 8 bits. */
+ u8 payload[0];
+} __packed;
+
+union p_header {
+ struct p_header80 h80;
+ struct p_header95 h95;
+};
/*
* short commands, packets without payload, plain p_header:
*/
/* these defines must not be changed without changing the protocol version */
-#define DP_HARDBARRIER 1
-#define DP_RW_SYNC 2
+#define DP_HARDBARRIER 1 /* deprecated */
+#define DP_RW_SYNC 2 /* equals REQ_SYNC */
#define DP_MAY_SET_IN_SYNC 4
+#define DP_UNPLUG 8 /* equals REQ_UNPLUG */
+#define DP_FUA 16 /* equals REQ_FUA */
+#define DP_FLUSH 32 /* equals REQ_FLUSH */
+#define DP_DISCARD 64 /* equals REQ_DISCARD */
struct p_data {
- struct p_header head;
+ union p_header head;
u64 sector; /* 64 bits sector number */
u64 block_id; /* to identify the request in protocol B&C */
u32 seq_num;
* P_DATA_REQUEST, P_RS_DATA_REQUEST
*/
struct p_block_ack {
- struct p_header head;
+ struct p_header80 head;
u64 sector;
u64 block_id;
u32 blksize;
struct p_block_req {
- struct p_header head;
+ struct p_header80 head;
u64 sector;
u64 block_id;
u32 blksize;
*/
struct p_handshake {
- struct p_header head; /* 8 bytes */
+ struct p_header80 head; /* 8 bytes */
u32 protocol_min;
u32 feature_flags;
u32 protocol_max;
/* 80 bytes, FIXED for the next century */
struct p_barrier {
- struct p_header head;
+ struct p_header80 head;
u32 barrier; /* barrier number _handle_ only */
u32 pad; /* to multiple of 8 Byte */
} __packed;
struct p_barrier_ack {
- struct p_header head;
+ struct p_header80 head;
u32 barrier;
u32 set_size;
} __packed;
struct p_rs_param {
- struct p_header head;
+ struct p_header80 head;
u32 rate;
/* Since protocol version 88 and higher. */
} __packed;
struct p_rs_param_89 {
- struct p_header head;
+ struct p_header80 head;
u32 rate;
/* protocol version 89: */
char verify_alg[SHARED_SECRET_MAX];
char csums_alg[SHARED_SECRET_MAX];
} __packed;
+struct p_rs_param_95 {
+ struct p_header80 head;
+ u32 rate;
+ char verify_alg[SHARED_SECRET_MAX];
+ char csums_alg[SHARED_SECRET_MAX];
+ u32 c_plan_ahead;
+ u32 c_delay_target;
+ u32 c_fill_target;
+ u32 c_max_rate;
+} __packed;
+
enum drbd_conn_flags {
CF_WANT_LOSE = 1,
CF_DRY_RUN = 2,
};
struct p_protocol {
- struct p_header head;
+ struct p_header80 head;
u32 protocol;
u32 after_sb_0p;
u32 after_sb_1p;
} __packed;
struct p_uuids {
- struct p_header head;
+ struct p_header80 head;
u64 uuid[UI_EXTENDED_SIZE];
} __packed;
struct p_rs_uuid {
- struct p_header head;
+ struct p_header80 head;
u64 uuid;
} __packed;
struct p_sizes {
- struct p_header head;
+ struct p_header80 head;
u64 d_size; /* size of disk */
u64 u_size; /* user requested size */
u64 c_size; /* current exported size */
} __packed;
struct p_state {
- struct p_header head;
+ struct p_header80 head;
u32 state;
} __packed;
struct p_req_state {
- struct p_header head;
+ struct p_header80 head;
u32 mask;
u32 val;
} __packed;
struct p_req_state_reply {
- struct p_header head;
+ struct p_header80 head;
u32 retcode;
} __packed;
} __packed;
struct p_discard {
- struct p_header head;
+ struct p_header80 head;
u64 block_id;
u32 seq_num;
u32 pad;
};
struct p_compressed_bm {
- struct p_header head;
+ struct p_header80 head;
/* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code
* (encoding & 0x80): polarity (set/unset) of first runlength
* ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits
u8 code[0];
} __packed;
-struct p_delay_probe {
- struct p_header head;
- u32 seq_num; /* sequence number to match the two probe packets */
- u32 offset; /* usecs the probe got sent after the reference time point */
+struct p_delay_probe93 {
+ struct p_header80 head;
+ u32 seq_num; /* sequence number to match the two probe packets */
+ u32 offset; /* usecs the probe got sent after the reference time point */
} __packed;
/* DCBP: Drbd Compressed Bitmap Packet ... */
* so we need to use the fixed size 4KiB page size
* most architechtures have used for a long time.
*/
-#define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header))
+#define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header80))
#define BM_PACKET_WORDS (BM_PACKET_PAYLOAD_BYTES/sizeof(long))
#define BM_PACKET_VLI_BYTES_MAX (4096 - sizeof(struct p_compressed_bm))
#if (PAGE_SIZE < 4096)
#endif
union p_polymorph {
- struct p_header header;
+ union p_header header;
struct p_handshake handshake;
struct p_data data;
struct p_block_ack block_ack;
struct p_barrier barrier;
struct p_barrier_ack barrier_ack;
struct p_rs_param_89 rs_param_89;
+ struct p_rs_param_95 rs_param_95;
struct p_protocol protocol;
struct p_sizes sizes;
struct p_uuids uuids;
struct p_req_state req_state;
struct p_req_state_reply req_state_reply;
struct p_block_req block_req;
+ struct p_delay_probe93 delay_probe93;
+ struct p_rs_uuid rs_uuid;
} __packed;
/**********************************************************************/
struct list_head requests; /* requests before */
struct drbd_tl_epoch *next; /* pointer to the next barrier */
unsigned int br_number; /* the barriers identifier. */
- int n_req; /* number of requests attached before this barrier */
+ int n_writes; /* number of requests attached before this barrier */
};
struct drbd_request;
struct drbd_epoch_entry {
struct drbd_work w;
struct hlist_node colision;
- struct drbd_epoch *epoch;
+ struct drbd_epoch *epoch; /* for writes */
struct drbd_conf *mdev;
struct page *pages;
atomic_t pending_bios;
/* see comments on ee flag bits below */
unsigned long flags;
sector_t sector;
- u64 block_id;
+ union {
+ u64 block_id;
+ struct digest_info *digest;
+ };
};
/* ee flag bits.
* if any of those fail, we set this flag atomically
* from the endio callback */
__EE_WAS_ERROR,
+
+ /* This ee has a pointer to a digest instead of a block id */
+ __EE_HAS_DIGEST,
};
#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC)
#define EE_IS_BARRIER (1<<__EE_IS_BARRIER)
#define EE_RESUBMITTED (1<<__EE_RESUBMITTED)
#define EE_WAS_ERROR (1<<__EE_WAS_ERROR)
+#define EE_HAS_DIGEST (1<<__EE_HAS_DIGEST)
/* global flag bits */
enum {
SIGNAL_ASENDER, /* whether asender wants to be interrupted */
SEND_PING, /* whether asender should send a ping asap */
- STOP_SYNC_TIMER, /* tell timer to cancel itself */
UNPLUG_QUEUED, /* only relevant with kernel 2.4 */
UNPLUG_REMOTE, /* sending a "UnplugRemote" could help */
MD_DIRTY, /* current uuids and flags not yet on disk */
BITMAP_IO, /* suspend application io;
once no more io in flight, start bitmap io */
BITMAP_IO_QUEUED, /* Started bitmap IO */
+ GO_DISKLESS, /* Disk failed, local_cnt reached zero, we are going diskless */
RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */
NET_CONGESTED, /* The data socket is congested */
* the peer, if it changed there as well. */
CONN_DRY_RUN, /* Expect disconnect after resync handshake. */
GOT_PING_ACK, /* set when we receive a ping_ack packet, misc wait gets woken */
+ NEW_CUR_UUID, /* Create new current UUID when thawing IO */
+ AL_SUSPENDED, /* Activity logging is currently suspended. */
};
struct drbd_bitmap; /* opaque for drbd_conf */
/* THINK maybe we actually want to use the default "event/%s" worker threads
* or similar in linux 2.6, which uses per cpu data and threads.
- *
- * To be general, this might need a spin_lock member.
- * For now, please use the mdev->req_lock to protect list_head,
- * see drbd_queue_work below.
*/
struct drbd_work_queue {
struct list_head q;
WO_bio_barrier
};
+struct fifo_buffer {
+ int *values;
+ unsigned int head_index;
+ unsigned int size;
+};
+
struct drbd_conf {
/* things that are stored as / read from meta data on disk */
unsigned long flags;
unsigned int ko_count;
struct drbd_work resync_work,
unplug_work,
+ go_diskless,
md_sync_work;
struct timer_list resync_timer;
struct timer_list md_sync_timer;
+#ifdef DRBD_DEBUG_MD_SYNC
+ struct {
+ unsigned int line;
+ const char* func;
+ } last_md_mark_dirty;
+#endif
/* Used after attach while negotiating new disk state. */
union drbd_state new_state_tmp;
union drbd_state state;
wait_queue_head_t misc_wait;
wait_queue_head_t state_wait; /* upon each state change. */
+ wait_queue_head_t net_cnt_wait;
unsigned int send_cnt;
unsigned int recv_cnt;
unsigned int read_cnt;
unsigned long rs_start;
/* cumulated time in PausedSyncX state [unit jiffies] */
unsigned long rs_paused;
+ /* skipped because csum was equal [unit BM_BLOCK_SIZE] */
+ unsigned long rs_same_csum;
+#define DRBD_SYNC_MARKS 8
+#define DRBD_SYNC_MARK_STEP (3*HZ)
/* block not up-to-date at mark [unit BM_BLOCK_SIZE] */
- unsigned long rs_mark_left;
+ unsigned long rs_mark_left[DRBD_SYNC_MARKS];
/* marks's time [unit jiffies] */
- unsigned long rs_mark_time;
- /* skipped because csum was equeal [unit BM_BLOCK_SIZE] */
- unsigned long rs_same_csum;
+ unsigned long rs_mark_time[DRBD_SYNC_MARKS];
+ /* current index into rs_mark_{left,time} */
+ int rs_last_mark;
/* where does the admin want us to start? (sector) */
sector_t ov_start_sector;
spinlock_t epoch_lock;
unsigned int epochs;
enum write_ordering_e write_ordering;
- struct list_head active_ee; /* IO in progress */
- struct list_head sync_ee; /* IO in progress */
+ struct list_head active_ee; /* IO in progress (P_DATA gets written to disk) */
+ struct list_head sync_ee; /* IO in progress (P_RS_DATA_REPLY gets written to disk) */
struct list_head done_ee; /* send ack */
- struct list_head read_ee; /* IO in progress */
+ struct list_head read_ee; /* IO in progress (any read) */
struct list_head net_ee; /* zero-copy network send in progress */
struct hlist_head *ee_hash; /* is proteced by req_lock! */
unsigned int ee_hash_s;
int next_barrier_nr;
struct hlist_head *app_reads_hash; /* is proteced by req_lock */
struct list_head resync_reads;
- atomic_t pp_in_use;
+ atomic_t pp_in_use; /* allocated from page pool */
+ atomic_t pp_in_use_by_net; /* sendpage()d, still referenced by tcp */
wait_queue_head_t ee_wait;
struct page *md_io_page; /* one page buffer for md_io */
struct page *md_io_tmpp; /* for logical_block_size != 512 */
u64 ed_uuid; /* UUID of the exposed data */
struct mutex state_mutex;
char congestion_reason; /* Why we where congested... */
+ atomic_t rs_sect_in; /* for incoming resync data rate, SyncTarget */
+ atomic_t rs_sect_ev; /* for submitted resync data rate, both */
+ int rs_last_sect_ev; /* counter to compare with */
+ int rs_last_events; /* counter of read or write "events" (unit sectors)
+ * on the lower level device when we last looked. */
+ int c_sync_rate; /* current resync rate after syncer throttle magic */
+ struct fifo_buffer rs_plan_s; /* correction values of resync planner */
+ int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */
+ int rs_planed; /* resync sectors already planned */
};
static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
extern void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
unsigned int set_size);
extern void tl_clear(struct drbd_conf *mdev);
+enum drbd_req_event;
+extern void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what);
extern void _tl_add_barrier(struct drbd_conf *, struct drbd_tl_epoch *);
extern void drbd_free_sock(struct drbd_conf *mdev);
extern int drbd_send(struct drbd_conf *mdev, struct socket *sock,
extern int _drbd_send_state(struct drbd_conf *mdev);
extern int drbd_send_state(struct drbd_conf *mdev);
extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
- enum drbd_packets cmd, struct p_header *h,
+ enum drbd_packets cmd, struct p_header80 *h,
size_t size, unsigned msg_flags);
#define USE_DATA_SOCKET 1
#define USE_META_SOCKET 0
extern int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
- enum drbd_packets cmd, struct p_header *h,
+ enum drbd_packets cmd, struct p_header80 *h,
size_t size);
extern int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd,
char *data, size_t size);
extern int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
struct p_block_req *rp);
extern int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
- struct p_data *dp);
+ struct p_data *dp, int data_size);
extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
sector_t sector, int blksize, u64 block_id);
extern int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
extern void drbd_md_set_flag(struct drbd_conf *mdev, int flags) __must_hold(local);
extern void drbd_md_clear_flag(struct drbd_conf *mdev, int flags)__must_hold(local);
extern int drbd_md_test_flag(struct drbd_backing_dev *, int);
+#ifndef DRBD_DEBUG_MD_SYNC
extern void drbd_md_mark_dirty(struct drbd_conf *mdev);
+#else
+#define drbd_md_mark_dirty(m) drbd_md_mark_dirty_(m, __LINE__ , __func__ )
+extern void drbd_md_mark_dirty_(struct drbd_conf *mdev,
+ unsigned int line, const char *func);
+#endif
extern void drbd_queue_bitmap_io(struct drbd_conf *mdev,
int (*io_fn)(struct drbd_conf *),
void (*done)(struct drbd_conf *, int),
extern int drbd_bmio_set_n_write(struct drbd_conf *mdev);
extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev);
extern int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why);
+extern void drbd_go_diskless(struct drbd_conf *mdev);
/* Meta data layout
* Bit 1 ==> local node thinks this block needs to be synced.
*/
+#define SLEEP_TIME (HZ/10)
+
#define BM_BLOCK_SHIFT 12 /* 4k per bit */
#define BM_BLOCK_SIZE (1<<BM_BLOCK_SHIFT)
/* (9+3) : 512 bytes @ 8 bits; representing 16M storage
#endif
/* Sector shift value for the "hash" functions of tl_hash and ee_hash tables.
- * With a value of 6 all IO in one 32K block make it to the same slot of the
+ * With a value of 8 all IO in one 128K block make it to the same slot of the
* hash table. */
-#define HT_SHIFT 6
+#define HT_SHIFT 8
#define DRBD_MAX_SEGMENT_SIZE (1U<<(9+HT_SHIFT))
+#define DRBD_MAX_SIZE_H80_PACKET (1 << 15) /* The old header only allows packets up to 32 KiB of data */
+
/* Number of elements in the app_reads_hash */
#define APP_R_HSIZE 15
/* bm_find_next variants for use while you hold drbd_bm_lock() */
extern unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo);
extern unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo);
+extern unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev);
extern unsigned long drbd_bm_total_weight(struct drbd_conf *mdev);
extern int drbd_bm_rs_done(struct drbd_conf *mdev);
/* for receive_bitmap */
extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local);
extern int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role,
int force);
-enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev);
+extern enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev);
+extern void drbd_try_outdate_peer_async(struct drbd_conf *mdev);
extern int drbd_khelper(struct drbd_conf *mdev, char *cmd);
/* drbd_worker.c */
extern int w_send_read_req(struct drbd_conf *, struct drbd_work *, int);
extern int w_prev_work_done(struct drbd_conf *, struct drbd_work *, int);
extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int);
+extern int w_restart_disk_io(struct drbd_conf *, struct drbd_work *, int);
extern void resync_timer_fn(unsigned long data);
/* drbd_receiver.c */
+extern int drbd_rs_should_slow_down(struct drbd_conf *mdev);
extern int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
const unsigned rw, const int fault_type);
extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list);
sector_t sector,
unsigned int data_size,
gfp_t gfp_mask) __must_hold(local);
-extern void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e);
+extern void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
+ int is_net);
+#define drbd_free_ee(m,e) drbd_free_some_ee(m, e, 0)
+#define drbd_free_net_ee(m,e) drbd_free_some_ee(m, e, 1)
extern void drbd_wait_ee_list_empty(struct drbd_conf *mdev,
struct list_head *head);
extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev,
extern void drbd_set_recv_tcq(struct drbd_conf *mdev, int tcq_enabled);
extern void _drbd_clear_done_ee(struct drbd_conf *mdev, struct list_head *to_be_freed);
extern void drbd_flush_workqueue(struct drbd_conf *mdev);
+extern void drbd_free_tl_hash(struct drbd_conf *mdev);
/* yes, there is kernel_setsockopt, but only since 2.6.18. we don't need to
* mess with get_fs/set_fs, we know we are KERNEL_DS always. */
#define susp_MASK 1
#define user_isp_MASK 1
#define aftr_isp_MASK 1
+#define susp_nod_MASK 1
+#define susp_fen_MASK 1
#define NS(T, S) \
({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \
}
}
-static inline void
-_drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w)
-{
- list_add_tail(&w->list, &q->q);
- up(&q->s);
-}
-
static inline void
drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w)
{
static inline int drbd_send_short_cmd(struct drbd_conf *mdev,
enum drbd_packets cmd)
{
- struct p_header h;
+ struct p_header80 h;
return drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, &h, sizeof(h));
}
static inline int drbd_send_ping(struct drbd_conf *mdev)
{
- struct p_header h;
+ struct p_header80 h;
return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING, &h, sizeof(h));
}
static inline int drbd_send_ping_ack(struct drbd_conf *mdev)
{
- struct p_header h;
+ struct p_header80 h;
return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING_ACK, &h, sizeof(h));
}
static inline void put_net_conf(struct drbd_conf *mdev)
{
if (atomic_dec_and_test(&mdev->net_cnt))
- wake_up(&mdev->misc_wait);
+ wake_up(&mdev->net_cnt_wait);
}
/**
static inline void put_ldev(struct drbd_conf *mdev)
{
+ int i = atomic_dec_return(&mdev->local_cnt);
__release(local);
- if (atomic_dec_and_test(&mdev->local_cnt))
+ D_ASSERT(i >= 0);
+ if (i == 0) {
+ if (mdev->state.disk == D_FAILED)
+ drbd_go_diskless(mdev);
wake_up(&mdev->misc_wait);
- D_ASSERT(atomic_read(&mdev->local_cnt) >= 0);
+ }
}
#ifndef __CHECKER__
return 1;
}
+static inline int is_susp(union drbd_state s)
+{
+ return s.susp || s.susp_nod || s.susp_fen;
+}
+
static inline int __inc_ap_bio_cond(struct drbd_conf *mdev)
{
int mxb = drbd_get_max_buffers(mdev);
- if (mdev->state.susp)
+ if (is_susp(mdev->state))
return 0;
if (test_bit(SUSPEND_IO, &mdev->flags))
return 0;
if (test_bit(MD_NO_BARRIER, &mdev->flags))
return;
- r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL,
- BLKDEV_IFL_WAIT);
+ r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL);
if (r) {
set_bit(MD_NO_BARRIER, &mdev->flags);
dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r);
spin_unlock_irq(&mdev->req_lock);
list_for_each_entry_safe(e, t, &reclaimed, w.list)
- drbd_free_ee(mdev, e);
+ drbd_free_net_ee(mdev, e);
}
/**
* Is also used from inside an other spin_lock_irq(&mdev->req_lock);
* Either links the page chain back to the global pool,
* or returns all pages to the system. */
-static void drbd_pp_free(struct drbd_conf *mdev, struct page *page)
+static void drbd_pp_free(struct drbd_conf *mdev, struct page *page, int is_net)
{
+ atomic_t *a = is_net ? &mdev->pp_in_use_by_net : &mdev->pp_in_use;
int i;
+
if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count)
i = page_chain_free(page);
else {
drbd_pp_vacant += i;
spin_unlock(&drbd_pp_lock);
}
- atomic_sub(i, &mdev->pp_in_use);
- i = atomic_read(&mdev->pp_in_use);
+ i = atomic_sub_return(i, a);
if (i < 0)
- dev_warn(DEV, "ASSERTION FAILED: pp_in_use: %d < 0\n", i);
+ dev_warn(DEV, "ASSERTION FAILED: %s: %d < 0\n",
+ is_net ? "pp_in_use_by_net" : "pp_in_use", i);
wake_up(&drbd_pp_wait);
}
e->size = data_size;
e->flags = 0;
e->sector = sector;
- e->sector = sector;
e->block_id = id;
return e;
return NULL;
}
-void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
+void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, int is_net)
{
- drbd_pp_free(mdev, e->pages);
+ if (e->flags & EE_HAS_DIGEST)
+ kfree(e->digest);
+ drbd_pp_free(mdev, e->pages, is_net);
D_ASSERT(atomic_read(&e->pending_bios) == 0);
D_ASSERT(hlist_unhashed(&e->colision));
mempool_free(e, drbd_ee_mempool);
LIST_HEAD(work_list);
struct drbd_epoch_entry *e, *t;
int count = 0;
+ int is_net = list == &mdev->net_ee;
spin_lock_irq(&mdev->req_lock);
list_splice_init(list, &work_list);
spin_unlock_irq(&mdev->req_lock);
list_for_each_entry_safe(e, t, &work_list, w.list) {
- drbd_free_ee(mdev, e);
+ drbd_free_some_ee(mdev, e, is_net);
count++;
}
return count;
spin_unlock_irq(&mdev->req_lock);
list_for_each_entry_safe(e, t, &reclaimed, w.list)
- drbd_free_ee(mdev, e);
+ drbd_free_net_ee(mdev, e);
/* possible callbacks here:
* e_end_block, and e_end_resync_block, e_send_discard_ack.
static int drbd_send_fp(struct drbd_conf *mdev,
struct socket *sock, enum drbd_packets cmd)
{
- struct p_header *h = (struct p_header *) &mdev->data.sbuf.header;
+ struct p_header80 *h = &mdev->data.sbuf.header.h80;
return _drbd_send_cmd(mdev, sock, cmd, h, sizeof(*h), 0);
}
static enum drbd_packets drbd_recv_fp(struct drbd_conf *mdev, struct socket *sock)
{
- struct p_header *h = (struct p_header *) &mdev->data.sbuf.header;
+ struct p_header80 *h = &mdev->data.rbuf.header.h80;
int rr;
rr = drbd_recv_short(mdev, sock, h, sizeof(*h), 0);
D_ASSERT(!mdev->data.socket);
- if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags))
- dev_err(DEV, "CREATE_BARRIER flag was set in drbd_connect - now cleared!\n");
-
if (drbd_request_state(mdev, NS(conn, C_WF_CONNECTION)) < SS_SUCCESS)
return -2;
drbd_thread_start(&mdev->asender);
+ if (mdev->agreed_pro_version < 95 && get_ldev(mdev)) {
+ drbd_setup_queue_param(mdev, DRBD_MAX_SIZE_H80_PACKET);
+ put_ldev(mdev);
+ }
+
if (!drbd_send_protocol(mdev))
return -1;
drbd_send_sync_param(mdev, &mdev->sync_conf);
return -1;
}
-static int drbd_recv_header(struct drbd_conf *mdev, struct p_header *h)
+static int drbd_recv_header(struct drbd_conf *mdev, enum drbd_packets *cmd, unsigned int *packet_size)
{
+ union p_header *h = &mdev->data.rbuf.header;
int r;
r = drbd_recv(mdev, h, sizeof(*h));
-
if (unlikely(r != sizeof(*h))) {
dev_err(DEV, "short read expecting header on sock: r=%d\n", r);
return FALSE;
- };
- h->command = be16_to_cpu(h->command);
- h->length = be16_to_cpu(h->length);
- if (unlikely(h->magic != BE_DRBD_MAGIC)) {
- dev_err(DEV, "magic?? on data m: 0x%lx c: %d l: %d\n",
- (long)be32_to_cpu(h->magic),
- h->command, h->length);
+ }
+
+ if (likely(h->h80.magic == BE_DRBD_MAGIC)) {
+ *cmd = be16_to_cpu(h->h80.command);
+ *packet_size = be16_to_cpu(h->h80.length);
+ } else if (h->h95.magic == BE_DRBD_MAGIC_BIG) {
+ *cmd = be16_to_cpu(h->h95.command);
+ *packet_size = be32_to_cpu(h->h95.length);
+ } else {
+ dev_err(DEV, "magic?? on data m: 0x%08x c: %d l: %d\n",
+ be32_to_cpu(h->h80.magic),
+ be16_to_cpu(h->h80.command),
+ be16_to_cpu(h->h80.length));
return FALSE;
}
mdev->last_received = jiffies;
if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) {
rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL,
- NULL, BLKDEV_IFL_WAIT);
+ NULL);
if (rv) {
dev_err(DEV, "local disk flush failed with status %d\n", rv);
/* would rather check on EOPNOTSUPP, but that is not reliable.
return 1;
}
-static int receive_Barrier(struct drbd_conf *mdev, struct p_header *h)
+static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
int rv, issue_flush;
- struct p_barrier *p = (struct p_barrier *)h;
+ struct p_barrier *p = &mdev->data.rbuf.barrier;
struct drbd_epoch *epoch;
- ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
-
- rv = drbd_recv(mdev, h->payload, h->length);
- ERR_IF(rv != h->length) return FALSE;
-
inc_unacked(mdev);
if (mdev->net_conf->wire_protocol != DRBD_PROT_C)
data_size -= rr;
}
kunmap(page);
- drbd_pp_free(mdev, page);
+ drbd_pp_free(mdev, page, 0);
return rv;
}
list_add(&e->w.list, &mdev->sync_ee);
spin_unlock_irq(&mdev->req_lock);
+ atomic_add(data_size >> 9, &mdev->rs_sect_ev);
if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0)
return TRUE;
+ /* drbd_submit_ee currently fails for one reason only:
+ * not being able to allocate enough bios.
+ * Is dropping the connection going to help? */
+ spin_lock_irq(&mdev->req_lock);
+ list_del(&e->w.list);
+ spin_unlock_irq(&mdev->req_lock);
+
drbd_free_ee(mdev, e);
fail:
put_ldev(mdev);
return FALSE;
}
-static int receive_DataReply(struct drbd_conf *mdev, struct p_header *h)
+static int receive_DataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
struct drbd_request *req;
sector_t sector;
- unsigned int header_size, data_size;
int ok;
- struct p_data *p = (struct p_data *)h;
-
- header_size = sizeof(*p) - sizeof(*h);
- data_size = h->length - header_size;
-
- ERR_IF(data_size == 0) return FALSE;
-
- if (drbd_recv(mdev, h->payload, header_size) != header_size)
- return FALSE;
+ struct p_data *p = &mdev->data.rbuf.data;
sector = be64_to_cpu(p->sector);
return ok;
}
-static int receive_RSDataReply(struct drbd_conf *mdev, struct p_header *h)
+static int receive_RSDataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
sector_t sector;
- unsigned int header_size, data_size;
int ok;
- struct p_data *p = (struct p_data *)h;
-
- header_size = sizeof(*p) - sizeof(*h);
- data_size = h->length - header_size;
-
- ERR_IF(data_size == 0) return FALSE;
-
- if (drbd_recv(mdev, h->payload, header_size) != header_size)
- return FALSE;
+ struct p_data *p = &mdev->data.rbuf.data;
sector = be64_to_cpu(p->sector);
D_ASSERT(p->block_id == ID_SYNCER);
ok = drbd_drain_block(mdev, data_size);
- drbd_send_ack_dp(mdev, P_NEG_ACK, p);
+ drbd_send_ack_dp(mdev, P_NEG_ACK, p, data_size);
}
+ atomic_add(data_size >> 9, &mdev->rs_sect_in);
+
return ok;
}
return ret;
}
+/*
+ * write_flags_to_bio() - translate on-the-wire DP_* flags into REQ_* flags.
+ * @mdev: device; only mdev->agreed_pro_version is consulted
+ * @dpf:  DP_* flag word taken from the received data packet
+ *
+ * Returns the REQ_* bits to OR into the rw argument of the local submit.
+ *
+ * When the agreed protocol version is below 95, only DP_RW_SYNC is
+ * honoured and it implies both REQ_SYNC and REQ_UNPLUG.  From version 95
+ * on, every flag is mapped individually.
+ *
+ * DP_FLUSH must become REQ_FLUSH, not REQ_FUA: the two request different
+ * things from the block layer (a cache flush before the write versus
+ * forced unit access for this write only), and DP_FUA already covers
+ * REQ_FUA.
+ */
+static unsigned long write_flags_to_bio(struct drbd_conf *mdev, u32 dpf)
+{
+	unsigned long rw = 0;
+
+	if (mdev->agreed_pro_version < 95)
+		return dpf & DP_RW_SYNC ? (REQ_SYNC | REQ_UNPLUG) : 0;
+
+	if (dpf & DP_RW_SYNC)
+		rw |= REQ_SYNC;
+	if (dpf & DP_UNPLUG)
+		rw |= REQ_UNPLUG;
+	if (dpf & DP_FUA)
+		rw |= REQ_FUA;
+	if (dpf & DP_FLUSH)
+		rw |= REQ_FLUSH;
+	if (dpf & DP_DISCARD)
+		rw |= REQ_DISCARD;
+
+	return rw;
+}
+
/* mirrored write */
-static int receive_Data(struct drbd_conf *mdev, struct p_header *h)
+static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
sector_t sector;
struct drbd_epoch_entry *e;
- struct p_data *p = (struct p_data *)h;
- int header_size, data_size;
+ struct p_data *p = &mdev->data.rbuf.data;
int rw = WRITE;
u32 dp_flags;
- header_size = sizeof(*p) - sizeof(*h);
- data_size = h->length - header_size;
-
- ERR_IF(data_size == 0) return FALSE;
-
- if (drbd_recv(mdev, h->payload, header_size) != header_size)
- return FALSE;
-
if (!get_ldev(mdev)) {
if (__ratelimit(&drbd_ratelimit_state))
dev_err(DEV, "Can not write mirrored data block "
mdev->peer_seq++;
spin_unlock(&mdev->peer_seq_lock);
- drbd_send_ack_dp(mdev, P_NEG_ACK, p);
+ drbd_send_ack_dp(mdev, P_NEG_ACK, p, data_size);
atomic_inc(&mdev->current_epoch->epoch_size);
return drbd_drain_block(mdev, data_size);
}
spin_unlock(&mdev->epoch_lock);
dp_flags = be32_to_cpu(p->dp_flags);
- if (dp_flags & DP_HARDBARRIER) {
- dev_err(DEV, "ASSERT FAILED would have submitted barrier request\n");
- /* rw |= REQ_HARDBARRIER; */
- }
- if (dp_flags & DP_RW_SYNC)
- rw |= REQ_SYNC | REQ_UNPLUG;
+ rw |= write_flags_to_bio(mdev, dp_flags);
+
if (dp_flags & DP_MAY_SET_IN_SYNC)
e->flags |= EE_MAY_SET_IN_SYNC;
if (drbd_submit_ee(mdev, e, rw, DRBD_FAULT_DT_WR) == 0)
return TRUE;
+ /* drbd_submit_ee currently fails for one reason only:
+ * not being able to allocate enough bios.
+ * Is dropping the connection going to help? */
+ spin_lock_irq(&mdev->req_lock);
+ list_del(&e->w.list);
+ hlist_del_init(&e->colision);
+ spin_unlock_irq(&mdev->req_lock);
+ if (e->flags & EE_CALL_AL_COMPLETE_IO)
+ drbd_al_complete_io(mdev, e->sector);
+
out_interrupted:
/* yes, the epoch_size now is imbalanced.
* but we drop the connection anyways, so we don't have a chance to
return FALSE;
}
-static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
+/* We may throttle resync, if the lower device seems to be busy,
+ * and current sync rate is above c_min_rate.
+ *
+ * To decide whether or not the lower device is busy, we use a scheme similar
+ * to MD RAID is_mddev_idle(): if the partition stats reveal "significant"
+ * (more than 64 sectors) of activity we cannot account for with our own resync
+ * activity, it obviously is "busy".
+ *
+ * The current sync rate used here uses only the most recent two step marks,
+ * to have a short time average so we can react faster.
+ */
+int drbd_rs_should_slow_down(struct drbd_conf *mdev)
+{
+	struct gendisk *disk = mdev->ldev->backing_bdev->bd_contains->bd_disk;
+	unsigned long rs_left, db, dt, dbdt;
+	int curr_events;
+	int i;
+
+	/* a c_min_rate of zero switches the throttling off */
+	if (mdev->sync_conf.c_min_rate == 0)
+		return 0;
+
+	/* sectors read + written on the whole disk, minus what we
+	 * accounted as our own resync activity */
+	curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
+		      (int)part_stat_read(&disk->part0, sectors[1]) -
+			atomic_read(&mdev->rs_sect_ev);
+
+	/* nothing significant happened since the last recorded sample:
+	 * no reason to throttle, and the sample is left untouched */
+	if (mdev->rs_last_events && curr_events - mdev->rs_last_events <= 64)
+		return 0;
+
+	mdev->rs_last_events = curr_events;
+
+	/* sync speed average over the last 2*DRBD_SYNC_MARK_STEP,
+	 * approx. */
+	i = (mdev->rs_last_mark + DRBD_SYNC_MARKS-2) % DRBD_SYNC_MARKS;
+	rs_left = drbd_bm_total_weight(mdev) - mdev->rs_failed;
+
+	dt = ((long)jiffies - (long)mdev->rs_mark_time[i]) / HZ;
+	if (dt == 0)
+		dt = 1;	/* avoid dividing by zero below */
+	db = mdev->rs_mark_left[i] - rs_left;
+	dbdt = Bit2KB(db/dt);
+
+	/* throttle only while the recent rate exceeds c_min_rate */
+	return dbdt > mdev->sync_conf.c_min_rate;
+}
+
+
+static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int digest_size)
{
sector_t sector;
const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
struct drbd_epoch_entry *e;
struct digest_info *di = NULL;
- int size, digest_size;
+ int size, verb;
unsigned int fault_type;
- struct p_block_req *p =
- (struct p_block_req *)h;
- const int brps = sizeof(*p)-sizeof(*h);
-
- if (drbd_recv(mdev, h->payload, brps) != brps)
- return FALSE;
+ struct p_block_req *p = &mdev->data.rbuf.block_req;
sector = be64_to_cpu(p->sector);
size = be32_to_cpu(p->blksize);
}
if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) {
- if (__ratelimit(&drbd_ratelimit_state))
+ verb = 1;
+ switch (cmd) {
+ case P_DATA_REQUEST:
+ drbd_send_ack_rp(mdev, P_NEG_DREPLY, p);
+ break;
+ case P_RS_DATA_REQUEST:
+ case P_CSUM_RS_REQUEST:
+ case P_OV_REQUEST:
+ drbd_send_ack_rp(mdev, P_NEG_RS_DREPLY , p);
+ break;
+ case P_OV_REPLY:
+ verb = 0;
+ dec_rs_pending(mdev);
+ drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size, ID_IN_SYNC);
+ break;
+ default:
+ dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n",
+ cmdname(cmd));
+ }
+ if (verb && __ratelimit(&drbd_ratelimit_state))
dev_err(DEV, "Can not satisfy peer's read request, "
"no local data.\n");
- drbd_send_ack_rp(mdev, h->command == P_DATA_REQUEST ? P_NEG_DREPLY :
- P_NEG_RS_DREPLY , p);
- return drbd_drain_block(mdev, h->length - brps);
+
+ /* drain possibly payload */
+ return drbd_drain_block(mdev, digest_size);
}
/* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
return FALSE;
}
- switch (h->command) {
+ switch (cmd) {
case P_DATA_REQUEST:
e->w.cb = w_e_end_data_req;
fault_type = DRBD_FAULT_DT_RD;
- break;
+ /* application IO, don't drbd_rs_begin_io */
+ goto submit;
+
case P_RS_DATA_REQUEST:
e->w.cb = w_e_end_rsdata_req;
fault_type = DRBD_FAULT_RS_RD;
- /* Eventually this should become asynchronously. Currently it
- * blocks the whole receiver just to delay the reading of a
- * resync data block.
- * the drbd_work_queue mechanism is made for this...
- */
- if (!drbd_rs_begin_io(mdev, sector)) {
- /* we have been interrupted,
- * probably connection lost! */
- D_ASSERT(signal_pending(current));
- goto out_free_e;
- }
break;
case P_OV_REPLY:
case P_CSUM_RS_REQUEST:
fault_type = DRBD_FAULT_RS_RD;
- digest_size = h->length - brps ;
di = kmalloc(sizeof(*di) + digest_size, GFP_NOIO);
if (!di)
goto out_free_e;
di->digest_size = digest_size;
di->digest = (((char *)di)+sizeof(struct digest_info));
+ e->digest = di;
+ e->flags |= EE_HAS_DIGEST;
+
if (drbd_recv(mdev, di->digest, digest_size) != digest_size)
goto out_free_e;
- e->block_id = (u64)(unsigned long)di;
- if (h->command == P_CSUM_RS_REQUEST) {
+ if (cmd == P_CSUM_RS_REQUEST) {
D_ASSERT(mdev->agreed_pro_version >= 89);
e->w.cb = w_e_end_csum_rs_req;
- } else if (h->command == P_OV_REPLY) {
+ } else if (cmd == P_OV_REPLY) {
e->w.cb = w_e_end_ov_reply;
dec_rs_pending(mdev);
- break;
- }
-
- if (!drbd_rs_begin_io(mdev, sector)) {
- /* we have been interrupted, probably connection lost! */
- D_ASSERT(signal_pending(current));
- goto out_free_e;
+ /* drbd_rs_begin_io done when we sent this request,
+ * but accounting still needs to be done. */
+ goto submit_for_resync;
}
break;
case P_OV_REQUEST:
- if (mdev->state.conn >= C_CONNECTED &&
- mdev->state.conn != C_VERIFY_T)
- dev_warn(DEV, "ASSERT FAILED: got P_OV_REQUEST while being %s\n",
- drbd_conn_str(mdev->state.conn));
if (mdev->ov_start_sector == ~(sector_t)0 &&
mdev->agreed_pro_version >= 90) {
mdev->ov_start_sector = sector;
}
e->w.cb = w_e_end_ov_req;
fault_type = DRBD_FAULT_RS_RD;
- /* Eventually this should become asynchronous. Currently it
- * blocks the whole receiver just to delay the reading of a
- * resync data block.
- * the drbd_work_queue mechanism is made for this...
- */
- if (!drbd_rs_begin_io(mdev, sector)) {
- /* we have been interrupted,
- * probably connection lost! */
- D_ASSERT(signal_pending(current));
- goto out_free_e;
- }
break;
-
default:
dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n",
- cmdname(h->command));
+ cmdname(cmd));
fault_type = DRBD_FAULT_MAX;
+ goto out_free_e;
}
- spin_lock_irq(&mdev->req_lock);
- list_add(&e->w.list, &mdev->read_ee);
- spin_unlock_irq(&mdev->req_lock);
+ /* Throttle, drbd_rs_begin_io and submit should become asynchronous
+ * wrt the receiver, but it is not as straightforward as it may seem.
+ * Various places in the resync start and stop logic assume resync
+ * requests are processed in order, requeuing this on the worker thread
+ * introduces a bunch of new code for synchronization between threads.
+ *
+ * Unlimited throttling before drbd_rs_begin_io may stall the resync
+ * "forever", throttling after drbd_rs_begin_io will lock that extent
+ * for application writes for the same time. For now, just throttle
+ * here, where the rest of the code expects the receiver to sleep for
+ * a while, anyways.
+ */
+
+ /* Throttle before drbd_rs_begin_io, as that locks out application IO;
+ * this defers syncer requests for some time, before letting at least
+ * one request through. The resync controller on the receiving side
+ * will adapt to the incoming rate accordingly.
+ *
+ * We cannot throttle here if remote is Primary/SyncTarget:
+ * we would also throttle its application reads.
+ * In that case, throttling is done on the SyncTarget only.
+ */
+ if (mdev->state.peer != R_PRIMARY && drbd_rs_should_slow_down(mdev))
+ msleep(100);
+ if (drbd_rs_begin_io(mdev, e->sector))
+ goto out_free_e;
+submit_for_resync:
+ atomic_add(size >> 9, &mdev->rs_sect_ev);
+
+submit:
inc_unacked(mdev);
+ spin_lock_irq(&mdev->req_lock);
+ list_add_tail(&e->w.list, &mdev->read_ee);
+ spin_unlock_irq(&mdev->req_lock);
if (drbd_submit_ee(mdev, e, READ, fault_type) == 0)
return TRUE;
+ /* drbd_submit_ee currently fails for one reason only:
+ * not being able to allocate enough bios.
+ * Is dropping the connection going to help? */
+ spin_lock_irq(&mdev->req_lock);
+ list_del(&e->w.list);
+ spin_unlock_irq(&mdev->req_lock);
+ /* no drbd_rs_complete_io(), we are dropping the connection anyways */
+
out_free_e:
- kfree(di);
put_ldev(mdev);
drbd_free_ee(mdev, e);
return FALSE;
return 1;
}
-static int receive_protocol(struct drbd_conf *mdev, struct p_header *h)
+static int receive_protocol(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
- struct p_protocol *p = (struct p_protocol *)h;
- int header_size, data_size;
+ struct p_protocol *p = &mdev->data.rbuf.protocol;
int p_proto, p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
int p_want_lose, p_two_primaries, cf;
char p_integrity_alg[SHARED_SECRET_MAX] = "";
- header_size = sizeof(*p) - sizeof(*h);
- data_size = h->length - header_size;
-
- if (drbd_recv(mdev, h->payload, header_size) != header_size)
- return FALSE;
-
p_proto = be32_to_cpu(p->protocol);
p_after_sb_0p = be32_to_cpu(p->after_sb_0p);
p_after_sb_1p = be32_to_cpu(p->after_sb_1p);
return tfm;
}
-static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h)
+static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int packet_size)
{
int ok = TRUE;
- struct p_rs_param_89 *p = (struct p_rs_param_89 *)h;
+ struct p_rs_param_95 *p = &mdev->data.rbuf.rs_param_95;
unsigned int header_size, data_size, exp_max_sz;
struct crypto_hash *verify_tfm = NULL;
struct crypto_hash *csums_tfm = NULL;
const int apv = mdev->agreed_pro_version;
+ int *rs_plan_s = NULL;
+ int fifo_size = 0;
exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param)
: apv == 88 ? sizeof(struct p_rs_param)
+ SHARED_SECRET_MAX
- : /* 89 */ sizeof(struct p_rs_param_89);
+ : apv <= 94 ? sizeof(struct p_rs_param_89)
+ : /* apv >= 95 */ sizeof(struct p_rs_param_95);
- if (h->length > exp_max_sz) {
+ if (packet_size > exp_max_sz) {
dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n",
- h->length, exp_max_sz);
+ packet_size, exp_max_sz);
return FALSE;
}
if (apv <= 88) {
- header_size = sizeof(struct p_rs_param) - sizeof(*h);
- data_size = h->length - header_size;
- } else /* apv >= 89 */ {
- header_size = sizeof(struct p_rs_param_89) - sizeof(*h);
- data_size = h->length - header_size;
+ header_size = sizeof(struct p_rs_param) - sizeof(struct p_header80);
+ data_size = packet_size - header_size;
+ } else if (apv <= 94) {
+ header_size = sizeof(struct p_rs_param_89) - sizeof(struct p_header80);
+ data_size = packet_size - header_size;
+ D_ASSERT(data_size == 0);
+ } else {
+ header_size = sizeof(struct p_rs_param_95) - sizeof(struct p_header80);
+ data_size = packet_size - header_size;
D_ASSERT(data_size == 0);
}
/* initialize verify_alg and csums_alg */
memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
- if (drbd_recv(mdev, h->payload, header_size) != header_size)
+ if (drbd_recv(mdev, &p->head.payload, header_size) != header_size)
return FALSE;
mdev->sync_conf.rate = be32_to_cpu(p->rate);
}
}
+ if (apv > 94) {
+ mdev->sync_conf.rate = be32_to_cpu(p->rate);
+ mdev->sync_conf.c_plan_ahead = be32_to_cpu(p->c_plan_ahead);
+ mdev->sync_conf.c_delay_target = be32_to_cpu(p->c_delay_target);
+ mdev->sync_conf.c_fill_target = be32_to_cpu(p->c_fill_target);
+ mdev->sync_conf.c_max_rate = be32_to_cpu(p->c_max_rate);
+
+ fifo_size = (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ;
+ if (fifo_size != mdev->rs_plan_s.size && fifo_size > 0) {
+ rs_plan_s = kzalloc(sizeof(int) * fifo_size, GFP_KERNEL);
+ if (!rs_plan_s) {
+ dev_err(DEV, "kmalloc of fifo_buffer failed");
+ goto disconnect;
+ }
+ }
+ }
spin_lock(&mdev->peer_seq_lock);
/* lock against drbd_nl_syncer_conf() */
mdev->csums_tfm = csums_tfm;
dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg);
}
+ if (fifo_size != mdev->rs_plan_s.size) {
+ kfree(mdev->rs_plan_s.values);
+ mdev->rs_plan_s.values = rs_plan_s;
+ mdev->rs_plan_s.size = fifo_size;
+ mdev->rs_planed = 0;
+ }
spin_unlock(&mdev->peer_seq_lock);
}
(unsigned long long)a, (unsigned long long)b);
}
-static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
+static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
- struct p_sizes *p = (struct p_sizes *)h;
+ struct p_sizes *p = &mdev->data.rbuf.sizes;
enum determine_dev_size dd = unchanged;
unsigned int max_seg_s;
sector_t p_size, p_usize, my_usize;
int ldsc = 0; /* local disk size changed */
enum dds_flags ddsf;
- ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
- if (drbd_recv(mdev, h->payload, h->length) != h->length)
- return FALSE;
-
p_size = be64_to_cpu(p->d_size);
p_usize = be64_to_cpu(p->u_size);
* we still need to figure out whether we accept that. */
mdev->p_size = p_size;
-#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
if (get_ldev(mdev)) {
warn_if_differ_considerably(mdev, "lower level device sizes",
p_size, drbd_get_max_capacity(mdev->ldev));
if (mdev->agreed_pro_version < 94)
max_seg_s = be32_to_cpu(p->max_segment_size);
+ else if (mdev->agreed_pro_version == 94)
+ max_seg_s = DRBD_MAX_SIZE_H80_PACKET;
else /* drbd 8.3.8 onwards */
max_seg_s = DRBD_MAX_SEGMENT_SIZE;
return TRUE;
}
-static int receive_uuids(struct drbd_conf *mdev, struct p_header *h)
+static int receive_uuids(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
- struct p_uuids *p = (struct p_uuids *)h;
+ struct p_uuids *p = &mdev->data.rbuf.uuids;
u64 *p_uuid;
int i;
- ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
- if (drbd_recv(mdev, h->payload, h->length) != h->length)
- return FALSE;
-
p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO);
for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++)
drbd_md_sync(mdev);
}
put_ldev(mdev);
+ } else if (mdev->state.disk < D_INCONSISTENT &&
+ mdev->state.role == R_PRIMARY) {
+ /* I am a diskless primary, the peer just created a new current UUID
+ for me. */
+ drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
}
/* Before we test for the disk state, we should wait until an eventually
return ms;
}
-static int receive_req_state(struct drbd_conf *mdev, struct p_header *h)
+static int receive_req_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
- struct p_req_state *p = (struct p_req_state *)h;
+ struct p_req_state *p = &mdev->data.rbuf.req_state;
union drbd_state mask, val;
int rv;
- ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
- if (drbd_recv(mdev, h->payload, h->length) != h->length)
- return FALSE;
-
mask.i = be32_to_cpu(p->mask);
val.i = be32_to_cpu(p->val);
return TRUE;
}
-static int receive_state(struct drbd_conf *mdev, struct p_header *h)
+static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
- struct p_state *p = (struct p_state *)h;
- enum drbd_conns nconn, oconn;
- union drbd_state ns, peer_state;
+ struct p_state *p = &mdev->data.rbuf.state;
+ union drbd_state os, ns, peer_state;
enum drbd_disk_state real_peer_disk;
+ enum chg_state_flags cs_flags;
int rv;
- ERR_IF(h->length != (sizeof(*p)-sizeof(*h)))
- return FALSE;
-
- if (drbd_recv(mdev, h->payload, h->length) != h->length)
- return FALSE;
-
peer_state.i = be32_to_cpu(p->state);
real_peer_disk = peer_state.disk;
spin_lock_irq(&mdev->req_lock);
retry:
- oconn = nconn = mdev->state.conn;
+ os = ns = mdev->state;
spin_unlock_irq(&mdev->req_lock);
- if (nconn == C_WF_REPORT_PARAMS)
- nconn = C_CONNECTED;
+ /* peer says his disk is uptodate, while we think it is inconsistent,
+ * and this happens while we think we have a sync going on. */
+ if (os.pdsk == D_INCONSISTENT && real_peer_disk == D_UP_TO_DATE &&
+ os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) {
+ /* If we are (becoming) SyncSource, but peer is still in sync
+ * preparation, ignore its uptodate-ness to avoid flapping, it
+ * will change to inconsistent once the peer reaches active
+ * syncing states.
+ * It may have changed syncer-paused flags, however, so we
+ * cannot ignore this completely. */
+ if (peer_state.conn > C_CONNECTED &&
+ peer_state.conn < C_SYNC_SOURCE)
+ real_peer_disk = D_INCONSISTENT;
+
+ /* if peer_state changes to connected at the same time,
+ * it explicitly notifies us that it finished resync.
+ * Maybe we should finish it up, too? */
+ else if (os.conn >= C_SYNC_SOURCE &&
+ peer_state.conn == C_CONNECTED) {
+ if (drbd_bm_total_weight(mdev) <= mdev->rs_failed)
+ drbd_resync_finished(mdev);
+ return TRUE;
+ }
+ }
+
+ /* peer says his disk is inconsistent, while we think it is uptodate,
+ * and this happens while the peer still thinks we have a sync going on,
+ * but we think we are already done with the sync.
+ * We ignore this to avoid flapping pdsk.
+ * This should not happen, if the peer is a recent version of drbd. */
+ if (os.pdsk == D_UP_TO_DATE && real_peer_disk == D_INCONSISTENT &&
+ os.conn == C_CONNECTED && peer_state.conn > C_SYNC_SOURCE)
+ real_peer_disk = D_UP_TO_DATE;
+
+ if (ns.conn == C_WF_REPORT_PARAMS)
+ ns.conn = C_CONNECTED;
if (mdev->p_uuid && peer_state.disk >= D_NEGOTIATING &&
get_ldev_if_state(mdev, D_NEGOTIATING)) {
int cr; /* consider resync */
/* if we established a new connection */
- cr = (oconn < C_CONNECTED);
+ cr = (os.conn < C_CONNECTED);
/* if we had an established connection
* and one of the nodes newly attaches a disk */
- cr |= (oconn == C_CONNECTED &&
+ cr |= (os.conn == C_CONNECTED &&
(peer_state.disk == D_NEGOTIATING ||
- mdev->state.disk == D_NEGOTIATING));
+ os.disk == D_NEGOTIATING));
/* if we have both been inconsistent, and the peer has been
* forced to be UpToDate with --overwrite-data */
cr |= test_bit(CONSIDER_RESYNC, &mdev->flags);
/* if we had been plain connected, and the admin requested to
* start a sync by "invalidate" or "invalidate-remote" */
- cr |= (oconn == C_CONNECTED &&
+ cr |= (os.conn == C_CONNECTED &&
(peer_state.conn >= C_STARTING_SYNC_S &&
peer_state.conn <= C_WF_BITMAP_T));
if (cr)
- nconn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk);
+ ns.conn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk);
put_ldev(mdev);
- if (nconn == C_MASK) {
- nconn = C_CONNECTED;
+ if (ns.conn == C_MASK) {
+ ns.conn = C_CONNECTED;
if (mdev->state.disk == D_NEGOTIATING) {
drbd_force_state(mdev, NS(disk, D_DISKLESS));
} else if (peer_state.disk == D_NEGOTIATING) {
} else {
if (test_and_clear_bit(CONN_DRY_RUN, &mdev->flags))
return FALSE;
- D_ASSERT(oconn == C_WF_REPORT_PARAMS);
+ D_ASSERT(os.conn == C_WF_REPORT_PARAMS);
drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
return FALSE;
}
}
spin_lock_irq(&mdev->req_lock);
- if (mdev->state.conn != oconn)
+ if (mdev->state.i != os.i)
goto retry;
clear_bit(CONSIDER_RESYNC, &mdev->flags);
- ns.i = mdev->state.i;
- ns.conn = nconn;
ns.peer = peer_state.role;
ns.pdsk = real_peer_disk;
ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
- if ((nconn == C_CONNECTED || nconn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
+ if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
ns.disk = mdev->new_state_tmp.disk;
-
- rv = _drbd_set_state(mdev, ns, CS_VERBOSE | CS_HARD, NULL);
+ cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD);
+ if (ns.pdsk == D_CONSISTENT && is_susp(ns) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED &&
+ test_bit(NEW_CUR_UUID, &mdev->flags)) {
+ /* Do not allow tl_restart(resend) for a rebooted peer. We can only allow this
+ for temporal network outages! */
+ spin_unlock_irq(&mdev->req_lock);
+ dev_err(DEV, "Aborting Connect, can not thaw IO with an only Consistent peer\n");
+ tl_clear(mdev);
+ drbd_uuid_new_current(mdev);
+ clear_bit(NEW_CUR_UUID, &mdev->flags);
+ drbd_force_state(mdev, NS2(conn, C_PROTOCOL_ERROR, susp, 0));
+ return FALSE;
+ }
+ rv = _drbd_set_state(mdev, ns, cs_flags, NULL);
ns = mdev->state;
spin_unlock_irq(&mdev->req_lock);
return FALSE;
}
- if (oconn > C_WF_REPORT_PARAMS) {
- if (nconn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
+ if (os.conn > C_WF_REPORT_PARAMS) {
+ if (ns.conn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
peer_state.disk != D_NEGOTIATING ) {
/* we want resync, peer has not yet decided to sync... */
/* Nowadays only used when forcing a node into primary role and
return TRUE;
}
-static int receive_sync_uuid(struct drbd_conf *mdev, struct p_header *h)
+static int receive_sync_uuid(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
- struct p_rs_uuid *p = (struct p_rs_uuid *)h;
+ struct p_rs_uuid *p = &mdev->data.rbuf.rs_uuid;
wait_event(mdev->misc_wait,
mdev->state.conn == C_WF_SYNC_UUID ||
/* D_ASSERT( mdev->state.conn == C_WF_SYNC_UUID ); */
- ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
- if (drbd_recv(mdev, h->payload, h->length) != h->length)
- return FALSE;
-
/* Here the _drbd_uuid_ functions are right, current should
_not_ be rotated into the history */
if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
enum receive_bitmap_ret { OK, DONE, FAILED };
static enum receive_bitmap_ret
-receive_bitmap_plain(struct drbd_conf *mdev, struct p_header *h,
- unsigned long *buffer, struct bm_xfer_ctx *c)
+receive_bitmap_plain(struct drbd_conf *mdev, unsigned int data_size,
+ unsigned long *buffer, struct bm_xfer_ctx *c)
{
unsigned num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
unsigned want = num_words * sizeof(long);
- if (want != h->length) {
- dev_err(DEV, "%s:want (%u) != h->length (%u)\n", __func__, want, h->length);
+ if (want != data_size) {
+ dev_err(DEV, "%s:want (%u) != data_size (%u)\n", __func__, want, data_size);
return FAILED;
}
if (want == 0)
u64 tmp;
unsigned long s = c->bit_offset;
unsigned long e;
- int len = p->head.length - (sizeof(*p) - sizeof(p->head));
+ int len = be16_to_cpu(p->head.length) - (sizeof(*p) - sizeof(p->head));
int toggle = DCBP_get_start(p);
int have;
int bits;
const char *direction, struct bm_xfer_ctx *c)
{
/* what would it take to transfer it "plaintext" */
- unsigned plain = sizeof(struct p_header) *
+ unsigned plain = sizeof(struct p_header80) *
((c->bm_words+BM_PACKET_WORDS-1)/BM_PACKET_WORDS+1)
+ c->bm_words * sizeof(long);
unsigned total = c->bytes[0] + c->bytes[1];
in order to be agnostic to the 32 vs 64 bits issue.
returns 0 on failure, 1 if we successfully received it. */
-static int receive_bitmap(struct drbd_conf *mdev, struct p_header *h)
+static int receive_bitmap(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
struct bm_xfer_ctx c;
void *buffer;
enum receive_bitmap_ret ret;
int ok = FALSE;
+ struct p_header80 *h = &mdev->data.rbuf.header.h80;
wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
};
do {
- if (h->command == P_BITMAP) {
- ret = receive_bitmap_plain(mdev, h, buffer, &c);
- } else if (h->command == P_COMPRESSED_BITMAP) {
+ if (cmd == P_BITMAP) {
+ ret = receive_bitmap_plain(mdev, data_size, buffer, &c);
+ } else if (cmd == P_COMPRESSED_BITMAP) {
/* MAYBE: sanity check that we speak proto >= 90,
* and the feature is enabled! */
struct p_compressed_bm *p;
- if (h->length > BM_PACKET_PAYLOAD_BYTES) {
+ if (data_size > BM_PACKET_PAYLOAD_BYTES) {
dev_err(DEV, "ReportCBitmap packet too large\n");
goto out;
}
/* use the page buff */
p = buffer;
memcpy(p, h, sizeof(*h));
- if (drbd_recv(mdev, p->head.payload, h->length) != h->length)
+ if (drbd_recv(mdev, p->head.payload, data_size) != data_size)
goto out;
- if (p->head.length <= (sizeof(*p) - sizeof(p->head))) {
- dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", p->head.length);
+ if (data_size <= (sizeof(*p) - sizeof(p->head))) {
+ dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", data_size);
return FAILED;
}
ret = decode_bitmap_c(mdev, p, &c);
} else {
- dev_warn(DEV, "receive_bitmap: h->command neither ReportBitMap nor ReportCBitMap (is 0x%x)", h->command);
+ dev_warn(DEV, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", cmd);
goto out;
}
- c.packets[h->command == P_BITMAP]++;
- c.bytes[h->command == P_BITMAP] += sizeof(struct p_header) + h->length;
+ c.packets[cmd == P_BITMAP]++;
+ c.bytes[cmd == P_BITMAP] += sizeof(struct p_header80) + data_size;
if (ret != OK)
break;
- if (!drbd_recv_header(mdev, h))
+ if (!drbd_recv_header(mdev, &cmd, &data_size))
goto out;
} while (ret == OK);
if (ret == FAILED)
return ok;
}
-static int receive_skip_(struct drbd_conf *mdev, struct p_header *h, int silent)
+static int receive_skip(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
/* TODO zero copy sink :) */
static char sink[128];
int size, want, r;
- if (!silent)
- dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n",
- h->command, h->length);
+ dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n",
+ cmd, data_size);
- size = h->length;
+ size = data_size;
while (size > 0) {
want = min_t(int, size, sizeof(sink));
r = drbd_recv(mdev, sink, want);
return size == 0;
}
-static int receive_skip(struct drbd_conf *mdev, struct p_header *h)
-{
- return receive_skip_(mdev, h, 0);
-}
-
-static int receive_skip_silent(struct drbd_conf *mdev, struct p_header *h)
-{
- return receive_skip_(mdev, h, 1);
-}
-
-static int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header *h)
+static int receive_UnplugRemote(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
if (mdev->state.disk >= D_INCONSISTENT)
drbd_kick_lo(mdev);
return TRUE;
}
-typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, struct p_header *);
-
-static drbd_cmd_handler_f drbd_default_handler[] = {
- [P_DATA] = receive_Data,
- [P_DATA_REPLY] = receive_DataReply,
- [P_RS_DATA_REPLY] = receive_RSDataReply,
- [P_BARRIER] = receive_Barrier,
- [P_BITMAP] = receive_bitmap,
- [P_COMPRESSED_BITMAP] = receive_bitmap,
- [P_UNPLUG_REMOTE] = receive_UnplugRemote,
- [P_DATA_REQUEST] = receive_DataRequest,
- [P_RS_DATA_REQUEST] = receive_DataRequest,
- [P_SYNC_PARAM] = receive_SyncParam,
- [P_SYNC_PARAM89] = receive_SyncParam,
- [P_PROTOCOL] = receive_protocol,
- [P_UUIDS] = receive_uuids,
- [P_SIZES] = receive_sizes,
- [P_STATE] = receive_state,
- [P_STATE_CHG_REQ] = receive_req_state,
- [P_SYNC_UUID] = receive_sync_uuid,
- [P_OV_REQUEST] = receive_DataRequest,
- [P_OV_REPLY] = receive_DataRequest,
- [P_CSUM_RS_REQUEST] = receive_DataRequest,
- [P_DELAY_PROBE] = receive_skip_silent,
+typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, enum drbd_packets cmd, unsigned int to_receive);
+
+struct data_cmd { /* one entry of the drbd_cmd_handler[] dispatch table, indexed by packet command */
+ int expect_payload; /* non-zero: bytes beyond the sub-header are accepted; otherwise drbdd() rejects them */
+ size_t pkt_size; /* size of the packet struct; drbdd() subtracts sizeof(union p_header) to get the sub-header size */
+ drbd_cmd_handler_f function; /* handler for this command; drbdd() treats NULL as an unknown packet type */
+};
+
+static struct data_cmd drbd_cmd_handler[] = {
+ [P_DATA] = { 1, sizeof(struct p_data), receive_Data },
+ [P_DATA_REPLY] = { 1, sizeof(struct p_data), receive_DataReply },
+ [P_RS_DATA_REPLY] = { 1, sizeof(struct p_data), receive_RSDataReply } ,
+ [P_BARRIER] = { 0, sizeof(struct p_barrier), receive_Barrier } ,
+ [P_BITMAP] = { 1, sizeof(struct p_header80), receive_bitmap } ,
+ [P_COMPRESSED_BITMAP] = { 1, sizeof(struct p_header80), receive_bitmap } ,
+ [P_UNPLUG_REMOTE] = { 0, sizeof(struct p_header80), receive_UnplugRemote },
+ [P_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
+ [P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
+ [P_SYNC_PARAM] = { 1, sizeof(struct p_header80), receive_SyncParam },
+ [P_SYNC_PARAM89] = { 1, sizeof(struct p_header80), receive_SyncParam },
+ [P_PROTOCOL] = { 1, sizeof(struct p_protocol), receive_protocol },
+ [P_UUIDS] = { 0, sizeof(struct p_uuids), receive_uuids },
+ [P_SIZES] = { 0, sizeof(struct p_sizes), receive_sizes },
+ [P_STATE] = { 0, sizeof(struct p_state), receive_state },
+ [P_STATE_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_state },
+ [P_SYNC_UUID] = { 0, sizeof(struct p_rs_uuid), receive_sync_uuid },
+ [P_OV_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
+ [P_OV_REPLY] = { 1, sizeof(struct p_block_req), receive_DataRequest },
+ [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest },
+ [P_DELAY_PROBE] = { 0, sizeof(struct p_delay_probe93), receive_skip },
/* anything missing from this table is in
* the asender_tbl, see get_asender_cmd */
- [P_MAX_CMD] = NULL,
+ [P_MAX_CMD] = { 0, 0, NULL },
};
-static drbd_cmd_handler_f *drbd_cmd_handler = drbd_default_handler;
-static drbd_cmd_handler_f *drbd_opt_cmd_handler;
+/* All handler functions that expect a sub-header get that sub-header in
+ mdev->data.rbuf.header.head.payload.
+
+ Usually the callback can find the usual p_header in
+ mdev->data.rbuf.header.head, but it must not rely on that, since there
+ is also p_header95!
+ */
static void drbdd(struct drbd_conf *mdev)
{
- drbd_cmd_handler_f handler;
- struct p_header *header = &mdev->data.rbuf.header;
+ union p_header *header = &mdev->data.rbuf.header;
+ unsigned int packet_size;
+ enum drbd_packets cmd;
+ size_t shs; /* sub header size */
+ int rv;
while (get_t_state(&mdev->receiver) == Running) {
drbd_thread_current_set_cpu(mdev);
- if (!drbd_recv_header(mdev, header)) {
- drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
- break;
- }
+ if (!drbd_recv_header(mdev, &cmd, &packet_size))
+ goto err_out;
- if (header->command < P_MAX_CMD)
- handler = drbd_cmd_handler[header->command];
- else if (P_MAY_IGNORE < header->command
- && header->command < P_MAX_OPT_CMD)
- handler = drbd_opt_cmd_handler[header->command-P_MAY_IGNORE];
- else if (header->command > P_MAX_OPT_CMD)
- handler = receive_skip;
- else
- handler = NULL;
+ if (unlikely(cmd >= P_MAX_CMD || !drbd_cmd_handler[cmd].function)) {
+ dev_err(DEV, "unknown packet type %d, l: %d!\n", cmd, packet_size);
+ goto err_out;
+ }
- if (unlikely(!handler)) {
- dev_err(DEV, "unknown packet type %d, l: %d!\n",
- header->command, header->length);
- drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
- break;
+ shs = drbd_cmd_handler[cmd].pkt_size - sizeof(union p_header);
+ rv = drbd_recv(mdev, &header->h80.payload, shs);
+ if (unlikely(rv != shs)) {
+ dev_err(DEV, "short read while reading sub header: rv=%d\n", rv);
+ goto err_out;
}
- if (unlikely(!handler(mdev, header))) {
- dev_err(DEV, "error receiving %s, l: %d!\n",
- cmdname(header->command), header->length);
- drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
- break;
+
+ if (packet_size - shs > 0 && !drbd_cmd_handler[cmd].expect_payload) {
+ dev_err(DEV, "No payload expected %s l:%d\n", cmdname(cmd), packet_size);
+ goto err_out;
}
- }
-}
-static void drbd_fail_pending_reads(struct drbd_conf *mdev)
-{
- struct hlist_head *slot;
- struct hlist_node *pos;
- struct hlist_node *tmp;
- struct drbd_request *req;
- int i;
+ rv = drbd_cmd_handler[cmd].function(mdev, cmd, packet_size - shs);
- /*
- * Application READ requests
- */
- spin_lock_irq(&mdev->req_lock);
- for (i = 0; i < APP_R_HSIZE; i++) {
- slot = mdev->app_reads_hash+i;
- hlist_for_each_entry_safe(req, pos, tmp, slot, colision) {
- /* it may (but should not any longer!)
- * be on the work queue; if that assert triggers,
- * we need to also grab the
- * spin_lock_irq(&mdev->data.work.q_lock);
- * and list_del_init here. */
- D_ASSERT(list_empty(&req->w.list));
- /* It would be nice to complete outside of spinlock.
- * But this is easier for now. */
- _req_mod(req, connection_lost_while_pending);
+ if (unlikely(!rv)) {
+ dev_err(DEV, "error receiving %s, l: %d!\n",
+ cmdname(cmd), packet_size);
+ goto err_out;
}
}
- for (i = 0; i < APP_R_HSIZE; i++)
- if (!hlist_empty(mdev->app_reads_hash+i))
- dev_warn(DEV, "ASSERT FAILED: app_reads_hash[%d].first: "
- "%p, should be NULL\n", i, mdev->app_reads_hash[i].first);
- memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
- spin_unlock_irq(&mdev->req_lock);
+ if (0) {
+ err_out:
+ drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
+ }
+ /* If we leave here, we probably want to update at least the
+ * "Connected" indicator on stable storage. Do so explicitly here. */
+ drbd_md_sync(mdev);
}
void drbd_flush_workqueue(struct drbd_conf *mdev)
wait_for_completion(&barr.done);
}
+void drbd_free_tl_hash(struct drbd_conf *mdev)
+{ /* Free the ee_hash and tl_hash tables of @mdev, entirely under req_lock.
+ * Does nothing unless tl_hash is allocated and the connection state
+ * is C_STANDALONE. */
+ struct hlist_head *h;
+
+ spin_lock_irq(&mdev->req_lock);
+
+ if (!mdev->tl_hash || mdev->state.conn != C_STANDALONE) { /* no table, or not StandAlone: leave everything as is */
+ spin_unlock_irq(&mdev->req_lock);
+ return;
+ }
+ /* paranoia code: every ee_hash bucket is expected to be empty by now;
+ * a non-empty bucket is only reported, the table is freed regardless */
+ for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++)
+ if (h->first)
+ dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n",
+ (int)(h - mdev->ee_hash), h->first);
+ kfree(mdev->ee_hash);
+ mdev->ee_hash = NULL; /* reset pointer and size so the table reads as "not allocated" */
+ mdev->ee_hash_s = 0;
+
+ /* paranoia code: same emptiness check for the tl_hash buckets */
+ for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++)
+ if (h->first)
+ dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n",
+ (int)(h - mdev->tl_hash), h->first);
+ kfree(mdev->tl_hash);
+ mdev->tl_hash = NULL; /* a NULL tl_hash makes a later call return early (see check above) */
+ mdev->tl_hash_s = 0;
+ spin_unlock_irq(&mdev->req_lock);
+}
+
static void drbd_disconnect(struct drbd_conf *mdev)
{
enum drbd_fencing_p fp;
drbd_thread_stop(&mdev->asender);
drbd_free_sock(mdev);
+ /* wait for current activity to cease. */
spin_lock_irq(&mdev->req_lock);
_drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
_drbd_wait_ee_list_empty(mdev, &mdev->sync_ee);
/* make sure syncer is stopped and w_resume_next_sg queued */
del_timer_sync(&mdev->resync_timer);
- set_bit(STOP_SYNC_TIMER, &mdev->flags);
resync_timer_fn((unsigned long)mdev);
/* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier,
kfree(mdev->p_uuid);
mdev->p_uuid = NULL;
- if (!mdev->state.susp)
+ if (!is_susp(mdev->state))
tl_clear(mdev);
- drbd_fail_pending_reads(mdev);
-
dev_info(DEV, "Connection closed\n");
drbd_md_sync(mdev);
put_ldev(mdev);
}
- if (mdev->state.role == R_PRIMARY) {
- if (fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN) {
- enum drbd_disk_state nps = drbd_try_outdate_peer(mdev);
- drbd_request_state(mdev, NS(pdsk, nps));
- }
- }
+ if (mdev->state.role == R_PRIMARY && fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN)
+ drbd_try_outdate_peer_async(mdev);
spin_lock_irq(&mdev->req_lock);
os = mdev->state;
spin_unlock_irq(&mdev->req_lock);
if (os.conn == C_DISCONNECTING) {
- struct hlist_head *h;
- wait_event(mdev->misc_wait, atomic_read(&mdev->net_cnt) == 0);
+ wait_event(mdev->net_cnt_wait, atomic_read(&mdev->net_cnt) == 0);
- /* we must not free the tl_hash
- * while application io is still on the fly */
- wait_event(mdev->misc_wait, atomic_read(&mdev->ap_bio_cnt) == 0);
-
- spin_lock_irq(&mdev->req_lock);
- /* paranoia code */
- for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++)
- if (h->first)
- dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n",
- (int)(h - mdev->ee_hash), h->first);
- kfree(mdev->ee_hash);
- mdev->ee_hash = NULL;
- mdev->ee_hash_s = 0;
-
- /* paranoia code */
- for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++)
- if (h->first)
- dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n",
- (int)(h - mdev->tl_hash), h->first);
- kfree(mdev->tl_hash);
- mdev->tl_hash = NULL;
- mdev->tl_hash_s = 0;
- spin_unlock_irq(&mdev->req_lock);
+ if (!is_susp(mdev->state)) {
+ /* we must not free the tl_hash
+ * while application io is still on the fly */
+ wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
+ drbd_free_tl_hash(mdev);
+ }
crypto_free_hash(mdev->cram_hmac_tfm);
mdev->cram_hmac_tfm = NULL;
i = drbd_release_ee(mdev, &mdev->net_ee);
if (i)
dev_info(DEV, "net_ee not empty, killed %u entries\n", i);
+ i = atomic_read(&mdev->pp_in_use_by_net);
+ if (i)
+ dev_info(DEV, "pp_in_use_by_net = %d, expected 0\n", i);
i = atomic_read(&mdev->pp_in_use);
if (i)
dev_info(DEV, "pp_in_use = %d, expected 0\n", i);
p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
ok = _drbd_send_cmd( mdev, mdev->data.socket, P_HAND_SHAKE,
- (struct p_header *)p, sizeof(*p), 0 );
+ (struct p_header80 *)p, sizeof(*p), 0 );
mutex_unlock(&mdev->data.mutex);
return ok;
}
{
/* ASSERT current == mdev->receiver ... */
struct p_handshake *p = &mdev->data.rbuf.handshake;
- const int expect = sizeof(struct p_handshake)
- -sizeof(struct p_header);
+ const int expect = sizeof(struct p_handshake) - sizeof(struct p_header80);
+ unsigned int length;
+ enum drbd_packets cmd;
int rv;
rv = drbd_send_handshake(mdev);
if (!rv)
return 0;
- rv = drbd_recv_header(mdev, &p->head);
+ rv = drbd_recv_header(mdev, &cmd, &length);
if (!rv)
return 0;
- if (p->head.command != P_HAND_SHAKE) {
+ if (cmd != P_HAND_SHAKE) {
dev_err(DEV, "expected HandShake packet, received: %s (0x%04x)\n",
- cmdname(p->head.command), p->head.command);
+ cmdname(cmd), cmd);
return -1;
}
- if (p->head.length != expect) {
+ if (length != expect) {
dev_err(DEV, "expected HandShake length: %u, received: %u\n",
- expect, p->head.length);
+ expect, length);
return -1;
}
char *response = NULL;
char *right_response = NULL;
char *peers_ch = NULL;
- struct p_header p;
unsigned int key_len = strlen(mdev->net_conf->shared_secret);
unsigned int resp_size;
struct hash_desc desc;
+ enum drbd_packets cmd;
+ unsigned int length;
int rv;
desc.tfm = mdev->cram_hmac_tfm;
if (!rv)
goto fail;
- rv = drbd_recv_header(mdev, &p);
+ rv = drbd_recv_header(mdev, &cmd, &length);
if (!rv)
goto fail;
- if (p.command != P_AUTH_CHALLENGE) {
+ if (cmd != P_AUTH_CHALLENGE) {
dev_err(DEV, "expected AuthChallenge packet, received: %s (0x%04x)\n",
- cmdname(p.command), p.command);
+ cmdname(cmd), cmd);
rv = 0;
goto fail;
}
- if (p.length > CHALLENGE_LEN*2) {
+ if (length > CHALLENGE_LEN * 2) {
dev_err(DEV, "expected AuthChallenge payload too big.\n");
rv = -1;
goto fail;
}
- peers_ch = kmalloc(p.length, GFP_NOIO);
+ peers_ch = kmalloc(length, GFP_NOIO);
if (peers_ch == NULL) {
dev_err(DEV, "kmalloc of peers_ch failed\n");
rv = -1;
goto fail;
}
- rv = drbd_recv(mdev, peers_ch, p.length);
+ rv = drbd_recv(mdev, peers_ch, length);
- if (rv != p.length) {
+ if (rv != length) {
dev_err(DEV, "short read AuthChallenge: l=%u\n", rv);
rv = 0;
goto fail;
}
sg_init_table(&sg, 1);
- sg_set_buf(&sg, peers_ch, p.length);
+ sg_set_buf(&sg, peers_ch, length);
rv = crypto_hash_digest(&desc, &sg, sg.length, response);
if (rv) {
if (!rv)
goto fail;
- rv = drbd_recv_header(mdev, &p);
+ rv = drbd_recv_header(mdev, &cmd, &length);
if (!rv)
goto fail;
- if (p.command != P_AUTH_RESPONSE) {
+ if (cmd != P_AUTH_RESPONSE) {
dev_err(DEV, "expected AuthResponse packet, received: %s (0x%04x)\n",
- cmdname(p.command), p.command);
+ cmdname(cmd), cmd);
rv = 0;
goto fail;
}
- if (p.length != resp_size) {
+ if (length != resp_size) {
dev_err(DEV, "expected AuthResponse payload of wrong size\n");
rv = 0;
goto fail;
/* ********* acknowledge sender ******** */
-static int got_RqSReply(struct drbd_conf *mdev, struct p_header *h)
+static int got_RqSReply(struct drbd_conf *mdev, struct p_header80 *h)
{
struct p_req_state_reply *p = (struct p_req_state_reply *)h;
return TRUE;
}
-static int got_Ping(struct drbd_conf *mdev, struct p_header *h)
+static int got_Ping(struct drbd_conf *mdev, struct p_header80 *h)
{
return drbd_send_ping_ack(mdev);
}
-static int got_PingAck(struct drbd_conf *mdev, struct p_header *h)
+static int got_PingAck(struct drbd_conf *mdev, struct p_header80 *h)
{
/* restore idle timeout */
mdev->meta.socket->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;
return TRUE;
}
-static int got_IsInSync(struct drbd_conf *mdev, struct p_header *h)
+static int got_IsInSync(struct drbd_conf *mdev, struct p_header80 *h)
{
struct p_block_ack *p = (struct p_block_ack *)h;
sector_t sector = be64_to_cpu(p->sector);
update_peer_seq(mdev, be32_to_cpu(p->seq_num));
- drbd_rs_complete_io(mdev, sector);
- drbd_set_in_sync(mdev, sector, blksize);
- /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
- mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
+ if (get_ldev(mdev)) {
+ drbd_rs_complete_io(mdev, sector);
+ drbd_set_in_sync(mdev, sector, blksize);
+ /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
+ mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
+ put_ldev(mdev);
+ }
dec_rs_pending(mdev);
+ atomic_add(blksize >> 9, &mdev->rs_sect_in);
return TRUE;
}
return TRUE;
}
-static int got_BlockAck(struct drbd_conf *mdev, struct p_header *h)
+static int got_BlockAck(struct drbd_conf *mdev, struct p_header80 *h)
{
struct p_block_ack *p = (struct p_block_ack *)h;
sector_t sector = be64_to_cpu(p->sector);
_ack_id_to_req, __func__ , what);
}
-static int got_NegAck(struct drbd_conf *mdev, struct p_header *h)
+static int got_NegAck(struct drbd_conf *mdev, struct p_header80 *h)
{
struct p_block_ack *p = (struct p_block_ack *)h;
sector_t sector = be64_to_cpu(p->sector);
_ack_id_to_req, __func__ , neg_acked);
}
-static int got_NegDReply(struct drbd_conf *mdev, struct p_header *h)
+static int got_NegDReply(struct drbd_conf *mdev, struct p_header80 *h)
{
struct p_block_ack *p = (struct p_block_ack *)h;
sector_t sector = be64_to_cpu(p->sector);
_ar_id_to_req, __func__ , neg_acked);
}
-static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header *h)
+static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header80 *h)
{
sector_t sector;
int size;
return TRUE;
}
-static int got_BarrierAck(struct drbd_conf *mdev, struct p_header *h)
+static int got_BarrierAck(struct drbd_conf *mdev, struct p_header80 *h)
{
struct p_barrier_ack *p = (struct p_barrier_ack *)h;
return TRUE;
}
-static int got_OVResult(struct drbd_conf *mdev, struct p_header *h)
+static int got_OVResult(struct drbd_conf *mdev, struct p_header80 *h)
{
struct p_block_ack *p = (struct p_block_ack *)h;
struct drbd_work *w;
else
ov_oos_print(mdev);
+ if (!get_ldev(mdev))
+ return TRUE;
+
drbd_rs_complete_io(mdev, sector);
dec_rs_pending(mdev);
drbd_resync_finished(mdev);
}
}
+ put_ldev(mdev);
return TRUE;
}
-static int got_something_to_ignore_m(struct drbd_conf *mdev, struct p_header *h)
+static int got_skip(struct drbd_conf *mdev, struct p_header80 *h)
{
- /* IGNORE */
return TRUE;
}
struct asender_cmd {
size_t pkt_size;
- int (*process)(struct drbd_conf *mdev, struct p_header *h);
+ int (*process)(struct drbd_conf *mdev, struct p_header80 *h);
};
static struct asender_cmd *get_asender_cmd(int cmd)
/* anything missing from this table is in
* the drbd_cmd_handler (drbd_default_handler) table,
* see the beginning of drbdd() */
- [P_PING] = { sizeof(struct p_header), got_Ping },
- [P_PING_ACK] = { sizeof(struct p_header), got_PingAck },
+ [P_PING] = { sizeof(struct p_header80), got_Ping },
+ [P_PING_ACK] = { sizeof(struct p_header80), got_PingAck },
[P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
[P_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
[P_RS_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
[P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck },
[P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
[P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync },
- [P_DELAY_PROBE] = { sizeof(struct p_delay_probe), got_something_to_ignore_m },
+ [P_DELAY_PROBE] = { sizeof(struct p_delay_probe93), got_skip },
[P_MAX_CMD] = { 0, NULL },
};
if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL)
int drbd_asender(struct drbd_thread *thi)
{
struct drbd_conf *mdev = thi->mdev;
- struct p_header *h = &mdev->meta.rbuf.header;
+ struct p_header80 *h = &mdev->meta.rbuf.header.h80;
struct asender_cmd *cmd = NULL;
int rv, len;
void *buf = h;
int received = 0;
- int expect = sizeof(struct p_header);
+ int expect = sizeof(struct p_header80);
int empty;
sprintf(current->comm, "drbd%d_asender", mdev_to_minor(mdev));
while (1) {
clear_bit(SIGNAL_ASENDER, &mdev->flags);
flush_signals(current);
- if (!drbd_process_done_ee(mdev)) {
- dev_err(DEV, "process_done_ee() = NOT_OK\n");
+ if (!drbd_process_done_ee(mdev))
goto reconnect;
- }
/* to avoid race with newly queued ACKs */
set_bit(SIGNAL_ASENDER, &mdev->flags);
spin_lock_irq(&mdev->req_lock);
if (received == expect && cmd == NULL) {
if (unlikely(h->magic != BE_DRBD_MAGIC)) {
- dev_err(DEV, "magic?? on meta m: 0x%lx c: %d l: %d\n",
- (long)be32_to_cpu(h->magic),
- h->command, h->length);
+ dev_err(DEV, "magic?? on meta m: 0x%08x c: %d l: %d\n",
+ be32_to_cpu(h->magic),
+ be16_to_cpu(h->command),
+ be16_to_cpu(h->length));
goto reconnect;
}
cmd = get_asender_cmd(be16_to_cpu(h->command));
len = be16_to_cpu(h->length);
if (unlikely(cmd == NULL)) {
- dev_err(DEV, "unknown command?? on meta m: 0x%lx c: %d l: %d\n",
- (long)be32_to_cpu(h->magic),
- h->command, h->length);
+ dev_err(DEV, "unknown command?? on meta m: 0x%08x c: %d l: %d\n",
+ be32_to_cpu(h->magic),
+ be16_to_cpu(h->command),
+ be16_to_cpu(h->length));
goto disconnect;
}
expect = cmd->pkt_size;
- ERR_IF(len != expect-sizeof(struct p_header))
+ ERR_IF(len != expect-sizeof(struct p_header80))
goto reconnect;
}
if (received == expect) {
buf = h;
received = 0;
- expect = sizeof(struct p_header);
+ expect = sizeof(struct p_header80);
cmd = NULL;
}
}
if (0) {
reconnect:
drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
+ drbd_md_sync(mdev);
}
if (0) {
disconnect:
drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+ drbd_md_sync(mdev);
}
clear_bit(SIGNAL_ASENDER, &mdev->flags);
#include <linux/compat.h>
#include <linux/suspend.h>
#include <linux/freezer.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h> /* for invalidate_bdev() */
#include <linux/completion.h>
#include <linux/highmem.h>
#include <linux/kthread.h>
#include <linux/splice.h>
+#include <linux/sysfs.h>
#include <asm/uaccess.h>
+static DEFINE_MUTEX(loop_mutex);
static LIST_HEAD(loop_devices);
static DEFINE_MUTEX(loop_devices_mutex);
pos = ((loff_t) bio->bi_sector << 9) + lo->lo_offset;
if (bio_rw(bio) == WRITE) {
- bool barrier = !!(bio->bi_rw & REQ_HARDBARRIER);
struct file *file = lo->lo_backing_file;
- if (barrier) {
- if (unlikely(!file->f_op->fsync)) {
- ret = -EOPNOTSUPP;
- goto out;
- }
+ /* REQ_HARDBARRIER is deprecated */
+ if (bio->bi_rw & REQ_HARDBARRIER) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+ if (bio->bi_rw & REQ_FLUSH) {
ret = vfs_fsync(file, 0);
- if (unlikely(ret)) {
+ if (unlikely(ret && ret != -EINVAL)) {
ret = -EIO;
goto out;
}
ret = lo_send(lo, bio, pos);
- if (barrier && !ret) {
+ if ((bio->bi_rw & REQ_FUA) && !ret) {
ret = vfs_fsync(file, 0);
- if (unlikely(ret))
+ if (unlikely(ret && ret != -EINVAL))
ret = -EIO;
}
} else
return i && S_ISBLK(i->i_mode) && MAJOR(i->i_rdev) == LOOP_MAJOR;
}
+/* loop sysfs attributes */
+
+static ssize_t loop_attr_show(struct device *dev, char *page,
+ ssize_t (*callback)(struct loop_device *, char *))
+{ /* Common show helper: map @dev back to its loop_device and let
+ * @callback format the attribute into @page.
+ * Returns the callback's result, or -EIO if no loop device matches @dev. */
+ struct loop_device *l, *lo = NULL;
+
+ mutex_lock(&loop_devices_mutex); /* the loop_devices list is walked under its mutex */
+ list_for_each_entry(l, &loop_devices, lo_list)
+ if (disk_to_dev(l->lo_disk) == dev) { /* match on the gendisk's embedded struct device */
+ lo = l;
+ break;
+ }
+ mutex_unlock(&loop_devices_mutex); /* the callback below runs after the list mutex is dropped */
+
+ return lo ? callback(lo, page) : -EIO;
+}
+
+#define LOOP_ATTR_RO(_name) /* declares loop_attr_<name>_show(), wraps it via loop_attr_show(), and defines the S_IRUGO device_attribute loop_attr_<name> */ \
+static ssize_t loop_attr_##_name##_show(struct loop_device *, char *); \
+static ssize_t loop_attr_do_show_##_name(struct device *d, \
+ struct device_attribute *attr, char *b) \
+{ \
+ return loop_attr_show(d, b, loop_attr_##_name##_show); \
+} \
+static struct device_attribute loop_attr_##_name = \
+ __ATTR(_name, S_IRUGO, loop_attr_do_show_##_name, NULL);
+
+static ssize_t loop_attr_backing_file_show(struct loop_device *lo, char *buf)
+{ /* Write the path of @lo's backing file, followed by '\n', into @buf.
+ * Returns the number of bytes written (newline included), the
+ * PTR_ERR() of a failed d_path(), or PTR_ERR(NULL) -- i.e. 0 -- when
+ * no backing file is set. */
+ ssize_t ret;
+ char *p = NULL;
+
+ mutex_lock(&lo->lo_ctl_mutex); /* lo_backing_file is read under lo_ctl_mutex */
+ if (lo->lo_backing_file)
+ p = d_path(&lo->lo_backing_file->f_path, buf, PAGE_SIZE - 1); /* one byte of the (presumably PAGE_SIZE) buffer is held back for the '\n' added below */
+ mutex_unlock(&lo->lo_ctl_mutex);
+
+ if (IS_ERR_OR_NULL(p))
+ ret = PTR_ERR(p); /* p stays NULL when there is no backing file */
+ else {
+ ret = strlen(p);
+ memmove(buf, p, ret); /* d_path() was handed buf, so p may overlap it: memmove, not memcpy */
+ buf[ret++] = '\n';
+ buf[ret] = 0;
+ }
+
+ return ret;
+}
+
+static ssize_t loop_attr_offset_show(struct loop_device *lo, char *buf)
+{ /* print lo->lo_offset as an unsigned decimal plus newline; returns sprintf()'s byte count */
+ return sprintf(buf, "%llu\n", (unsigned long long)lo->lo_offset);
+}
+
+static ssize_t loop_attr_sizelimit_show(struct loop_device *lo, char *buf)
+{ /* print lo->lo_sizelimit as an unsigned decimal plus newline; returns sprintf()'s byte count */
+ return sprintf(buf, "%llu\n", (unsigned long long)lo->lo_sizelimit);
+}
+
+static ssize_t loop_attr_autoclear_show(struct loop_device *lo, char *buf)
+{ /* print "1" if LO_FLAGS_AUTOCLEAR is set in lo->lo_flags, otherwise "0", plus newline */
+ int autoclear = (lo->lo_flags & LO_FLAGS_AUTOCLEAR);
+
+ return sprintf(buf, "%s\n", autoclear ? "1" : "0");
+}
+
+LOOP_ATTR_RO(backing_file);
+LOOP_ATTR_RO(offset);
+LOOP_ATTR_RO(sizelimit);
+LOOP_ATTR_RO(autoclear);
+
+static struct attribute *loop_attrs[] = { /* NULL-terminated list of the attributes generated by LOOP_ATTR_RO() */
+ &loop_attr_backing_file.attr,
+ &loop_attr_offset.attr,
+ &loop_attr_sizelimit.attr,
+ &loop_attr_autoclear.attr,
+ NULL,
+};
+
+static struct attribute_group loop_attribute_group = { /* groups loop_attrs under the name "loop"; used by loop_sysfs_init()/loop_sysfs_exit() */
+ .name = "loop",
+ .attrs= loop_attrs,
+};
+
+static int loop_sysfs_init(struct loop_device *lo)
+{ /* create loop_attribute_group on the kobject of @lo's disk device; returns sysfs_create_group()'s result */
+ return sysfs_create_group(&disk_to_dev(lo->lo_disk)->kobj,
+ &loop_attribute_group);
+}
+
+static void loop_sysfs_exit(struct loop_device *lo)
+{ /* remove loop_attribute_group from the kobject of @lo's disk device (counterpart of loop_sysfs_init()) */
+ sysfs_remove_group(&disk_to_dev(lo->lo_disk)->kobj,
+ &loop_attribute_group);
+}
+
static int loop_set_fd(struct loop_device *lo, fmode_t mode,
struct block_device *bdev, unsigned int arg)
{
lo->lo_queue->unplug_fn = loop_unplug;
if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
- blk_queue_ordered(lo->lo_queue, QUEUE_ORDERED_DRAIN);
+ blk_queue_flush(lo->lo_queue, REQ_FLUSH);
set_capacity(lo->lo_disk, size);
bd_set_size(bdev, size << 9);
+ loop_sysfs_init(lo);
/* let user-space know about the new size */
kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
return 0;
out_clr:
+ loop_sysfs_exit(lo);
lo->lo_thread = NULL;
lo->lo_device = NULL;
lo->lo_backing_file = NULL;
set_capacity(lo->lo_disk, 0);
if (bdev) {
bd_set_size(bdev, 0);
+ loop_sysfs_exit(lo);
/* let user-space know about this change */
kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
}
{
struct loop_device *lo = bdev->bd_disk->private_data;
- lock_kernel();
+ mutex_lock(&loop_mutex);
mutex_lock(&lo->lo_ctl_mutex);
lo->lo_refcnt++;
mutex_unlock(&lo->lo_ctl_mutex);
- unlock_kernel();
+ mutex_unlock(&loop_mutex);
return 0;
}
struct loop_device *lo = disk->private_data;
int err;
- lock_kernel();
+ mutex_lock(&loop_mutex);
mutex_lock(&lo->lo_ctl_mutex);
if (--lo->lo_refcnt)
out:
mutex_unlock(&lo->lo_ctl_mutex);
out_unlocked:
- lock_kernel();
+ mutex_unlock(&loop_mutex);
return 0;
}
#include <linux/seq_file.h>
#include <linux/miscdevice.h>
#include <linux/freezer.h>
-#include <linux/smp_lock.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <scsi/scsi_cmnd.h>
#define ZONE(sector, pd) (((sector) + (pd)->offset) & ~((pd)->settings.size - 1))
+static DEFINE_MUTEX(pktcdvd_mutex);
static struct pktcdvd_device *pkt_devs[MAX_WRITERS];
static struct proc_dir_entry *pkt_proc;
static int pktdev_major;
rq->timeout = 60*HZ;
rq->cmd_type = REQ_TYPE_BLOCK_PC;
- rq->cmd_flags |= REQ_HARDBARRIER;
if (cgc->quiet)
rq->cmd_flags |= REQ_QUIET;
VPRINTK(DRIVER_NAME": entering open\n");
- lock_kernel();
+ mutex_lock(&pktcdvd_mutex);
mutex_lock(&ctl_mutex);
pd = pkt_find_dev_from_minor(MINOR(bdev->bd_dev));
if (!pd) {
}
mutex_unlock(&ctl_mutex);
- unlock_kernel();
+ mutex_unlock(&pktcdvd_mutex);
return 0;
out_dec:
out:
VPRINTK(DRIVER_NAME": failed open (%d)\n", ret);
mutex_unlock(&ctl_mutex);
- unlock_kernel();
+ mutex_unlock(&pktcdvd_mutex);
return ret;
}
struct pktcdvd_device *pd = disk->private_data;
int ret = 0;
- lock_kernel();
+ mutex_lock(&pktcdvd_mutex);
mutex_lock(&ctl_mutex);
pd->refcnt--;
BUG_ON(pd->refcnt < 0);
pkt_release_dev(pd, flush);
}
mutex_unlock(&ctl_mutex);
- unlock_kernel();
+ mutex_unlock(&pktcdvd_mutex);
return ret;
}
VPRINTK("pkt_ioctl: cmd %x, dev %d:%d\n", cmd,
MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
- lock_kernel();
+ mutex_lock(&pktcdvd_mutex);
switch (cmd) {
case CDROMEJECT:
/*
VPRINTK(DRIVER_NAME": Unknown ioctl for %s (%x)\n", pd->name, cmd);
ret = -ENOTTY;
}
- unlock_kernel();
+ mutex_unlock(&pktcdvd_mutex);
return ret;
}
.compat_ioctl = pkt_ctl_compat_ioctl,
#endif
.owner = THIS_MODULE,
+ .llseek = no_llseek,
};
static struct miscdevice pkt_misc = {
memcpy(buf, dev->bounce_buf+offset, size);
offset += size;
flush_kernel_dcache_page(bvec->bv_page);
- bvec_kunmap_irq(bvec, &flags);
+ bvec_kunmap_irq(buf, &flags);
i++;
}
}
blk_queue_dma_alignment(queue, dev->blk_size-1);
blk_queue_logical_block_size(queue, dev->blk_size);
- blk_queue_ordered(queue, QUEUE_ORDERED_DRAIN_FLUSH);
+ blk_queue_flush(queue, REQ_FLUSH);
blk_queue_max_segments(queue, -1);
blk_queue_max_segment_size(queue, dev->bounce_size);
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
-#include <linux/smp_lock.h>
#include <linux/hdreg.h>
#include <linux/virtio.h>
#include <linux/virtio_blk.h>
}
}
- if (vbr->req->cmd_flags & REQ_HARDBARRIER)
- vbr->out_hdr.type |= VIRTIO_BLK_T_BARRIER;
-
sg_set_buf(&vblk->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr));
/*
return err;
}
-static int virtblk_locked_ioctl(struct block_device *bdev, fmode_t mode,
- unsigned cmd, unsigned long data)
+static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long data)
{
struct gendisk *disk = bdev->bd_disk;
struct virtio_blk *vblk = disk->private_data;
(void __user *)data);
}
-static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
- unsigned int cmd, unsigned long param)
-{
- int ret;
-
- lock_kernel();
- ret = virtblk_locked_ioctl(bdev, mode, cmd, param);
- unlock_kernel();
-
- return ret;
-}
-
/* We provide getgeo only to please some old bootloader/partitioning tools */
static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo)
{
vblk->disk->driverfs_dev = &vdev->dev;
index++;
- if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH)) {
- /*
- * If the FLUSH feature is supported we do have support for
- * flushing a volatile write cache on the host. Use that
- * to implement write barrier support.
- */
- blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH);
- } else if (virtio_has_feature(vdev, VIRTIO_BLK_F_BARRIER)) {
- /*
- * If the BARRIER feature is supported the host expects us
- * to order request by tags. This implies there is not
- * volatile write cache on the host, and that the host
- * never re-orders outstanding I/O. This feature is not
- * useful for real life scenarious and deprecated.
- */
- blk_queue_ordered(q, QUEUE_ORDERED_TAG);
- } else {
- /*
- * If the FLUSH feature is not supported we must assume that
- * the host does not perform any kind of volatile write
- * caching. We still need to drain the queue to provider
- * proper barrier semantics.
- */
- blk_queue_ordered(q, QUEUE_ORDERED_DRAIN);
- }
+ /* configure queue flush support */
+ if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH))
+ blk_queue_flush(q, REQ_FLUSH);
/* If disk is read-only in the host, the guest should obey */
if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO))
};
static unsigned int features[] = {
- VIRTIO_BLK_F_BARRIER, VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX,
- VIRTIO_BLK_F_GEOMETRY, VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE,
- VIRTIO_BLK_F_SCSI, VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY
+ VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
+ VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, VIRTIO_BLK_F_SCSI,
+ VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY
};
/*
#include <linux/cdrom.h>
#include <linux/module.h>
#include <linux/slab.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <linux/scatterlist.h>
#include <xen/xen.h>
unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST];
};
+static DEFINE_MUTEX(blkfront_mutex);
static const struct block_device_operations xlvbd_block_fops;
#define BLK_RING_SIZE __RING_SIZE((struct blkif_sring *)0, PAGE_SIZE)
struct gnttab_free_callback callback;
struct blk_shadow shadow[BLK_RING_SIZE];
unsigned long shadow_free;
- int feature_barrier;
+ unsigned int feature_flush;
int is_ready;
};
}
- static int xlvbd_barrier(struct blkfront_info *info)
+ static void xlvbd_flush(struct blkfront_info *info)
{
- int err;
- const char *barrier;
-
- switch (info->feature_barrier) {
- case QUEUE_ORDERED_DRAIN: barrier = "enabled (drain)"; break;
- case QUEUE_ORDERED_TAG: barrier = "enabled (tag)"; break;
- case QUEUE_ORDERED_NONE: barrier = "disabled"; break;
- default: return -EINVAL;
- }
-
- err = blk_queue_ordered(info->rq, info->feature_barrier);
-
- if (err)
- return err;
-
+ blk_queue_flush(info->rq, info->feature_flush);
printk(KERN_INFO "blkfront: %s: barriers %s\n",
- info->gd->disk_name, barrier);
- return 0;
+ info->gd->disk_name,
+ info->feature_flush ? "enabled" : "disabled");
}
info->rq = gd->queue;
info->gd = gd;
- xlvbd_barrier(info);
+ xlvbd_flush(info);
if (vdisk_info & VDISK_READONLY)
set_disk_ro(gd, 1);
printk(KERN_WARNING "blkfront: %s: write barrier op failed\n",
info->gd->disk_name);
error = -EOPNOTSUPP;
- info->feature_barrier = QUEUE_ORDERED_NONE;
- xlvbd_barrier(info);
+ info->feature_flush = 0;
+ xlvbd_flush(info);
}
/* fall through */
case BLKIF_OP_READ:
/*
* If there's no "feature-barrier" defined, then it means
* we're dealing with a very old backend which writes
- * synchronously; draining will do what needs to get done.
+ * synchronously; nothing to do.
*
- * If there are barriers, then we can do full queued writes
- * with tagged barriers.
- *
- * If barriers are not supported, then there's no much we can
- * do, so just set ordering to NONE.
+ * If there are barriers, then we use flush.
*/
- if (err)
- info->feature_barrier = QUEUE_ORDERED_DRAIN;
- else if (barrier)
- info->feature_barrier = QUEUE_ORDERED_TAG;
- else
- info->feature_barrier = QUEUE_ORDERED_NONE;
+ info->feature_flush = 0;
+
+ /*
+ * The driver doesn't properly handle empty flushes, so
+ * let's disable barrier support for now.
+ */
+ #if 0
+ if (!err && barrier)
+ info->feature_flush = REQ_FLUSH;
+ #endif
err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size);
if (err) {
struct blkfront_info *info;
int err = 0;
- lock_kernel();
+ mutex_lock(&blkfront_mutex);
info = disk->private_data;
if (!info) {
mutex_unlock(&info->mutex);
out:
- unlock_kernel();
+ mutex_unlock(&blkfront_mutex);
return err;
}
struct block_device *bdev;
struct xenbus_device *xbdev;
- lock_kernel();
+ mutex_lock(&blkfront_mutex);
bdev = bdget_disk(disk, 0);
bdput(bdev);
}
out:
- unlock_kernel();
+ mutex_unlock(&blkfront_mutex);
return 0;
}
return 0;
}
-#define min_not_zero(l, r) (((l) == 0) ? (r) : (((r) == 0) ? (l) : min(l, r)))
-
/*
* Return a minimum chunk size of all snapshots that have the specified origin.
* Return zero if the origin has no snapshots.
chunk_t chunk;
struct dm_snap_pending_exception *pe = NULL;
- if (unlikely(bio_empty_barrier(bio))) {
+ if (bio->bi_rw & REQ_FLUSH) {
bio->bi_bdev = s->cow->bdev;
return DM_MAPIO_REMAPPED;
}
int r = DM_MAPIO_REMAPPED;
chunk_t chunk;
- if (unlikely(bio_empty_barrier(bio))) {
+ if (bio->bi_rw & REQ_FLUSH) {
if (!map_context->target_request_nr)
bio->bi_bdev = s->origin->bdev;
else
struct dm_dev *dev = ti->private;
bio->bi_bdev = dev->bdev;
- if (unlikely(bio_empty_barrier(bio)))
+ if (bio->bi_rw & REQ_FLUSH)
return DM_MAPIO_REMAPPED;
/* Only tell snapshots if this is a write */
#include <linux/blkpg.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
-#include <linux/smp_lock.h>
#include <linux/mempool.h>
#include <linux/slab.h>
#include <linux/idr.h>
#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
#define DM_COOKIE_LENGTH 24
+static DEFINE_MUTEX(dm_mutex);
static const char *_name = DM_NAME;
static unsigned int major = 0;
#define DMF_FREEING 3
#define DMF_DELETING 4
#define DMF_NOFLUSH_SUSPENDING 5
- #define DMF_QUEUE_IO_TO_THREAD 6
/*
* Work processed by per-device workqueue.
spinlock_t deferred_lock;
/*
- * An error from the barrier request currently being processed.
- */
- int barrier_error;
-
- /*
- * Protect barrier_error from concurrent endio processing
- * in request-based dm.
- */
- spinlock_t barrier_error_lock;
-
- /*
- * Processing queue (flush/barriers)
+ * Processing queue (flush)
*/
struct workqueue_struct *wq;
- struct work_struct barrier_work;
-
- /* A pointer to the currently processing pre/post flush request */
- struct request *flush_request;
/*
* The current mapping.
/* sysfs handle */
struct kobject kobj;
- /* zero-length barrier that will be cloned and submitted to targets */
- struct bio barrier_bio;
+ /* zero-length flush that will be cloned and submitted to targets */
+ struct bio flush_bio;
};
/*
{
struct mapped_device *md;
- lock_kernel();
+ mutex_lock(&dm_mutex);
spin_lock(&_minor_lock);
md = bdev->bd_disk->private_data;
out:
spin_unlock(&_minor_lock);
- unlock_kernel();
+ mutex_unlock(&dm_mutex);
return md ? 0 : -ENXIO;
}
{
struct mapped_device *md = disk->private_data;
- lock_kernel();
+ mutex_lock(&dm_mutex);
atomic_dec(&md->open_count);
dm_put(md);
- unlock_kernel();
+ mutex_unlock(&dm_mutex);
return 0;
}
/*
* After this is decremented the bio must not be touched if it is
- * a barrier.
+ * a flush.
*/
dm_disk(md)->part0.in_flight[rw] = pending =
atomic_dec_return(&md->pending[rw]);
*/
static void queue_io(struct mapped_device *md, struct bio *bio)
{
- down_write(&md->io_lock);
+ unsigned long flags;
- spin_lock_irq(&md->deferred_lock);
+ spin_lock_irqsave(&md->deferred_lock, flags);
bio_list_add(&md->deferred, bio);
- spin_unlock_irq(&md->deferred_lock);
-
- if (!test_and_set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags))
- queue_work(md->wq, &md->work);
-
- up_write(&md->io_lock);
+ spin_unlock_irqrestore(&md->deferred_lock, flags);
+ queue_work(md->wq, &md->work);
}
/*
* Target requested pushing back the I/O.
*/
spin_lock_irqsave(&md->deferred_lock, flags);
- if (__noflush_suspending(md)) {
- if (!(io->bio->bi_rw & REQ_HARDBARRIER))
- bio_list_add_head(&md->deferred,
- io->bio);
- } else
+ if (__noflush_suspending(md))
+ bio_list_add_head(&md->deferred, io->bio);
+ else
/* noflush suspend was interrupted. */
io->error = -EIO;
spin_unlock_irqrestore(&md->deferred_lock, flags);
io_error = io->error;
bio = io->bio;
+ end_io_acct(io);
+ free_io(md, io);
+
+ if (io_error == DM_ENDIO_REQUEUE)
+ return;
- if (bio->bi_rw & REQ_HARDBARRIER) {
+ if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) {
/*
- * There can be just one barrier request so we use
- * a per-device variable for error reporting.
- * Note that you can't touch the bio after end_io_acct
- *
- * We ignore -EOPNOTSUPP for empty flush reported by
- * underlying devices. We assume that if the device
- * doesn't support empty barriers, it doesn't need
- * cache flushing commands.
+ * Preflush done for flush with data, reissue
+ * without REQ_FLUSH.
*/
- if (!md->barrier_error &&
- !(bio_empty_barrier(bio) && io_error == -EOPNOTSUPP))
- md->barrier_error = io_error;
- end_io_acct(io);
- free_io(md, io);
+ bio->bi_rw &= ~REQ_FLUSH;
+ queue_io(md, bio);
} else {
- end_io_acct(io);
- free_io(md, io);
-
- if (io_error != DM_ENDIO_REQUEUE) {
- trace_block_bio_complete(md->queue, bio);
-
- bio_endio(bio, io_error);
- }
+ /* done with normal IO or empty flush */
+ trace_block_bio_complete(md->queue, bio);
+ bio_endio(bio, io_error);
}
}
}
blk_update_request(tio->orig, 0, nr_bytes);
}
- static void store_barrier_error(struct mapped_device *md, int error)
- {
- unsigned long flags;
-
- spin_lock_irqsave(&md->barrier_error_lock, flags);
- /*
- * Basically, the first error is taken, but:
- * -EOPNOTSUPP supersedes any I/O error.
- * Requeue request supersedes any I/O error but -EOPNOTSUPP.
- */
- if (!md->barrier_error || error == -EOPNOTSUPP ||
- (md->barrier_error != -EOPNOTSUPP &&
- error == DM_ENDIO_REQUEUE))
- md->barrier_error = error;
- spin_unlock_irqrestore(&md->barrier_error_lock, flags);
- }
-
/*
* Don't touch any member of the md after calling this function because
* the md may be freed in dm_put() at the end of this function.
static void dm_end_request(struct request *clone, int error)
{
int rw = rq_data_dir(clone);
- int run_queue = 1;
- bool is_barrier = clone->cmd_flags & REQ_HARDBARRIER;
struct dm_rq_target_io *tio = clone->end_io_data;
struct mapped_device *md = tio->md;
struct request *rq = tio->orig;
- if (rq->cmd_type == REQ_TYPE_BLOCK_PC && !is_barrier) {
+ if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
rq->errors = clone->errors;
rq->resid_len = clone->resid_len;
}
free_rq_clone(clone);
-
- if (unlikely(is_barrier)) {
- if (unlikely(error))
- store_barrier_error(md, error);
- run_queue = 0;
- } else
- blk_end_request_all(rq, error);
-
- rq_completed(md, rw, run_queue);
+ blk_end_request_all(rq, error);
+ rq_completed(md, rw, true);
}
static void dm_unprep_request(struct request *rq)
struct request_queue *q = rq->q;
unsigned long flags;
- if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) {
- /*
- * Barrier clones share an original request.
- * Leave it to dm_end_request(), which handles this special
- * case.
- */
- dm_end_request(clone, DM_ENDIO_REQUEUE);
- return;
- }
-
dm_unprep_request(rq);
spin_lock_irqsave(q->queue_lock, flags);
struct dm_rq_target_io *tio = clone->end_io_data;
struct request *rq = tio->orig;
- if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) {
- /*
- * Barrier clones share an original request. So can't use
- * softirq_done with the original.
- * Pass the clone to dm_done() directly in this special case.
- * It is safe (even if clone->q->queue_lock is held here)
- * because there is no I/O dispatching during the completion
- * of barrier clone.
- */
- dm_done(clone, error, true);
- return;
- }
-
tio->error = error;
rq->completion_data = clone;
blk_complete_request(rq);
struct dm_rq_target_io *tio = clone->end_io_data;
struct request *rq = tio->orig;
- if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) {
- /*
- * Barrier clones share an original request.
- * Leave it to dm_end_request(), which handles this special
- * case.
- */
- BUG_ON(error > 0);
- dm_end_request(clone, error);
- return;
- }
-
rq->cmd_flags |= REQ_FAILED;
dm_complete_request(clone, error);
}
}
/*
- * Creates a little bio that is just does part of a bvec.
+ * Creates a little bio that just does part of a bvec.
*/
static struct bio *split_bvec(struct bio *bio, sector_t sector,
unsigned short idx, unsigned int offset,
clone->bi_sector = sector;
clone->bi_bdev = bio->bi_bdev;
- clone->bi_rw = bio->bi_rw & ~REQ_HARDBARRIER;
+ clone->bi_rw = bio->bi_rw;
clone->bi_vcnt = 1;
clone->bi_size = to_bytes(len);
clone->bi_io_vec->bv_offset = offset;
clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
__bio_clone(clone, bio);
- clone->bi_rw &= ~REQ_HARDBARRIER;
clone->bi_destructor = dm_bio_destructor;
clone->bi_sector = sector;
clone->bi_idx = idx;
__issue_target_request(ci, ti, request_nr, len);
}
- static int __clone_and_map_empty_barrier(struct clone_info *ci)
+ static int __clone_and_map_empty_flush(struct clone_info *ci)
{
unsigned target_nr = 0;
struct dm_target *ti;
+ BUG_ON(bio_has_data(ci->bio));
while ((ti = dm_table_get_target(ci->map, target_nr++)))
__issue_target_requests(ci, ti, ti->num_flush_requests, 0);
- ci->sector_count = 0;
-
return 0;
}
sector_t len = 0, max;
struct dm_target_io *tio;
- if (unlikely(bio_empty_barrier(bio)))
- return __clone_and_map_empty_barrier(ci);
-
if (unlikely(bio->bi_rw & REQ_DISCARD))
return __clone_and_map_discard(ci);
ci.map = dm_get_live_table(md);
if (unlikely(!ci.map)) {
- if (!(bio->bi_rw & REQ_HARDBARRIER))
- bio_io_error(bio);
- else
- if (!md->barrier_error)
- md->barrier_error = -EIO;
+ bio_io_error(bio);
return;
}
ci.md = md;
- ci.bio = bio;
ci.io = alloc_io(md);
ci.io->error = 0;
atomic_set(&ci.io->io_count, 1);
ci.io->md = md;
spin_lock_init(&ci.io->endio_lock);
ci.sector = bio->bi_sector;
- ci.sector_count = bio_sectors(bio);
- if (unlikely(bio_empty_barrier(bio)))
- ci.sector_count = 1;
ci.idx = bio->bi_idx;
start_io_acct(ci.io);
- while (ci.sector_count && !error)
- error = __clone_and_map(&ci);
+ if (bio->bi_rw & REQ_FLUSH) {
+ ci.bio = &ci.md->flush_bio;
+ ci.sector_count = 0;
+ error = __clone_and_map_empty_flush(&ci);
+ /* dec_pending submits any data associated with flush */
+ } else {
+ ci.bio = bio;
+ ci.sector_count = bio_sectors(bio);
+ while (ci.sector_count && !error)
+ error = __clone_and_map(&ci);
+ }
/* drop the extra reference count */
dec_pending(ci.io, error);
part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio));
part_stat_unlock();
- /*
- * If we're suspended or the thread is processing barriers
- * we have to queue this io for later.
- */
- if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) ||
- unlikely(bio->bi_rw & REQ_HARDBARRIER)) {
+ /* if we're suspended, we have to queue this io for later */
+ if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
up_read(&md->io_lock);
- if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) &&
- bio_rw(bio) == READA) {
+ if (bio_rw(bio) != READA)
+ queue_io(md, bio);
+ else
bio_io_error(bio);
- return 0;
- }
-
- queue_io(md, bio);
-
return 0;
}
return _dm_request(q, bio);
}
- static bool dm_rq_is_flush_request(struct request *rq)
- {
- if (rq->cmd_flags & REQ_FLUSH)
- return true;
- else
- return false;
- }
-
void dm_dispatch_request(struct request *rq)
{
int r;
{
int r;
- if (dm_rq_is_flush_request(rq)) {
- blk_rq_init(NULL, clone);
- clone->cmd_type = REQ_TYPE_FS;
- clone->cmd_flags |= (REQ_HARDBARRIER | WRITE);
- } else {
- r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
- dm_rq_bio_constructor, tio);
- if (r)
- return r;
-
- clone->cmd = rq->cmd;
- clone->cmd_len = rq->cmd_len;
- clone->sense = rq->sense;
- clone->buffer = rq->buffer;
- }
+ r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
+ dm_rq_bio_constructor, tio);
+ if (r)
+ return r;
+ clone->cmd = rq->cmd;
+ clone->cmd_len = rq->cmd_len;
+ clone->sense = rq->sense;
+ clone->buffer = rq->buffer;
clone->end_io = end_clone_request;
clone->end_io_data = tio;
struct mapped_device *md = q->queuedata;
struct request *clone;
- if (unlikely(dm_rq_is_flush_request(rq)))
- return BLKPREP_OK;
-
if (unlikely(rq->special)) {
DMWARN("Already has something in rq->special.");
return BLKPREP_KILL;
struct dm_table *map = dm_get_live_table(md);
struct dm_target *ti;
struct request *rq, *clone;
+ sector_t pos;
/*
* For suspend, check blk_queue_stopped() and increment
if (!rq)
goto plug_and_out;
- if (unlikely(dm_rq_is_flush_request(rq))) {
- BUG_ON(md->flush_request);
- md->flush_request = rq;
- blk_start_request(rq);
- queue_work(md->wq, &md->barrier_work);
- goto out;
- }
+ /* always use block 0 to find the target for flushes for now */
+ pos = 0;
+ if (!(rq->cmd_flags & REQ_FLUSH))
+ pos = blk_rq_pos(rq);
+
+ ti = dm_table_find_target(map, pos);
+ BUG_ON(!dm_target_is_valid(ti));
- ti = dm_table_find_target(map, blk_rq_pos(rq));
if (ti->type->busy && ti->type->busy(ti))
goto plug_and_out;
static const struct block_device_operations dm_blk_dops;
static void dm_wq_work(struct work_struct *work);
- static void dm_rq_barrier_work(struct work_struct *work);
static void dm_init_md_queue(struct mapped_device *md)
{
blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
md->queue->unplug_fn = dm_unplug_all;
blk_queue_merge_bvec(md->queue, dm_merge_bvec);
+ blk_queue_flush(md->queue, REQ_FLUSH | REQ_FUA);
}
/*
mutex_init(&md->suspend_lock);
mutex_init(&md->type_lock);
spin_lock_init(&md->deferred_lock);
- spin_lock_init(&md->barrier_error_lock);
rwlock_init(&md->map_lock);
atomic_set(&md->holders, 1);
atomic_set(&md->open_count, 0);
atomic_set(&md->pending[1], 0);
init_waitqueue_head(&md->wait);
INIT_WORK(&md->work, dm_wq_work);
- INIT_WORK(&md->barrier_work, dm_rq_barrier_work);
init_waitqueue_head(&md->eventq);
md->disk->major = _major;
if (!md->bdev)
goto bad_bdev;
+ bio_init(&md->flush_bio);
+ md->flush_bio.bi_bdev = md->bdev;
+ md->flush_bio.bi_rw = WRITE_FLUSH;
+
/* Populate the mapping, nobody knows we exist yet */
spin_lock(&_minor_lock);
old_md = idr_replace(&_minor_idr, md, minor);
blk_queue_softirq_done(md->queue, dm_softirq_done);
blk_queue_prep_rq(md->queue, dm_prep_fn);
blk_queue_lld_busy(md->queue, dm_lld_busy);
- blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN_FLUSH);
elv_register_queue(md->queue);
return r;
}
- static void dm_flush(struct mapped_device *md)
- {
- dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
-
- bio_init(&md->barrier_bio);
- md->barrier_bio.bi_bdev = md->bdev;
- md->barrier_bio.bi_rw = WRITE_BARRIER;
- __split_and_process_bio(md, &md->barrier_bio);
-
- dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
- }
-
- static void process_barrier(struct mapped_device *md, struct bio *bio)
- {
- md->barrier_error = 0;
-
- dm_flush(md);
-
- if (!bio_empty_barrier(bio)) {
- __split_and_process_bio(md, bio);
- /*
- * If the request isn't supported, don't waste time with
- * the second flush.
- */
- if (md->barrier_error != -EOPNOTSUPP)
- dm_flush(md);
- }
-
- if (md->barrier_error != DM_ENDIO_REQUEUE)
- bio_endio(bio, md->barrier_error);
- else {
- spin_lock_irq(&md->deferred_lock);
- bio_list_add_head(&md->deferred, bio);
- spin_unlock_irq(&md->deferred_lock);
- }
- }
-
/*
* Process the deferred bios
*/
work);
struct bio *c;
- down_write(&md->io_lock);
+ down_read(&md->io_lock);
while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
spin_lock_irq(&md->deferred_lock);
c = bio_list_pop(&md->deferred);
spin_unlock_irq(&md->deferred_lock);
- if (!c) {
- clear_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
+ if (!c)
break;
- }
- up_write(&md->io_lock);
+ up_read(&md->io_lock);
if (dm_request_based(md))
generic_make_request(c);
- else {
- if (c->bi_rw & REQ_HARDBARRIER)
- process_barrier(md, c);
- else
- __split_and_process_bio(md, c);
- }
+ else
+ __split_and_process_bio(md, c);
- down_write(&md->io_lock);
+ down_read(&md->io_lock);
}
- up_write(&md->io_lock);
+ up_read(&md->io_lock);
}
static void dm_queue_flush(struct mapped_device *md)
queue_work(md->wq, &md->work);
}
- static void dm_rq_set_target_request_nr(struct request *clone, unsigned request_nr)
- {
- struct dm_rq_target_io *tio = clone->end_io_data;
-
- tio->info.target_request_nr = request_nr;
- }
-
- /* Issue barrier requests to targets and wait for their completion. */
- static int dm_rq_barrier(struct mapped_device *md)
- {
- int i, j;
- struct dm_table *map = dm_get_live_table(md);
- unsigned num_targets = dm_table_get_num_targets(map);
- struct dm_target *ti;
- struct request *clone;
-
- md->barrier_error = 0;
-
- for (i = 0; i < num_targets; i++) {
- ti = dm_table_get_target(map, i);
- for (j = 0; j < ti->num_flush_requests; j++) {
- clone = clone_rq(md->flush_request, md, GFP_NOIO);
- dm_rq_set_target_request_nr(clone, j);
- atomic_inc(&md->pending[rq_data_dir(clone)]);
- map_request(ti, clone, md);
- }
- }
-
- dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
- dm_table_put(map);
-
- return md->barrier_error;
- }
-
- static void dm_rq_barrier_work(struct work_struct *work)
- {
- int error;
- struct mapped_device *md = container_of(work, struct mapped_device,
- barrier_work);
- struct request_queue *q = md->queue;
- struct request *rq;
- unsigned long flags;
-
- /*
- * Hold the md reference here and leave it at the last part so that
- * the md can't be deleted by device opener when the barrier request
- * completes.
- */
- dm_get(md);
-
- error = dm_rq_barrier(md);
-
- rq = md->flush_request;
- md->flush_request = NULL;
-
- if (error == DM_ENDIO_REQUEUE) {
- spin_lock_irqsave(q->queue_lock, flags);
- blk_requeue_request(q, rq);
- spin_unlock_irqrestore(q->queue_lock, flags);
- } else
- blk_end_request_all(rq, error);
-
- blk_run_queue(q);
-
- dm_put(md);
- }
-
/*
* Swap in a new table, returning the old one for the caller to destroy.
*/
*
* To get all processes out of __split_and_process_bio in dm_request,
* we take the write lock. To prevent any process from reentering
- * __split_and_process_bio from dm_request, we set
- * DMF_QUEUE_IO_TO_THREAD.
- *
- * To quiesce the thread (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND
- * and call flush_workqueue(md->wq). flush_workqueue will wait until
- * dm_wq_work exits and DMF_BLOCK_IO_FOR_SUSPEND will prevent any
- * further calls to __split_and_process_bio from dm_wq_work.
+ * __split_and_process_bio from dm_request and quiesce the thread
+ * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
+ * flush_workqueue(md->wq).
*/
down_write(&md->io_lock);
set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
- set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
up_write(&md->io_lock);
/*
- * Request-based dm uses md->wq for barrier (dm_rq_barrier_work) which
- * can be kicked until md->queue is stopped. So stop md->queue before
- * flushing md->wq.
+ * Stop md->queue before flushing md->wq in case request-based
+ * dm defers requests to md->wq from md->queue.
*/
if (dm_request_based(md))
stop_queue(md->queue);
#include <linux/blkdev.h>
#include <linux/sysctl.h>
#include <linux/seq_file.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <linux/buffer_head.h> /* for invalidate_bdev */
#include <linux/poll.h>
#include <linux/ctype.h>
#define DEBUG 0
#define dprintk(x...) ((void)(DEBUG && printk(x)))
+static DEFINE_MUTEX(md_mutex);
#ifndef MODULE
static void autostart_arrays(int part);
return 0;
}
rcu_read_lock();
- if (mddev->suspended || mddev->barrier) {
+ if (mddev->suspended) {
DEFINE_WAIT(__wait);
for (;;) {
prepare_to_wait(&mddev->sb_wait, &__wait,
TASK_UNINTERRUPTIBLE);
- if (!mddev->suspended && !mddev->barrier)
+ if (!mddev->suspended)
break;
rcu_read_unlock();
schedule();
int mddev_congested(mddev_t *mddev, int bits)
{
- if (mddev->barrier)
- return 1;
return mddev->suspended;
}
EXPORT_SYMBOL(mddev_congested);
/*
- * Generic barrier handling for md
+ * Generic flush handling for md
*/
- #define POST_REQUEST_BARRIER ((void*)1)
-
- static void md_end_barrier(struct bio *bio, int err)
+ static void md_end_flush(struct bio *bio, int err)
{
mdk_rdev_t *rdev = bio->bi_private;
mddev_t *mddev = rdev->mddev;
- if (err == -EOPNOTSUPP && mddev->barrier != POST_REQUEST_BARRIER)
- set_bit(BIO_EOPNOTSUPP, &mddev->barrier->bi_flags);
rdev_dec_pending(rdev, mddev);
if (atomic_dec_and_test(&mddev->flush_pending)) {
- if (mddev->barrier == POST_REQUEST_BARRIER) {
- /* This was a post-request barrier */
- mddev->barrier = NULL;
- wake_up(&mddev->sb_wait);
- } else
- /* The pre-request barrier has finished */
- schedule_work(&mddev->barrier_work);
+ /* The pre-request flush has finished */
+ schedule_work(&mddev->flush_work);
}
bio_put(bio);
}
- static void submit_barriers(mddev_t *mddev)
+ static void submit_flushes(mddev_t *mddev)
{
mdk_rdev_t *rdev;
atomic_inc(&rdev->nr_pending);
rcu_read_unlock();
bi = bio_alloc(GFP_KERNEL, 0);
- bi->bi_end_io = md_end_barrier;
+ bi->bi_end_io = md_end_flush;
bi->bi_private = rdev;
bi->bi_bdev = rdev->bdev;
atomic_inc(&mddev->flush_pending);
- submit_bio(WRITE_BARRIER, bi);
+ submit_bio(WRITE_FLUSH, bi);
rcu_read_lock();
rdev_dec_pending(rdev, mddev);
}
rcu_read_unlock();
}
- static void md_submit_barrier(struct work_struct *ws)
+ static void md_submit_flush_data(struct work_struct *ws)
{
- mddev_t *mddev = container_of(ws, mddev_t, barrier_work);
- struct bio *bio = mddev->barrier;
+ mddev_t *mddev = container_of(ws, mddev_t, flush_work);
+ struct bio *bio = mddev->flush_bio;
atomic_set(&mddev->flush_pending, 1);
- if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
- bio_endio(bio, -EOPNOTSUPP);
- else if (bio->bi_size == 0)
+ if (bio->bi_size == 0)
/* an empty barrier - all done */
bio_endio(bio, 0);
else {
- bio->bi_rw &= ~REQ_HARDBARRIER;
+ bio->bi_rw &= ~REQ_FLUSH;
if (mddev->pers->make_request(mddev, bio))
generic_make_request(bio);
- mddev->barrier = POST_REQUEST_BARRIER;
- submit_barriers(mddev);
}
if (atomic_dec_and_test(&mddev->flush_pending)) {
- mddev->barrier = NULL;
+ mddev->flush_bio = NULL;
wake_up(&mddev->sb_wait);
}
}
- void md_barrier_request(mddev_t *mddev, struct bio *bio)
+ void md_flush_request(mddev_t *mddev, struct bio *bio)
{
spin_lock_irq(&mddev->write_lock);
wait_event_lock_irq(mddev->sb_wait,
- !mddev->barrier,
+ !mddev->flush_bio,
mddev->write_lock, /*nothing*/);
- mddev->barrier = bio;
+ mddev->flush_bio = bio;
spin_unlock_irq(&mddev->write_lock);
atomic_set(&mddev->flush_pending, 1);
- INIT_WORK(&mddev->barrier_work, md_submit_barrier);
+ INIT_WORK(&mddev->flush_work, md_submit_flush_data);
- submit_barriers(mddev);
+ submit_flushes(mddev);
if (atomic_dec_and_test(&mddev->flush_pending))
- schedule_work(&mddev->barrier_work);
+ schedule_work(&mddev->flush_work);
}
- EXPORT_SYMBOL(md_barrier_request);
+ EXPORT_SYMBOL(md_flush_request);
/* Support for plugging.
* This mirrors the plugging support in request_queue, but does not
bio_put(bio);
}
- static void super_written_barrier(struct bio *bio, int error)
- {
- struct bio *bio2 = bio->bi_private;
- mdk_rdev_t *rdev = bio2->bi_private;
- mddev_t *mddev = rdev->mddev;
-
- if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
- error == -EOPNOTSUPP) {
- unsigned long flags;
- /* barriers don't appear to be supported :-( */
- set_bit(BarriersNotsupp, &rdev->flags);
- mddev->barriers_work = 0;
- spin_lock_irqsave(&mddev->write_lock, flags);
- bio2->bi_next = mddev->biolist;
- mddev->biolist = bio2;
- spin_unlock_irqrestore(&mddev->write_lock, flags);
- wake_up(&mddev->sb_wait);
- bio_put(bio);
- } else {
- bio_put(bio2);
- bio->bi_private = rdev;
- super_written(bio, error);
- }
- }
-
void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
sector_t sector, int size, struct page *page)
{
* and decrement it on completion, waking up sb_wait
* if zero is reached.
* If an error occurred, call md_error
- *
- * As we might need to resubmit the request if REQ_HARDBARRIER
- * causes ENOTSUPP, we allocate a spare bio...
*/
struct bio *bio = bio_alloc(GFP_NOIO, 1);
- int rw = REQ_WRITE | REQ_SYNC | REQ_UNPLUG;
bio->bi_bdev = rdev->bdev;
bio->bi_sector = sector;
bio_add_page(bio, page, size, 0);
bio->bi_private = rdev;
bio->bi_end_io = super_written;
- bio->bi_rw = rw;
atomic_inc(&mddev->pending_writes);
- if (!test_bit(BarriersNotsupp, &rdev->flags)) {
- struct bio *rbio;
- rw |= REQ_HARDBARRIER;
- rbio = bio_clone(bio, GFP_NOIO);
- rbio->bi_private = bio;
- rbio->bi_end_io = super_written_barrier;
- submit_bio(rw, rbio);
- } else
- submit_bio(rw, bio);
+ submit_bio(REQ_WRITE | REQ_SYNC | REQ_UNPLUG | REQ_FLUSH | REQ_FUA,
+ bio);
}
void md_super_wait(mddev_t *mddev)
{
- /* wait for all superblock writes that were scheduled to complete.
- * if any had to be retried (due to BARRIER problems), retry them
- */
+ /* wait for all superblock writes that were scheduled to complete */
DEFINE_WAIT(wq);
for(;;) {
prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
if (atomic_read(&mddev->pending_writes)==0)
break;
- while (mddev->biolist) {
- struct bio *bio;
- spin_lock_irq(&mddev->write_lock);
- bio = mddev->biolist;
- mddev->biolist = bio->bi_next ;
- bio->bi_next = NULL;
- spin_unlock_irq(&mddev->write_lock);
- submit_bio(bio->bi_rw, bio);
- }
schedule();
}
finish_wait(&mddev->sb_wait, &wq);
clear_bit(Faulty, &rdev->flags);
clear_bit(In_sync, &rdev->flags);
clear_bit(WriteMostly, &rdev->flags);
- clear_bit(BarriersNotsupp, &rdev->flags);
if (mddev->raid_disks == 0) {
mddev->major_version = 0;
clear_bit(Faulty, &rdev->flags);
clear_bit(In_sync, &rdev->flags);
clear_bit(WriteMostly, &rdev->flags);
- clear_bit(BarriersNotsupp, &rdev->flags);
if (mddev->raid_disks == 0) {
mddev->major_version = 1;
/* may be over-ridden by personality */
mddev->resync_max_sectors = mddev->dev_sectors;
- mddev->barriers_work = 1;
mddev->ok_start_degraded = start_dirty_degraded;
if (start_readonly && mddev->ro == 0)
mddev->recovery = 0;
mddev->in_sync = 0;
mddev->degraded = 0;
- mddev->barriers_work = 0;
mddev->safemode = 0;
mddev->bitmap_info.offset = 0;
mddev->bitmap_info.default_offset = 0;
mddev_t *mddev = mddev_find(bdev->bd_dev);
int err;
- lock_kernel();
+ mutex_lock(&md_mutex);
if (mddev->gendisk != bdev->bd_disk) {
/* we are racing with mddev_put which is discarding this
* bd_disk.
/* Wait until bdev->bd_disk is definitely gone */
flush_scheduled_work();
/* Then retry the open from the top */
- unlock_kernel();
+ mutex_unlock(&md_mutex);
return -ERESTARTSYS;
}
BUG_ON(mddev != bdev->bd_disk->private_data);
check_disk_size_change(mddev->gendisk, bdev);
out:
- unlock_kernel();
+ mutex_unlock(&md_mutex);
return err;
}
mddev_t *mddev = disk->private_data;
BUG_ON(!mddev);
- lock_kernel();
+ mutex_lock(&md_mutex);
atomic_dec(&mddev->openers);
mddev_put(mddev);
- unlock_kernel();
+ mutex_unlock(&md_mutex);
return 0;
}
#include <linux/hdreg.h>
#include <linux/async.h>
#include <linux/mutex.h>
-#include <linux/smp_lock.h>
#include <asm/ccwdev.h>
#include <asm/ebcdic.h>
*/
blk_queue_max_segment_size(block->request_queue, PAGE_SIZE);
blk_queue_segment_boundary(block->request_queue, PAGE_SIZE - 1);
- blk_queue_ordered(block->request_queue, QUEUE_ORDERED_DRAIN);
}
/*
if (!block)
return -ENODEV;
- lock_kernel();
base = block->base;
atomic_inc(&block->open_count);
if (test_bit(DASD_FLAG_OFFLINE, &base->flags)) {
goto out;
}
- unlock_kernel();
return 0;
out:
module_put(base->discipline->owner);
unlock:
atomic_dec(&block->open_count);
- unlock_kernel();
return rc;
}
{
struct dasd_block *block = disk->private_data;
- lock_kernel();
atomic_dec(&block->open_count);
module_put(block->base->discipline->owner);
- unlock_kernel();
return 0;
}
for (rgrps = 0;; rgrps++) {
loff_t pos = rgrps * sizeof(struct gfs2_rindex);
- if (pos + sizeof(struct gfs2_rindex) >= ip->i_disksize)
+ if (pos + sizeof(struct gfs2_rindex) >= i_size_read(inode))
break;
error = gfs2_internal_read(ip, &ra_state, buf, &pos,
sizeof(struct gfs2_rindex));
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct inode *inode = &ip->i_inode;
struct file_ra_state ra_state;
- u64 rgrp_count = ip->i_disksize;
+ u64 rgrp_count = i_size_read(inode);
+ struct gfs2_rgrpd *rgd;
+ unsigned int max_data = 0;
int error;
do_div(rgrp_count, sizeof(struct gfs2_rindex));
}
}
+ list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
+ if (rgd->rd_data > max_data)
+ max_data = rgd->rd_data;
+ sdp->sd_max_rg_data = max_data;
sdp->sd_rindex_uptodate = 1;
return 0;
}
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct inode *inode = &ip->i_inode;
struct file_ra_state ra_state;
+ struct gfs2_rgrpd *rgd;
+ unsigned int max_data = 0;
int error;
file_ra_state_init(&ra_state, inode->i_mapping);
for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
/* Ignore partials */
if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) >
- ip->i_disksize)
+ i_size_read(inode))
break;
error = read_rindex_entry(ip, &ra_state);
if (error) {
return error;
}
}
+ list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
+ if (rgd->rd_data > max_data)
+ max_data = rgd->rd_data;
+ sdp->sd_max_rg_data = max_data;
sdp->sd_rindex_uptodate = 1;
return 0;
if ((start + nr_sects) != blk) {
rv = blkdev_issue_discard(bdev, start,
nr_sects, GFP_NOFS,
- BLKDEV_IFL_WAIT |
- BLKDEV_IFL_BARRIER);
+ 0);
if (rv)
goto fail;
nr_sects = 0;
}
}
if (nr_sects) {
- rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS,
- BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
+ rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, 0);
if (rv)
goto fail;
}
* Returns: errno
*/
-int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
+int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
+ char *file, unsigned int line)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_alloc *al = ip->i_alloc;
return -EINVAL;
try_again:
- /* We need to hold the rindex unless the inode we're using is
- the rindex itself, in which case it's already held. */
- if (ip != GFS2_I(sdp->sd_rindex))
- error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
- else if (!sdp->sd_rgrps) /* We may not have the rindex read in, so: */
- error = gfs2_ri_update_special(ip);
+ if (hold_rindex) {
+ /* We need to hold the rindex unless the inode we're using is
+ the rindex itself, in which case it's already held. */
+ if (ip != GFS2_I(sdp->sd_rindex))
+ error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
+ else if (!sdp->sd_rgrps) /* We may not have the rindex read
+ in, so: */
+ error = gfs2_ri_update_special(ip);
+ }
if (error)
return error;
try to free it, and try the allocation again. */
error = get_local_rgrp(ip, &unlinked, &last_unlinked);
if (error) {
- if (ip != GFS2_I(sdp->sd_rindex))
+ if (hold_rindex && ip != GFS2_I(sdp->sd_rindex))
gfs2_glock_dq_uninit(&al->al_ri_gh);
if (error != -EAGAIN)
return error;
al->al_rgd = NULL;
if (al->al_rgd_gh.gh_gl)
gfs2_glock_dq_uninit(&al->al_rgd_gh);
- if (ip != GFS2_I(sdp->sd_rindex))
+ if (ip != GFS2_I(sdp->sd_rindex) && al->al_ri_gh.gh_gl)
gfs2_glock_dq_uninit(&al->al_ri_gh);
}
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct buffer_head *dibh;
struct gfs2_alloc *al = ip->i_alloc;
- struct gfs2_rgrpd *rgd = al->al_rgd;
+ struct gfs2_rgrpd *rgd;
u32 goal, blk;
u64 block;
int error;
+ /* Only happens if there is a bug in gfs2, return something distinctive
+ * to ensure that it is noticed.
+ */
+ if (al == NULL)
+ return -ECANCELED;
+
+ rgd = al->al_rgd;
+
if (rgrp_contains_block(rgd, ip->i_goal))
goal = ip->i_goal - rgd->rd_data0;
else
JBUFFER_TRACE(descriptor, "write commit block");
set_buffer_dirty(bh);
- if (journal->j_flags & JFS_BARRIER) {
- ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_BARRIER);
-
- /*
- * Is it possible for another commit to fail at roughly
- * the same time as this one? If so, we don't want to
- * trust the barrier flag in the super, but instead want
- * to remember if we sent a barrier request
- */
- if (ret == -EOPNOTSUPP) {
- char b[BDEVNAME_SIZE];
-
- printk(KERN_WARNING
- "JBD: barrier-based sync failed on %s - "
- "disabling barriers\n",
- bdevname(journal->j_dev, b));
- spin_lock(&journal->j_state_lock);
- journal->j_flags &= ~JFS_BARRIER;
- spin_unlock(&journal->j_state_lock);
-
- /* And try again, without the barrier */
- set_buffer_uptodate(bh);
- set_buffer_dirty(bh);
- ret = sync_dirty_buffer(bh);
- }
- } else {
+ if (journal->j_flags & JFS_BARRIER)
+ ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_FLUSH_FUA);
+ else
ret = sync_dirty_buffer(bh);
- }
put_bh(bh); /* One for getblk() */
journal_put_journal_head(descriptor);
int first_tag = 0;
int tag_flag;
int i;
- int write_op = WRITE;
+ int write_op = WRITE_SYNC;
/*
* First job: lock down the current transaction and wait for
if (journal->j_flags & JBD2_BARRIER &&
!JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
- ret = submit_bh(WRITE_SYNC_PLUG | WRITE_BARRIER, bh);
- if (ret == -EOPNOTSUPP) {
- printk(KERN_WARNING
- "JBD2: Disabling barriers on %s, "
- "not supported by device\n", journal->j_devname);
- write_lock(&journal->j_state_lock);
- journal->j_flags &= ~JBD2_BARRIER;
- write_unlock(&journal->j_state_lock);
-
- /* And try again, without the barrier */
- lock_buffer(bh);
- set_buffer_uptodate(bh);
- clear_buffer_dirty(bh);
- ret = submit_bh(WRITE_SYNC_PLUG, bh);
- }
- } else {
+ JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
+ ret = submit_bh(WRITE_SYNC_PLUG | WRITE_FLUSH_FUA, bh);
+ else
ret = submit_bh(WRITE_SYNC_PLUG, bh);
- }
+
*cbh = bh;
return ret;
}
{
int ret = 0;
- retry:
clear_buffer_dirty(bh);
wait_on_buffer(bh);
- if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
- printk(KERN_WARNING
- "JBD2: %s: disabling barries on %s - not supported "
- "by device\n", __func__, journal->j_devname);
- write_lock(&journal->j_state_lock);
- journal->j_flags &= ~JBD2_BARRIER;
- write_unlock(&journal->j_state_lock);
-
- lock_buffer(bh);
- clear_buffer_dirty(bh);
- set_buffer_uptodate(bh);
- bh->b_end_io = journal_end_buffer_io_sync;
-
- ret = submit_bh(WRITE_SYNC_PLUG, bh);
- if (ret) {
- unlock_buffer(bh);
- return ret;
- }
- goto retry;
- }
if (unlikely(!buffer_uptodate(bh)))
ret = -EIO;
int tag_bytes = journal_tag_bytes(journal);
struct buffer_head *cbh = NULL; /* For transactional checksums */
__u32 crc32_sum = ~0;
- int write_op = WRITE;
+ int write_op = WRITE_SYNC;
/*
* First job: lock down the current transaction and wait for
}
}
+ err = journal_finish_inode_data_buffers(journal, commit_transaction);
+ if (err) {
+ printk(KERN_WARNING
+ "JBD2: Detected IO errors while flushing file data "
+ "on %s\n", journal->j_devname);
+ if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
+ jbd2_journal_abort(journal, err);
+ err = 0;
+ }
+
/*
* If the journal is not located on the file system device,
* then we must flush the file system device before we issue
if (commit_transaction->t_flushed_data_blocks &&
(journal->j_fs_dev != journal->j_dev) &&
(journal->j_flags & JBD2_BARRIER))
- blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
- BLKDEV_IFL_WAIT);
+ blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
/* Done it all: now write the commit record asynchronously. */
if (JBD2_HAS_INCOMPAT_FEATURE(journal,
&cbh, crc32_sum);
if (err)
__jbd2_journal_abort_hard(journal);
- if (journal->j_flags & JBD2_BARRIER)
- blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL,
- BLKDEV_IFL_WAIT);
- }
-
- err = journal_finish_inode_data_buffers(journal, commit_transaction);
- if (err) {
- printk(KERN_WARNING
- "JBD2: Detected IO errors while flushing file data "
- "on %s\n", journal->j_devname);
- if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
- jbd2_journal_abort(journal, err);
- err = 0;
}
/* Lo and behold: we have just managed to send a transaction to
}
if (!err && !is_journal_aborted(journal))
err = journal_wait_on_commit_record(journal, cbh);
+ if (JBD2_HAS_INCOMPAT_FEATURE(journal,
+ JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
+ journal->j_flags & JBD2_BARRIER) {
+ blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL);
+ }
if (err)
jbd2_journal_abort(journal, err);
#include <linux/parser.h>
#include <linux/random.h>
#include <linux/crc32.h>
-#include <linux/smp_lock.h>
#include <linux/vfs.h>
#include <linux/writeback.h>
#include <linux/kobject.h>
retry:
set_buffer_dirty(nilfs->ns_sbh[0]);
-
if (nilfs_test_opt(sbi, BARRIER)) {
err = __sync_dirty_buffer(nilfs->ns_sbh[0],
- WRITE_SYNC | WRITE_BARRIER);
- if (err == -EOPNOTSUPP) {
- nilfs_warning(sbi->s_super, __func__,
- "barrier-based sync failed. "
- "disabling barriers\n");
- nilfs_clear_opt(sbi, BARRIER);
- goto retry;
- }
+ WRITE_SYNC | WRITE_FLUSH_FUA);
} else {
err = sync_dirty_buffer(nilfs->ns_sbh[0]);
}
struct nilfs_sb_info *sbi = NILFS_SB(sb);
struct the_nilfs *nilfs = sbi->s_nilfs;
- lock_kernel();
-
nilfs_detach_segment_constructor(sbi);
if (!(sb->s_flags & MS_RDONLY)) {
sbi->s_super = NULL;
sb->s_fs_info = NULL;
nilfs_put_sbinfo(sbi);
-
- unlock_kernel();
}
static int nilfs_sync_fs(struct super_block *sb, int wait)
struct nilfs_mount_options old_opts;
int was_snapshot, err;
- lock_kernel();
-
down_write(&nilfs->ns_super_sem);
old_sb_flags = sb->s_flags;
old_opts.mount_opt = sbi->s_mount_opt;
}
out:
up_write(&nilfs->ns_super_sem);
- unlock_kernel();
return 0;
restore_opts:
sbi->s_mount_opt = old_opts.mount_opt;
sbi->s_snapshot_cno = old_opts.snapshot_cno;
up_write(&nilfs->ns_super_sem);
- unlock_kernel();
return err;
}
put_nilfs(nilfs);
failed:
close_bdev_exclusive(sd.bdev, mode);
-
return err;
cancel_new:
#define BIO_NULL_MAPPED 9 /* contains invalid user pages */
#define BIO_FS_INTEGRITY 10 /* fs owns integrity data, not block layer */
#define BIO_QUIET 11 /* Make BIO Quiet */
+#define BIO_MAPPED_INTEGRITY 12/* integrity metadata has been remapped */
#define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag)))
/*
/* bio only flags */
__REQ_UNPLUG, /* unplug the immediately after submission */
__REQ_RAHEAD, /* read ahead, can fail anytime */
+ __REQ_THROTTLED, /* This bio has already been subjected to
+ * throttling rules. Don't do it again. */
/* request only flags */
__REQ_SORTED, /* elevator knows about this request */
__REQ_FAILED, /* set if the request failed */
__REQ_QUIET, /* don't worry about errors */
__REQ_PREEMPT, /* set for "ide_preempt" requests */
- __REQ_ORDERED_COLOR, /* is before or after barrier */
__REQ_ALLOCED, /* request came from our alloc pool */
__REQ_COPY_USER, /* contains copies of user pages */
- __REQ_INTEGRITY, /* integrity metadata has been remapped */
__REQ_FLUSH, /* request for cache flush */
__REQ_IO_STAT, /* account I/O stat */
__REQ_MIXED_MERGE, /* merge of different types, fail separately */
(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
#define REQ_COMMON_MASK \
(REQ_WRITE | REQ_FAILFAST_MASK | REQ_HARDBARRIER | REQ_SYNC | \
- REQ_META| REQ_DISCARD | REQ_NOIDLE)
+ REQ_META | REQ_DISCARD | REQ_NOIDLE | REQ_FLUSH | REQ_FUA)
+ #define REQ_CLONE_MASK REQ_COMMON_MASK
#define REQ_UNPLUG (1 << __REQ_UNPLUG)
#define REQ_RAHEAD (1 << __REQ_RAHEAD)
+#define REQ_THROTTLED (1 << __REQ_THROTTLED)
#define REQ_SORTED (1 << __REQ_SORTED)
#define REQ_SOFTBARRIER (1 << __REQ_SOFTBARRIER)
#define REQ_FAILED (1 << __REQ_FAILED)
#define REQ_QUIET (1 << __REQ_QUIET)
#define REQ_PREEMPT (1 << __REQ_PREEMPT)
- #define REQ_ORDERED_COLOR (1 << __REQ_ORDERED_COLOR)
#define REQ_ALLOCED (1 << __REQ_ALLOCED)
#define REQ_COPY_USER (1 << __REQ_COPY_USER)
-#define REQ_INTEGRITY (1 << __REQ_INTEGRITY)
#define REQ_FLUSH (1 << __REQ_FLUSH)
#define REQ_IO_STAT (1 << __REQ_IO_STAT)
#define REQ_MIXED_MERGE (1 << __REQ_MIXED_MERGE)
void *elevator_private3;
struct gendisk *rq_disk;
+ struct hd_struct *part;
unsigned long start_time;
#ifdef CONFIG_BLK_CGROUP
unsigned long long start_time_ns;
* physical address coalescing is performed.
*/
unsigned short nr_phys_segments;
+#if defined(CONFIG_BLK_DEV_INTEGRITY)
+ unsigned short nr_integrity_segments;
+#endif
unsigned short ioprio;
unsigned short logical_block_size;
unsigned short max_segments;
+ unsigned short max_integrity_segments;
unsigned char misaligned;
unsigned char discard_misaligned;
struct blk_trace *blk_trace;
#endif
/*
- * reserved for flush operations
+ * for flush operations
*/
- unsigned int ordered, next_ordered, ordseq;
- int orderr, ordcolor;
- struct request pre_flush_rq, bar_rq, post_flush_rq;
- struct request *orig_bar_rq;
+ unsigned int flush_flags;
+ unsigned int flush_seq;
+ int flush_err;
+ struct request flush_rq;
+ struct request *orig_flush_rq;
+ struct list_head pending_flushes;
struct mutex sysfs_lock;
#if defined(CONFIG_BLK_DEV_BSG)
struct bsg_class_device bsg_dev;
#endif
+
+#ifdef CONFIG_BLK_DEV_THROTTLING
+ /* Throttle data */
+ struct throtl_data *td;
+#endif
};
#define QUEUE_FLAG_CLUSTER 0 /* cluster several segments into 1 */
__clear_bit(flag, &q->queue_flags);
}
- enum {
- /*
- * Hardbarrier is supported with one of the following methods.
- *
- * NONE : hardbarrier unsupported
- * DRAIN : ordering by draining is enough
- * DRAIN_FLUSH : ordering by draining w/ pre and post flushes
- * DRAIN_FUA : ordering by draining w/ pre flush and FUA write
- * TAG : ordering by tag is enough
- * TAG_FLUSH : ordering by tag w/ pre and post flushes
- * TAG_FUA : ordering by tag w/ pre flush and FUA write
- */
- QUEUE_ORDERED_BY_DRAIN = 0x01,
- QUEUE_ORDERED_BY_TAG = 0x02,
- QUEUE_ORDERED_DO_PREFLUSH = 0x10,
- QUEUE_ORDERED_DO_BAR = 0x20,
- QUEUE_ORDERED_DO_POSTFLUSH = 0x40,
- QUEUE_ORDERED_DO_FUA = 0x80,
-
- QUEUE_ORDERED_NONE = 0x00,
-
- QUEUE_ORDERED_DRAIN = QUEUE_ORDERED_BY_DRAIN |
- QUEUE_ORDERED_DO_BAR,
- QUEUE_ORDERED_DRAIN_FLUSH = QUEUE_ORDERED_DRAIN |
- QUEUE_ORDERED_DO_PREFLUSH |
- QUEUE_ORDERED_DO_POSTFLUSH,
- QUEUE_ORDERED_DRAIN_FUA = QUEUE_ORDERED_DRAIN |
- QUEUE_ORDERED_DO_PREFLUSH |
- QUEUE_ORDERED_DO_FUA,
-
- QUEUE_ORDERED_TAG = QUEUE_ORDERED_BY_TAG |
- QUEUE_ORDERED_DO_BAR,
- QUEUE_ORDERED_TAG_FLUSH = QUEUE_ORDERED_TAG |
- QUEUE_ORDERED_DO_PREFLUSH |
- QUEUE_ORDERED_DO_POSTFLUSH,
- QUEUE_ORDERED_TAG_FUA = QUEUE_ORDERED_TAG |
- QUEUE_ORDERED_DO_PREFLUSH |
- QUEUE_ORDERED_DO_FUA,
-
- /*
- * Ordered operation sequence
- */
- QUEUE_ORDSEQ_STARTED = 0x01, /* flushing in progress */
- QUEUE_ORDSEQ_DRAIN = 0x02, /* waiting for the queue to be drained */
- QUEUE_ORDSEQ_PREFLUSH = 0x04, /* pre-flushing in progress */
- QUEUE_ORDSEQ_BAR = 0x08, /* original barrier req in progress */
- QUEUE_ORDSEQ_POSTFLUSH = 0x10, /* post-flushing in progress */
- QUEUE_ORDSEQ_DONE = 0x20,
- };
-
#define blk_queue_plugged(q) test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
#define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
#define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
#define blk_queue_nonrot(q) test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags)
#define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags)
#define blk_queue_add_random(q) test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags)
- #define blk_queue_flushing(q) ((q)->ordseq)
#define blk_queue_stackable(q) \
test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags)
#define blk_queue_discard(q) test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags)
* it already be started by driver.
*/
#define RQ_NOMERGE_FLAGS \
- (REQ_NOMERGE | REQ_STARTED | REQ_HARDBARRIER | REQ_SOFTBARRIER)
+ (REQ_NOMERGE | REQ_STARTED | REQ_HARDBARRIER | REQ_SOFTBARRIER | \
+ REQ_FLUSH | REQ_FUA)
#define rq_mergeable(rq) \
(!((rq)->cmd_flags & RQ_NOMERGE_FLAGS) && \
(((rq)->cmd_flags & REQ_DISCARD) || \
extern void blk_queue_max_discard_sectors(struct request_queue *q,
unsigned int max_discard_sectors);
extern void blk_queue_logical_block_size(struct request_queue *, unsigned short);
-extern void blk_queue_physical_block_size(struct request_queue *, unsigned short);
+extern void blk_queue_physical_block_size(struct request_queue *, unsigned int);
extern void blk_queue_alignment_offset(struct request_queue *q,
unsigned int alignment);
extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min);
extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *);
extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *);
extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
+ extern void blk_queue_flush(struct request_queue *q, unsigned int flush);
extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
- extern int blk_queue_ordered(struct request_queue *, unsigned);
- extern bool blk_do_ordered(struct request_queue *, struct request **);
- extern unsigned blk_ordered_cur_seq(struct request_queue *);
- extern unsigned blk_ordered_req_seq(struct request *);
- extern bool blk_ordered_complete_seq(struct request_queue *, unsigned, int);
extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *);
extern void blk_dump_rq_flags(struct request *, char *);
return NULL;
return bqt->tag_index[tag];
}
- enum{
- BLKDEV_WAIT, /* wait for completion */
- BLKDEV_BARRIER, /* issue request with barrier */
- BLKDEV_SECURE, /* secure discard */
- };
- #define BLKDEV_IFL_WAIT (1 << BLKDEV_WAIT)
- #define BLKDEV_IFL_BARRIER (1 << BLKDEV_BARRIER)
- #define BLKDEV_IFL_SECURE (1 << BLKDEV_SECURE)
- extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *,
- unsigned long);
+
+ #define BLKDEV_DISCARD_SECURE 0x01 /* secure discard */
+
+ extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *);
extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
- sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
- static inline int sb_issue_discard(struct super_block *sb,
- sector_t block, sector_t nr_blocks)
+ sector_t nr_sects, gfp_t gfp_mask);
+ static inline int sb_issue_discard(struct super_block *sb, sector_t block,
+ sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags)
{
- block <<= (sb->s_blocksize_bits - 9);
- nr_blocks <<= (sb->s_blocksize_bits - 9);
- return blkdev_issue_discard(sb->s_bdev, block, nr_blocks, GFP_NOFS,
- BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
+ return blkdev_issue_discard(sb->s_bdev, block << (sb->s_blocksize_bits - 9),
+ nr_blocks << (sb->s_blocksize_bits - 9),
+ gfp_mask, flags);
}
extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm);
return q->limits.physical_block_size;
}
-static inline int bdev_physical_block_size(struct block_device *bdev)
+static inline unsigned int bdev_physical_block_size(struct block_device *bdev)
{
return queue_physical_block_size(bdev_get_queue(bdev));
}
return q ? q->dma_alignment : 511;
}
-static inline int blk_rq_aligned(struct request_queue *q, void *addr,
+static inline int blk_rq_aligned(struct request_queue *q, unsigned long addr,
unsigned int len)
{
unsigned int alignment = queue_dma_alignment(q) | q->dma_pad_mask;
- return !((unsigned long)addr & alignment) && !(len & alignment);
+ return !(addr & alignment) && !(len & alignment);
}
/* assumes size > 256 */
struct work_struct;
int kblockd_schedule_work(struct request_queue *q, struct work_struct *work);
+int kblockd_schedule_delayed_work(struct request_queue *q, struct delayed_work *dwork, unsigned long delay);
#ifdef CONFIG_BLK_CGROUP
/*
}
#endif
+#ifdef CONFIG_BLK_DEV_THROTTLING
+extern int blk_throtl_init(struct request_queue *q);
+extern void blk_throtl_exit(struct request_queue *q);
+extern int blk_throtl_bio(struct request_queue *q, struct bio **bio);
+extern void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay);
+extern void throtl_shutdown_timer_wq(struct request_queue *q);
+#else /* CONFIG_BLK_DEV_THROTTLING */
+static inline int blk_throtl_bio(struct request_queue *q, struct bio **bio)
+{
+ return 0;
+}
+
+static inline int blk_throtl_init(struct request_queue *q) { return 0; }
+/* No-op stub; returns void to match the extern blk_throtl_exit() prototype. */
+static inline void blk_throtl_exit(struct request_queue *q) { }
+static inline void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay) {}
+static inline void throtl_shutdown_timer_wq(struct request_queue *q) {}
+#endif /* CONFIG_BLK_DEV_THROTTLING */
+
#define MODULE_ALIAS_BLOCKDEV(major,minor) \
MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor))
#define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \
extern int blk_integrity_register(struct gendisk *, struct blk_integrity *);
extern void blk_integrity_unregister(struct gendisk *);
extern int blk_integrity_compare(struct gendisk *, struct gendisk *);
-extern int blk_rq_map_integrity_sg(struct request *, struct scatterlist *);
-extern int blk_rq_count_integrity_sg(struct request *);
+extern int blk_rq_map_integrity_sg(struct request_queue *, struct bio *,
+ struct scatterlist *);
+extern int blk_rq_count_integrity_sg(struct request_queue *, struct bio *);
+extern int blk_integrity_merge_rq(struct request_queue *, struct request *,
+ struct request *);
+extern int blk_integrity_merge_bio(struct request_queue *, struct request *,
+ struct bio *);
static inline
struct blk_integrity *bdev_get_integrity(struct block_device *bdev)
return bio_integrity(rq->bio);
}
+static inline void blk_queue_max_integrity_segments(struct request_queue *q,
+ unsigned int segs)
+{
+ q->limits.max_integrity_segments = segs;
+}
+
+static inline unsigned short
+queue_max_integrity_segments(struct request_queue *q)
+{
+ return q->limits.max_integrity_segments;
+}
+
#else /* CONFIG_BLK_DEV_INTEGRITY */
#define blk_integrity_rq(rq) (0)
-#define blk_rq_count_integrity_sg(a) (0)
-#define blk_rq_map_integrity_sg(a, b) (0)
+#define blk_rq_count_integrity_sg(a, b) (0)
+#define blk_rq_map_integrity_sg(a, b, c) (0)
#define bdev_get_integrity(a) (0)
#define blk_get_integrity(a) (0)
#define blk_integrity_compare(a, b) (0)
#define blk_integrity_register(a, b) (0)
#define blk_integrity_unregister(a) do { } while (0);
+/* No-op stub; no trailing ';' so the caller's semicolon ends the statement. */
+#define blk_queue_max_integrity_segments(a, b) do { } while (0)
+#define queue_max_integrity_segments(a) (0)
+#define blk_integrity_merge_rq(a, b, c) (0)
+#define blk_integrity_merge_bio(a, b, c) (0)
#endif /* CONFIG_BLK_DEV_INTEGRITY */
* immediately after submission. The write equivalent
* of READ_SYNC.
* WRITE_ODIRECT_PLUG Special case write for O_DIRECT only.
- * WRITE_BARRIER Like WRITE_SYNC, but tells the block layer that all
- * previously submitted writes must be safely on storage
- * before this one is started. Also guarantees that when
- * this write is complete, it itself is also safely on
- * storage. Prevents reordering of writes on both sides
- * of this IO.
+ * WRITE_FLUSH Like WRITE_SYNC but with preceding cache flush.
+ * WRITE_FUA Like WRITE_SYNC but data is guaranteed to be on
+ * non-volatile media on completion.
+ * WRITE_FLUSH_FUA Combination of WRITE_FLUSH and FUA. The IO is preceded
+ * by a cache flush and data is guaranteed to be on
+ * non-volatile media on completion.
*
*/
#define RW_MASK REQ_WRITE
#define WRITE_SYNC (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG)
#define WRITE_ODIRECT_PLUG (WRITE | REQ_SYNC)
#define WRITE_META (WRITE | REQ_META)
- #define WRITE_BARRIER (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
- REQ_HARDBARRIER)
-
- /*
- * These aren't really reads or writes, they pass down information about
- * parts of device that are now unused by the file system.
- */
- #define DISCARD_NOBARRIER (WRITE | REQ_DISCARD)
- #define DISCARD_BARRIER (WRITE | REQ_DISCARD | REQ_HARDBARRIER)
- #define DISCARD_SECURE (DISCARD_NOBARRIER | REQ_SECURE)
+ #define WRITE_FLUSH (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
+ REQ_FLUSH)
+ #define WRITE_FUA (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
+ REQ_FUA)
+ #define WRITE_FLUSH_FUA (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
+ REQ_FLUSH | REQ_FUA)
#define SEL_IN 1
#define SEL_OUT 2
#include <linux/fcntl.h>
-/* temporary stubs for BKL removal */
-#define lock_flocks() lock_kernel()
-#define unlock_flocks() unlock_kernel()
-
extern void send_sigio(struct fown_struct *fown, int fd, int band);
#ifdef CONFIG_FILE_LOCKING
extern int lease_modify(struct file_lock **, int);
extern int lock_may_read(struct inode *, loff_t start, unsigned long count);
extern int lock_may_write(struct inode *, loff_t start, unsigned long count);
+extern void lock_flocks(void);
+extern void unlock_flocks(void);
#else /* !CONFIG_FILE_LOCKING */
static inline int fcntl_getlk(struct file *file, struct flock __user *user)
{
return 1;
}
+static inline void lock_flocks(void)
+{
+}
+
+static inline void unlock_flocks(void)
+{
+}
+
#endif /* !CONFIG_FILE_LOCKING */
* Saved mount options for lazy filesystems using
* generic_show_options()
*/
- char *s_options;
+ char __rcu *s_options;
};
extern struct timespec current_fs_time(struct super_block *sb);
extern int generic_file_fsync(struct file *, int);
+extern int generic_check_addressable(unsigned, u64);
+
#ifdef CONFIG_MIGRATION
extern int buffer_migrate_page(struct address_space *,
struct page *, struct page *);
.release = simple_attr_release, \
.read = simple_attr_read, \
.write = simple_attr_write, \
+ .llseek = generic_file_llseek, \
};
static inline void __attribute__((format(printf, 1, 2)))