#include <unistd.h>
#include <errno.h>
#include <time.h>
-#include <signal.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include "qemu-queue.h"
#include "osdep.h"
+#include "sysemu.h"
#include "qemu-common.h"
+#include "trace.h"
#include "block_int.h"
#include "block/raw-posix-aio.h"
+static void do_spawn_thread(void);
struct qemu_paiocb {
BlockDriverAIOCB common;
int aio_fildes;
union {
struct iovec *aio_iov;
- void *aio_ioctl_buf;
+ void *aio_ioctl_buf;
};
int aio_niov;
size_t aio_nbytes;
#define aio_ioctl_cmd aio_nbytes /* for QEMU_AIO_IOCTL */
- int ev_signo;
off_t aio_offset;
QTAILQ_ENTRY(qemu_paiocb) node;
ssize_t ret;
int active;
struct qemu_paiocb *next;
-
- int async_context_id;
};
typedef struct PosixAioState {
static int max_threads = 64;
static int cur_threads = 0;
static int idle_threads = 0;
+static int new_threads = 0; /* backlog of threads we need to create */
+static int pending_threads = 0; /* threads created but not running yet */
+static QEMUBH *new_thread_bh;
static QTAILQ_HEAD(, qemu_paiocb) request_list;
#ifdef CONFIG_PREADV
if (ret) die2(ret, "pthread_create");
}
-static size_t handle_aiocb_ioctl(struct qemu_paiocb *aiocb)
+static ssize_t handle_aiocb_ioctl(struct qemu_paiocb *aiocb)
{
- int ret;
+ int ret;
- ret = ioctl(aiocb->aio_fildes, aiocb->aio_ioctl_cmd, aiocb->aio_ioctl_buf);
- if (ret == -1)
- return -errno;
+ ret = ioctl(aiocb->aio_fildes, aiocb->aio_ioctl_cmd, aiocb->aio_ioctl_buf);
+ if (ret == -1)
+ return -errno;
- /*
- * This looks weird, but the aio code only consideres a request
- * successfull if it has written the number full number of bytes.
- *
- * Now we overload aio_nbytes as aio_ioctl_cmd for the ioctl command,
- * so in fact we return the ioctl command here to make posix_aio_read()
- * happy..
- */
- return aiocb->aio_nbytes;
+ /*
+ * This looks weird, but the aio code only consideres a request
+ * successful if it has written the number full number of bytes.
+ *
+ * Now we overload aio_nbytes as aio_ioctl_cmd for the ioctl command,
+ * so in fact we return the ioctl command here to make posix_aio_read()
+ * happy..
+ */
+ return aiocb->aio_nbytes;
}
-static size_t handle_aiocb_flush(struct qemu_paiocb *aiocb)
+static ssize_t handle_aiocb_flush(struct qemu_paiocb *aiocb)
{
int ret;
#endif
-static size_t handle_aiocb_rw_vector(struct qemu_paiocb *aiocb)
+static ssize_t handle_aiocb_rw_vector(struct qemu_paiocb *aiocb)
{
- size_t offset = 0;
ssize_t len;
do {
len = qemu_pwritev(aiocb->aio_fildes,
aiocb->aio_iov,
aiocb->aio_niov,
- aiocb->aio_offset + offset);
+ aiocb->aio_offset);
else
len = qemu_preadv(aiocb->aio_fildes,
aiocb->aio_iov,
aiocb->aio_niov,
- aiocb->aio_offset + offset);
+ aiocb->aio_offset);
} while (len == -1 && errno == EINTR);
if (len == -1)
return len;
}
-static size_t handle_aiocb_rw_linear(struct qemu_paiocb *aiocb, char *buf)
+/*
+ * Read/writes the data to/from a given linear buffer.
+ *
+ * Returns the number of bytes handles or -errno in case of an error. Short
+ * reads are only returned if the end of the file is reached.
+ */
+static ssize_t handle_aiocb_rw_linear(struct qemu_paiocb *aiocb, char *buf)
{
- size_t offset = 0;
- size_t len;
+ ssize_t offset = 0;
+ ssize_t len;
while (offset < aiocb->aio_nbytes) {
if (aiocb->aio_type & QEMU_AIO_WRITE)
return offset;
}
-static size_t handle_aiocb_rw(struct qemu_paiocb *aiocb)
+static ssize_t handle_aiocb_rw(struct qemu_paiocb *aiocb)
{
- size_t nbytes;
+ ssize_t nbytes;
char *buf;
if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) {
* Try preadv/pwritev first and fall back to linearizing the
* buffer if it's not supported.
*/
- if (preadv_present) {
+ if (preadv_present) {
nbytes = handle_aiocb_rw_vector(aiocb);
if (nbytes == aiocb->aio_nbytes)
- return nbytes;
+ return nbytes;
if (nbytes < 0 && nbytes != -ENOSYS)
return nbytes;
preadv_present = 0;
* Ok, we have to do it the hard way, copy all segments into
* a single aligned buffer.
*/
- buf = qemu_memalign(512, aiocb->aio_nbytes);
+ buf = qemu_blockalign(aiocb->common.bs, aiocb->aio_nbytes);
if (aiocb->aio_type & QEMU_AIO_WRITE) {
char *p = buf;
int i;
return nbytes;
}
+static void posix_aio_notify_event(void);
+
static void *aio_thread(void *unused)
{
- pid_t pid;
-
- pid = getpid();
+ mutex_lock(&lock);
+ pending_threads--;
+ mutex_unlock(&lock);
+ do_spawn_thread();
while (1) {
struct qemu_paiocb *aiocb;
- size_t ret = 0;
+ ssize_t ret = 0;
qemu_timeval tv;
struct timespec ts;
while (QTAILQ_EMPTY(&request_list) &&
!(ret == ETIMEDOUT)) {
+ idle_threads++;
ret = cond_timedwait(&cond, &lock, &ts);
+ idle_threads--;
}
if (QTAILQ_EMPTY(&request_list))
aiocb = QTAILQ_FIRST(&request_list);
QTAILQ_REMOVE(&request_list, aiocb, node);
aiocb->active = 1;
- idle_threads--;
mutex_unlock(&lock);
switch (aiocb->aio_type & QEMU_AIO_TYPE_MASK) {
case QEMU_AIO_READ:
+ ret = handle_aiocb_rw(aiocb);
+ if (ret >= 0 && ret < aiocb->aio_nbytes && aiocb->common.bs->growable) {
+ /* A short read means that we have reached EOF. Pad the buffer
+ * with zeros for bytes after EOF. */
+ QEMUIOVector qiov;
+
+ qemu_iovec_init_external(&qiov, aiocb->aio_iov,
+ aiocb->aio_niov);
+ qemu_iovec_memset_skip(&qiov, 0, aiocb->aio_nbytes - ret, ret);
+
+ ret = aiocb->aio_nbytes;
+ }
+ break;
case QEMU_AIO_WRITE:
- ret = handle_aiocb_rw(aiocb);
- break;
+ ret = handle_aiocb_rw(aiocb);
+ break;
case QEMU_AIO_FLUSH:
- ret = handle_aiocb_flush(aiocb);
- break;
+ ret = handle_aiocb_flush(aiocb);
+ break;
case QEMU_AIO_IOCTL:
- ret = handle_aiocb_ioctl(aiocb);
- break;
- default:
- fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type);
- ret = -EINVAL;
- break;
- }
+ ret = handle_aiocb_ioctl(aiocb);
+ break;
+ default:
+ fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type);
+ ret = -EINVAL;
+ break;
+ }
mutex_lock(&lock);
aiocb->ret = ret;
- idle_threads++;
mutex_unlock(&lock);
- if (kill(pid, aiocb->ev_signo)) die("kill failed");
+ posix_aio_notify_event();
}
- idle_threads--;
cur_threads--;
mutex_unlock(&lock);
return NULL;
}
-static void spawn_thread(void)
+static void do_spawn_thread(void)
{
sigset_t set, oldset;
- cur_threads++;
- idle_threads++;
+ mutex_lock(&lock);
+ if (!new_threads) {
+ mutex_unlock(&lock);
+ return;
+ }
+
+ new_threads--;
+ pending_threads++;
+
+ mutex_unlock(&lock);
/* block all signals */
if (sigfillset(&set)) die("sigfillset");
if (sigprocmask(SIG_SETMASK, &oldset, NULL)) die("sigprocmask restore");
}
+static void spawn_thread_bh_fn(void *opaque)
+{
+ do_spawn_thread();
+}
+
+static void spawn_thread(void)
+{
+ cur_threads++;
+ new_threads++;
+ /* If there are threads being created, they will spawn new workers, so
+ * we don't spend time creating many threads in a loop holding a mutex or
+ * starving the current vcpu.
+ *
+ * If there are no idle threads, ask the main thread to create one, so we
+ * inherit the correct affinity instead of the vcpu affinity.
+ */
+ if (!pending_threads) {
+ qemu_bh_schedule(new_thread_bh);
+ }
+}
+
static void qemu_paio_submit(struct qemu_paiocb *aiocb)
{
aiocb->ret = -EINPROGRESS;
struct qemu_paiocb *acb, **pacb;
int ret;
int result = 0;
- int async_context_id = get_async_context_id();
for(;;) {
pacb = &s->first_aio;
if (!acb)
return result;
- /* we're only interested in requests in the right context */
- if (acb->async_context_id != async_context_id) {
- pacb = &acb->next;
- continue;
- }
-
ret = qemu_paio_error(acb);
if (ret == ECANCELED) {
/* remove the request */
} else {
ret = -ret;
}
+
+ trace_paio_complete(acb, acb->common.opaque, ret);
+
/* remove the request */
*pacb = acb->next;
/* call the callback */
static PosixAioState *posix_aio_state;
-static void aio_signal_handler(int signum)
+static void posix_aio_notify_event(void)
{
- if (posix_aio_state) {
- char byte = 0;
-
- write(posix_aio_state->wfd, &byte, sizeof(byte));
- }
+ char byte = 0;
+ ssize_t ret;
- qemu_service_io();
+ ret = write(posix_aio_state->wfd, &byte, sizeof(byte));
+ if (ret < 0 && errno != EAGAIN)
+ die("write()");
}
static void paio_remove(struct qemu_paiocb *acb)
struct qemu_paiocb *acb = (struct qemu_paiocb *)blockacb;
int active = 0;
+ trace_paio_cancel(acb, acb->common.opaque);
+
mutex_lock(&lock);
if (!acb->active) {
QTAILQ_REMOVE(&request_list, acb, node);
return NULL;
acb->aio_type = type;
acb->aio_fildes = fd;
- acb->ev_signo = SIGUSR2;
- acb->async_context_id = get_async_context_id();
if (qiov) {
acb->aio_iov = qiov->iov;
acb->next = posix_aio_state->first_aio;
posix_aio_state->first_aio = acb;
+ trace_paio_submit(acb, opaque, sector_num, nb_sectors, type);
qemu_paio_submit(acb);
return &acb->common;
}
return NULL;
acb->aio_type = QEMU_AIO_IOCTL;
acb->aio_fildes = fd;
- acb->ev_signo = SIGUSR2;
acb->aio_offset = 0;
acb->aio_ioctl_buf = buf;
acb->aio_ioctl_cmd = req;
int paio_init(void)
{
- struct sigaction act;
PosixAioState *s;
int fds[2];
int ret;
if (posix_aio_state)
return 0;
- s = qemu_malloc(sizeof(PosixAioState));
-
- sigfillset(&act.sa_mask);
- act.sa_flags = 0; /* do not restart syscalls to interrupt select() */
- act.sa_handler = aio_signal_handler;
- sigaction(SIGUSR2, &act, NULL);
+ s = g_malloc(sizeof(PosixAioState));
s->first_aio = NULL;
- if (pipe(fds) == -1) {
+ if (qemu_pipe(fds) == -1) {
fprintf(stderr, "failed to create pipe\n");
+ g_free(s);
return -1;
}
die2(ret, "pthread_attr_setdetachstate");
QTAILQ_INIT(&request_list);
+ new_thread_bh = qemu_bh_new(spawn_thread_bh_fn, NULL);
posix_aio_state = s;
return 0;