2 * QEMU posix-aio emulation
4 * Copyright IBM, Corp. 2008
9 * This work is licensed under the terms of the GNU GPL, version 2. See
10 * the COPYING file in the top-level directory.
14 #include <sys/ioctl.h>
23 #include "qemu-common.h"
25 #include "posix-aio-compat.h"
/* Mutex/condition pair the worker waits on (cond_timedwait(&cond, &lock, ...)
 * in aio_thread). */
27 static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
28 static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
/* Thread handle and attributes handed to thread_create() by spawn_thread();
 * attr is switched to detached state in qemu_paio_init(). */
29 static pthread_t thread_id;
30 static pthread_attr_t attr;
/* Worker accounting: qemu_paio_submit() tests idle_threads == 0 and
 * cur_threads < max_threads before queueing a request. */
31 static int max_threads = 64;
32 static int cur_threads = 0;
33 static int idle_threads = 0;
/* Pending requests: qemu_paio_submit() appends at the tail, aio_thread()
 * removes from the head. */
34 static TAILQ_HEAD(, qemu_paiocb) request_list;
/* Gate for the vectored path in handle_aiocb_rw().
 * NOTE(review): two definitions appear here; presumably they sit in the two
 * arms of a preprocessor conditional that this listing omits - confirm. */
37 static int preadv_present = 1;
39 static int preadv_present = 0;
/*
 * Report a failed call on stderr as "<what> failed: <strerror(err)>".
 *
 * err:  error number to translate with strerror()
 * what: name of the call that failed
 *
 * NOTE(review): the remainder of the body is not visible in this listing;
 * the name and the call sites suggest it does not return - confirm.
 */
42 static void die2(int err, const char *what)
44 fprintf(stderr, "%s failed: %s\n", what, strerror(err));
/*
 * Error helper for call sites that have no explicit error number to pass
 * (used after sigfillset(), sigprocmask() and kill() in aio_thread()).
 * NOTE(review): body not visible in this listing; presumably forwards errno
 * to die2() - confirm.
 */
48 static void die(const char *what)
/* Lock `mutex`; any nonzero result from pthread_mutex_lock() goes to die2(). */
53 static void mutex_lock(pthread_mutex_t *mutex)
55 int ret = pthread_mutex_lock(mutex);
56 if (ret) die2(ret, "pthread_mutex_lock");
/* Unlock `mutex`; any nonzero result from pthread_mutex_unlock() goes to die2(). */
59 static void mutex_unlock(pthread_mutex_t *mutex)
61 int ret = pthread_mutex_unlock(mutex);
62 if (ret) die2(ret, "pthread_mutex_unlock");
/*
 * Wait on `cond` with a deadline via pthread_cond_timedwait().
 * ETIMEDOUT is tolerated; every other nonzero result goes to die2().
 * NOTE(review): the end of the parameter list and the return statement are
 * not visible in this listing; aio_thread() compares the result against
 * ETIMEDOUT, so the pthread result is presumably returned - confirm.
 */
65 static int cond_timedwait(pthread_cond_t *cond, pthread_mutex_t *mutex,
68 int ret = pthread_cond_timedwait(cond, mutex, ts);
69 if (ret && ret != ETIMEDOUT) die2(ret, "pthread_cond_timedwait");
/* Signal `cond`; any nonzero result from pthread_cond_signal() goes to die2(). */
73 static void cond_signal(pthread_cond_t *cond)
75 int ret = pthread_cond_signal(cond);
76 if (ret) die2(ret, "pthread_cond_signal");
/*
 * Start a thread running start_routine(arg) with the given attributes.
 * Any nonzero result from pthread_create() goes to die2().
 */
79 static void thread_create(pthread_t *thread, pthread_attr_t *attr,
80 void *(*start_routine)(void*), void *arg)
82 int ret = pthread_create(thread, attr, start_routine, arg);
83 if (ret) die2(ret, "pthread_create");
/*
 * Run the ioctl described by `aiocb`: command aio_ioctl_cmd with argument
 * aio_ioctl_buf on descriptor aio_fildes.
 * On the visible success path the function returns aiocb->aio_nbytes.
 * NOTE(review): the handling of a failing ioctl() is not visible in this
 * listing; the return type is the unsigned size_t, so a negative error value
 * would need a signed type to survive - confirm.
 */
86 static size_t handle_aiocb_ioctl(struct qemu_paiocb *aiocb)
90 ret = ioctl(aiocb->aio_fildes, aiocb->aio_ioctl_cmd, aiocb->aio_ioctl_buf);
95 * This looks weird, but the aio code only considers a request
96 * successful if it has written the full number of bytes.
98 * Now we overload aio_nbytes as aio_ioctl_cmd for the ioctl command,
99 * so in fact we return the ioctl command here to make posix_aio_read()
102 return aiocb->aio_nbytes;
/*
 * Thin wrappers that forward unchanged to the host preadv()/pwritev().
 * NOTE(review): the return-type lines and any enclosing preprocessor
 * conditional are not visible in this listing.
 */
108 qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
110 return preadv(fd, iov, nr_iov, offset);
114 qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
116 return pwritev(fd, iov, nr_iov, offset);
/*
 * Second definitions of the same two wrappers; their bodies are not visible
 * in this listing.
 * NOTE(review): handle_aiocb_rw() treats -ENOSYS from the vectored path as
 * "fall back to another strategy", so these presumably report that the call
 * is unavailable - confirm.
 */
122 qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
128 qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
136 * Check if we need to copy the data in the aiocb into a new
137 * properly aligned buffer.
139 static int aiocb_needs_copy(struct qemu_paiocb *aiocb)
/* Alignment is only examined for requests flagged QEMU_AIO_SECTOR_ALIGNED. */
141 if (aiocb->aio_flags & QEMU_AIO_SECTOR_ALIGNED) {
/* Look for an iovec whose base address is not a multiple of 512.
 * Only iov_base is tested in the visible lines, not iov_len.
 * NOTE(review): the return statements are not visible in this listing. */
144 for (i = 0; i < aiocb->aio_niov; i++)
145 if ((uintptr_t) aiocb->aio_iov[i].iov_base % 512)
/*
 * Perform the request with a vectored call: qemu_pwritev() for
 * QEMU_PAIO_WRITE, qemu_preadv() otherwise, at aio_offset + offset.
 * The call is repeated for as long as it fails with EINTR.
 * NOTE(review): the return statements are not visible in this listing. The
 * return type is the unsigned size_t, yet handle_aiocb_rw() tests the result
 * for negative values - confirm how errors are meant to be reported.
 */
152 static size_t handle_aiocb_rw_vector(struct qemu_paiocb *aiocb)
158 if (aiocb->aio_type == QEMU_PAIO_WRITE)
159 len = qemu_pwritev(aiocb->aio_fildes,
162 aiocb->aio_offset + offset);
164 len = qemu_preadv(aiocb->aio_fildes,
167 aiocb->aio_offset + offset);
168 } while (len == -1 && errno == EINTR);
/*
 * Perform the request against one contiguous buffer `buf` using
 * pwrite() (QEMU_PAIO_WRITE) or pread() (otherwise).
 * NOTE(review): the return statements and the short-transfer handling are
 * not visible in this listing.
 */
175 static size_t handle_aiocb_rw_linear(struct qemu_paiocb *aiocb, char *buf)
/* Keep issuing calls for the not-yet-transferred tail of the request. */
180 while (offset < aiocb->aio_nbytes) {
181 if (aiocb->aio_type == QEMU_PAIO_WRITE)
182 len = pwrite(aiocb->aio_fildes,
183 (const char *)buf + offset,
184 aiocb->aio_nbytes - offset,
185 aiocb->aio_offset + offset);
187 len = pread(aiocb->aio_fildes,
189 aiocb->aio_nbytes - offset,
190 aiocb->aio_offset + offset);
/* EINTR is singled out from other failures; both branch bodies are missing
 * from this listing. */
192 if (len == -1 && errno == EINTR)
194 else if (len == -1) {
/*
 * Execute a read/write request using the cheapest strategy that applies:
 *   1. aiocb_needs_copy() is false and there is one iovec: plain
 *      pread/pwrite via handle_aiocb_rw_linear() on that buffer;
 *   2. aiocb_needs_copy() is false, several iovecs and preadv_present set:
 *      handle_aiocb_rw_vector();
 *   3. otherwise: go through a single 512-byte aligned bounce buffer.
 *
 * NOTE(review): handle_aiocb_rw_vector() returns size_t. If nbytes is
 * declared unsigned as well (its declaration is not visible in this
 * listing), the test `nbytes < 0` below can never be true, and the
 * comparison with -ENOSYS relies on unsigned wrap-around - confirm and
 * consider a signed type such as ssize_t.
 */
206 static size_t handle_aiocb_rw(struct qemu_paiocb *aiocb)
211 if (!aiocb_needs_copy(aiocb)) {
213 * If there is just a single buffer, and it is properly aligned
214 * we can just use plain pread/pwrite without any problems.
216 if (aiocb->aio_niov == 1)
217 return handle_aiocb_rw_linear(aiocb, aiocb->aio_iov->iov_base);
220 * We have more than one iovec, and all are properly aligned.
222 * Try preadv/pwritev first and fall back to linearizing the
223 * buffer if it's not supported.
225 if (preadv_present) {
226 nbytes = handle_aiocb_rw_vector(aiocb);
227 if (nbytes == aiocb->aio_nbytes)
229 if (nbytes < 0 && nbytes != -ENOSYS)
235 * XXX(hch): short read/write. no easy way to handle the remainder
236 * using these interfaces. For now retry using plain
242 * Ok, we have to do it the hard way, copy all segments into
243 * a single aligned buffer.
/* Bounce buffer: 512-byte aligned and sized for the whole request. */
245 buf = qemu_memalign(512, aiocb->aio_nbytes);
/* Writes: gather every iovec segment into the bounce buffer before the I/O. */
246 if (aiocb->aio_type == QEMU_PAIO_WRITE) {
250 for (i = 0; i < aiocb->aio_niov; ++i) {
251 memcpy(p, aiocb->aio_iov[i].iov_base, aiocb->aio_iov[i].iov_len);
252 p += aiocb->aio_iov[i].iov_len;
/* Do the transfer on the contiguous buffer. */
256 nbytes = handle_aiocb_rw_linear(aiocb, buf);
/* Non-write requests: scatter the buffer back into the caller's iovecs.
 * The loop stops once `count` (initially aio_nbytes) reaches zero, and each
 * segment receives at most its own iov_len bytes.
 * NOTE(review): the statements that set `copy` and advance `p`/`count` are
 * not visible in this listing. */
257 if (aiocb->aio_type != QEMU_PAIO_WRITE) {
259 size_t count = aiocb->aio_nbytes, copy;
262 for (i = 0; i < aiocb->aio_niov && count; ++i) {
264 if (copy > aiocb->aio_iov[i].iov_len)
265 copy = aiocb->aio_iov[i].iov_len;
266 memcpy(aiocb->aio_iov[i].iov_base, p, copy);
/*
 * Worker thread body: wait for a queued request, dequeue it, run it through
 * the handler matching its aio_type, then signal completion with kill().
 * The argument is unused.
 * NOTE(review): locking, the idle/cur thread accounting, the storing of the
 * result and the origin of `pid` are on lines missing from this listing.
 */
276 static void *aio_thread(void *unused)
283 /* block all signals */
284 if (sigfillset(&set)) die("sigfillset");
/* NOTE(review): POSIX leaves sigprocmask() unspecified in a multithreaded
 * process; pthread_sigmask() is the per-thread equivalent - consider it. */
285 if (sigprocmask(SIG_BLOCK, &set, NULL)) die("sigprocmask");
288 struct qemu_paiocb *aiocb;
/* Absolute deadline for the wait below: ten seconds from now. */
293 qemu_gettimeofday(&tv);
294 ts.tv_sec = tv.tv_sec + 10;
/* Sleep until a request is queued or the deadline passes. */
299 while (TAILQ_EMPTY(&request_list) &&
300 !(ret == ETIMEDOUT)) {
301 ret = cond_timedwait(&cond, &lock, &ts);
/* Nothing arrived: the branch body is not visible; presumably the worker
 * leaves its loop here - confirm. */
304 if (TAILQ_EMPTY(&request_list))
/* Take the oldest request off the head of the queue. */
307 aiocb = TAILQ_FIRST(&request_list);
308 TAILQ_REMOVE(&request_list, aiocb, node);
/* Dispatch on the request type; unknown types are reported on stderr. */
313 switch (aiocb->aio_type) {
315 case QEMU_PAIO_WRITE:
316 ret = handle_aiocb_rw(aiocb);
318 case QEMU_PAIO_IOCTL:
319 ret = handle_aiocb_ioctl(aiocb);
322 fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type);
/* Notify `pid` of completion using the signal number stored in the request.
 * NOTE(review): the other die() call sites pass the bare call name; if die()
 * formats like die2() ("%s failed: ..."), this message would read
 * "kill failed failed" - confirm. */
332 if (kill(pid, aiocb->ev_signo)) die("kill failed");
/*
 * Start one more worker running aio_thread(), using the file-scope
 * thread_id and attr.
 * NOTE(review): lines between the signature and the call are missing from
 * this listing; any thread-counter updates are not visible.
 */
342 static void spawn_thread(void)
346 thread_create(&thread_id, &attr, aio_thread, NULL);
/*
 * One-time setup: initialise the shared thread attributes, mark them
 * detached, and initialise the empty request queue. Failures of the pthread
 * calls go to die2().
 * NOTE(review): no use of `aioinit` and no return statement are visible in
 * this listing.
 */
349 int qemu_paio_init(struct qemu_paioinit *aioinit)
353 ret = pthread_attr_init(&attr);
354 if (ret) die2(ret, "pthread_attr_init");
356 ret = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
357 if (ret) die2(ret, "pthread_attr_setdetachstate");
359 TAILQ_INIT(&request_list);
/*
 * Common submission path for read, write and ioctl requests: tag the request
 * with `type`, mark it in progress, and append it to the request queue.
 * NOTE(review): locking, the wake-up of a worker and the return value are on
 * lines missing from this listing.
 */
364 static int qemu_paio_submit(struct qemu_paiocb *aiocb, int type)
366 aiocb->aio_type = type;
367 aiocb->ret = -EINPROGRESS;
/* No idle worker and still below the thread cap: the branch body is not
 * visible; presumably it calls spawn_thread() - confirm. */
370 if (idle_threads == 0 && cur_threads < max_threads)
372 TAILQ_INSERT_TAIL(&request_list, aiocb, node);
/* Queue `aiocb` as a read request; returns whatever qemu_paio_submit() returns. */
379 int qemu_paio_read(struct qemu_paiocb *aiocb)
381 return qemu_paio_submit(aiocb, QEMU_PAIO_READ);
/* Queue `aiocb` as a write request; returns whatever qemu_paio_submit() returns. */
384 int qemu_paio_write(struct qemu_paiocb *aiocb)
386 return qemu_paio_submit(aiocb, QEMU_PAIO_WRITE);
/* Queue `aiocb` as an ioctl request; returns whatever qemu_paio_submit() returns. */
389 int qemu_paio_ioctl(struct qemu_paiocb *aiocb)
391 return qemu_paio_submit(aiocb, QEMU_PAIO_IOCTL);
/*
 * Fetch the result of a request as an ssize_t.
 * NOTE(review): body not visible in this listing; qemu_paio_submit() stores
 * -EINPROGRESS in aiocb->ret, so this presumably reads that field - confirm.
 */
394 ssize_t qemu_paio_return(struct qemu_paiocb *aiocb)
/*
 * Derive an error status for the request from qemu_paio_return().
 * NOTE(review): the mapping from `ret` to the returned int is on lines
 * missing from this listing.
 */
405 int qemu_paio_error(struct qemu_paiocb *aiocb)
407 ssize_t ret = qemu_paio_return(aiocb);
/*
 * Try to cancel `aiocb`. Three outcomes are visible:
 *   - not yet active: removed from the queue, ret set to -ECANCELED,
 *     result QEMU_PAIO_CANCELED;
 *   - active and still -EINPROGRESS: result QEMU_PAIO_NOTCANCELED;
 *   - otherwise: result QEMU_PAIO_ALLDONE.
 * `fd` is not used in the visible lines.
 * NOTE(review): locking and the return statement are on lines missing from
 * this listing.
 */
417 int qemu_paio_cancel(int fd, struct qemu_paiocb *aiocb)
422 if (!aiocb->active) {
423 TAILQ_REMOVE(&request_list, aiocb, node);
424 aiocb->ret = -ECANCELED;
425 ret = QEMU_PAIO_CANCELED;
426 } else if (aiocb->ret == -EINPROGRESS)
427 ret = QEMU_PAIO_NOTCANCELED;
429 ret = QEMU_PAIO_ALLDONE;