// SPDX-License-Identifier: GPL-2.0
/*
 * Shared application/kernel submission and completion ring pairs, for
 * supporting fast/efficient IO.
 *
 * A note on the read/write ordering memory barriers that are matched between
 * the application and kernel side. When the application reads the CQ ring
 * tail, it must use an appropriate smp_rmb() to order with the smp_wmb()
 * the kernel uses after writing the tail. Failure to do so could cause a
 * delay in when the application notices that completion events are available.
 * This isn't a fatal condition. Likewise, the application must use an
 * appropriate smp_wmb() both before writing the SQ tail, and after writing
 * the SQ tail. The first one orders the sqe writes with the tail write, and
 * the latter is paired with the smp_rmb() the kernel will issue before
 * reading the SQ tail on submission.
 *
 * Also see the examples in the liburing library:
 *
 *	git://git.kernel.dk/liburing
 *
 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
 * from data shared between the kernel and application. This is done both
 * for ordering purposes, but also to ensure that once a value is loaded from
 * data that the application could potentially modify, it remains stable.
 *
 * Copyright (C) 2018-2019 Jens Axboe
 */
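
/*
 * For illustration, a minimal sketch of the application-side CQ reap loop
 * that the barriers above pair with (field names are illustrative, not the
 * exact liburing API):
 *
 *	unsigned head = *cq_head;
 *
 *	while (head != READ_ONCE(*cq_tail)) {
 *		smp_rmb();	<- pairs with the kernel smp_wmb() after the tail write
 *		cqe = &cqes[head & *cq_ring_mask];
 *		... consume cqe->user_data, cqe->res, cqe->flags ...
 *		head++;
 *	}
 *	WRITE_ONCE(*cq_head, head);
 */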
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/compat.h>
#include <linux/refcount.h>
#include <linux/uio.h>

#include <linux/sched/signal.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/mmu_context.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/blkdev.h>
#include <linux/net.h>
#include <net/sock.h>
#include <net/af_unix.h>
#include <linux/anon_inodes.h>
#include <linux/sched/mm.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>

#include <uapi/linux/io_uring.h>

#include "internal.h"

#define IORING_MAX_ENTRIES	4096

struct io_uring {
	u32 head ____cacheline_aligned_in_smp;
	u32 tail ____cacheline_aligned_in_smp;
};

struct io_sq_ring {
	struct io_uring r;
	u32 ring_mask;
	u32 ring_entries;
	u32 dropped;
	u32 flags;
	u32 array[];
};

struct io_cq_ring {
	struct io_uring r;
	u32 ring_mask;
	u32 ring_entries;
	u32 overflow;
	struct io_uring_cqe cqes[];
};

struct io_ring_ctx {
	struct {
		struct percpu_ref refs;
	} ____cacheline_aligned_in_smp;

	struct {
		unsigned int flags;
		bool compat;
		bool account_mem;

		/* SQ ring */
		struct io_sq_ring *sq_ring;
		unsigned cached_sq_head;
		unsigned sq_entries;
		unsigned sq_mask;
		struct io_uring_sqe *sq_sqes;
	} ____cacheline_aligned_in_smp;

	/* IO offload */
	struct workqueue_struct *sqo_wq;
	struct mm_struct *sqo_mm;

	struct {
		/* CQ ring */
		struct io_cq_ring *cq_ring;
		unsigned cached_cq_tail;
		unsigned cq_entries;
		unsigned cq_mask;
		struct wait_queue_head cq_wait;
		struct fasync_struct *cq_fasync;
	} ____cacheline_aligned_in_smp;

	struct user_struct *user;

	struct completion ctx_done;

	struct {
		struct mutex uring_lock;
		wait_queue_head_t wait;
	} ____cacheline_aligned_in_smp;

	struct {
		spinlock_t completion_lock;
	} ____cacheline_aligned_in_smp;

#if defined(CONFIG_UNIX)
	struct socket *ring_sock;
#endif
};

struct sqe_submit {
	const struct io_uring_sqe *sqe;
	unsigned short index;
	bool has_user;
};

struct io_kiocb {
	struct kiocb rw;

	struct sqe_submit submit;

	struct io_ring_ctx *ctx;
	struct list_head list;
	unsigned int flags;
#define REQ_F_FORCE_NONBLOCK	1	/* inline submission attempt */
	u64 user_data;

	struct work_struct work;
};

#define IO_PLUG_THRESHOLD	2

static struct kmem_cache *req_cachep;

static const struct file_operations io_uring_fops;

struct sock *io_uring_get_socket(struct file *file)
{
#if defined(CONFIG_UNIX)
	if (file->f_op == &io_uring_fops) {
		struct io_ring_ctx *ctx = file->private_data;

		return ctx->ring_sock->sk;
	}
#endif
	return NULL;
}
EXPORT_SYMBOL(io_uring_get_socket);

static void io_ring_ctx_ref_free(struct percpu_ref *ref)
{
	struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);

	complete(&ctx->ctx_done);
}

static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{
	struct io_ring_ctx *ctx;

	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return NULL;

	if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, 0, GFP_KERNEL)) {
		kfree(ctx);
		return NULL;
	}

	ctx->flags = p->flags;
	init_waitqueue_head(&ctx->cq_wait);
	init_completion(&ctx->ctx_done);
	mutex_init(&ctx->uring_lock);
	init_waitqueue_head(&ctx->wait);
	spin_lock_init(&ctx->completion_lock);
	return ctx;
}

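/*
 * Flush the cached CQ tail to the shared ring, making queued completions
 * visible to the application, and wake anyone polling or waiting on the ring.
 */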
static void io_commit_cqring(struct io_ring_ctx *ctx)
{
	struct io_cq_ring *ring = ctx->cq_ring;

	if (ctx->cached_cq_tail != READ_ONCE(ring->r.tail)) {
		/* order cqe stores with ring update */
		smp_store_release(&ring->r.tail, ctx->cached_cq_tail);

		/*
		 * Write side barrier of tail update, app has read side. See
		 * comment at the top of this file.
		 */
		smp_wmb();

		if (wq_has_sleeper(&ctx->cq_wait)) {
			wake_up_interruptible(&ctx->cq_wait);
			kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
		}
	}
}

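/*
 * Return the next free CQE slot, or NULL if the CQ ring is full. On success
 * the cached CQ tail is advanced; io_commit_cqring() later publishes it to
 * the application.
 */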
static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
{
	struct io_cq_ring *ring = ctx->cq_ring;
	unsigned tail;

	tail = ctx->cached_cq_tail;
	/* See comment at the top of the file */
	smp_rmb();
	if (tail + 1 == READ_ONCE(ring->r.head))
		return NULL;

	ctx->cached_cq_tail++;
	return &ring->cqes[tail & ctx->cq_mask];
}

static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
				 long res, unsigned ev_flags)
{
	struct io_uring_cqe *cqe;

	/*
	 * If we can't get a cq entry, userspace overflowed the
	 * submission (by quite a lot). Increment the overflow count in
	 * the ring.
	 */
	cqe = io_get_cqring(ctx);
	if (cqe) {
		WRITE_ONCE(cqe->user_data, ki_user_data);
		WRITE_ONCE(cqe->res, res);
		WRITE_ONCE(cqe->flags, ev_flags);
	} else {
		unsigned overflow = READ_ONCE(ctx->cq_ring->overflow);

		WRITE_ONCE(ctx->cq_ring->overflow, overflow + 1);
	}
}

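/*
 * Post a completion event for @ki_user_data under the completion lock,
 * commit it to the shared CQ ring, and wake anyone sleeping in
 * io_cqring_wait().
 */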
static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 ki_user_data,
				long res, unsigned ev_flags)
{
	unsigned long flags;

	spin_lock_irqsave(&ctx->completion_lock, flags);
	io_cqring_fill_event(ctx, ki_user_data, res, ev_flags);
	io_commit_cqring(ctx);
	spin_unlock_irqrestore(&ctx->completion_lock, flags);

	if (waitqueue_active(&ctx->wait))
		wake_up(&ctx->wait);
}

static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs)
{
	percpu_ref_put_many(&ctx->refs, refs);

	if (waitqueue_active(&ctx->wait))
		wake_up(&ctx->wait);
}

static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx)
{
	struct io_kiocb *req;

	if (!percpu_ref_tryget(&ctx->refs))
		return NULL;

	req = kmem_cache_alloc(req_cachep, __GFP_NOWARN);
	if (req) {
		req->ctx = ctx;
		req->flags = 0;
		return req;
	}

	io_ring_drop_ctx_refs(ctx, 1);
	return NULL;
}

static void io_free_req(struct io_kiocb *req)
{
	io_ring_drop_ctx_refs(req->ctx, 1);
	kmem_cache_free(req_cachep, req);
}

static void kiocb_end_write(struct kiocb *kiocb)
{
	if (kiocb->ki_flags & IOCB_WRITE) {
		struct inode *inode = file_inode(kiocb->ki_filp);

		/*
		 * Tell lockdep we inherited freeze protection from submission
		 * thread.
		 */
		if (S_ISREG(inode->i_mode))
			__sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
		file_end_write(kiocb->ki_filp);
	}
}

static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
{
	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);

	kiocb_end_write(kiocb);

	fput(kiocb->ki_filp);
	io_cqring_add_event(req->ctx, req->user_data, res, 0);
	io_free_req(req);
}

/*
 * If we tracked the file through the SCM inflight mechanism, we could support
 * any file. For now, just ensure that anything potentially problematic is done
 * inline.
 */
static bool io_file_supports_async(struct file *file)
{
	umode_t mode = file_inode(file)->i_mode;

	if (S_ISBLK(mode) || S_ISCHR(mode))
		return true;
	if (S_ISREG(mode) && file->f_op != &io_uring_fops)
		return true;

	return false;
}

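/*
 * Common prep for READV/WRITEV: look up the file, validate ioprio and
 * rw_flags from the sqe, and fill in the kiocb. For a forced non-blocking
 * attempt, IOCB_NOWAIT is set and the request is flagged so the async retry
 * can clear it again.
 */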
static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
		      bool force_nonblock)
{
	struct kiocb *kiocb = &req->rw;
	unsigned ioprio;
	int fd, ret;

	/* For -EAGAIN retry, everything is already prepped */
	if (kiocb->ki_filp)
		return 0;

	fd = READ_ONCE(sqe->fd);
	kiocb->ki_filp = fget(fd);
	if (unlikely(!kiocb->ki_filp))
		return -EBADF;
	if (force_nonblock && !io_file_supports_async(kiocb->ki_filp))
		force_nonblock = false;
	kiocb->ki_pos = READ_ONCE(sqe->off);
	kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
	kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));

	ioprio = READ_ONCE(sqe->ioprio);
	if (ioprio) {
		ret = ioprio_check_cap(ioprio);
		if (ret)
			goto out_fput;

		kiocb->ki_ioprio = ioprio;
	} else
		kiocb->ki_ioprio = get_current_ioprio();

	ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
	if (unlikely(ret))
		goto out_fput;
	if (force_nonblock) {
		kiocb->ki_flags |= IOCB_NOWAIT;
		req->flags |= REQ_F_FORCE_NONBLOCK;
	}
	if (kiocb->ki_flags & IOCB_HIPRI) {
		ret = -EINVAL;
		goto out_fput;
	}

	kiocb->ki_complete = io_complete_rw;
	return 0;
out_fput:
	fput(kiocb->ki_filp);
	return ret;
}

static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
{
	switch (ret) {
	case -EIOCBQUEUED:
		break;
	case -ERESTARTSYS:
	case -ERESTARTNOINTR:
	case -ERESTARTNOHAND:
	case -ERESTART_RESTARTBLOCK:
		/*
		 * We can't just restart the syscall, since previously
		 * submitted sqes may already be in progress. Just fail this
		 * IO with EINTR.
		 */
		ret = -EINTR;
		/* fall through */
	default:
		kiocb->ki_complete(kiocb, ret, 0);
	}
}

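/*
 * Build an iov_iter from the user iovec array the sqe points at, using the
 * compat import path if the ring was set up from a compat task.
 */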
static int io_import_iovec(struct io_ring_ctx *ctx, int rw,
			   const struct sqe_submit *s, struct iovec **iovec,
			   struct iov_iter *iter)
{
	const struct io_uring_sqe *sqe = s->sqe;
	void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
	size_t sqe_len = READ_ONCE(sqe->len);

	if (!s->has_user)
		return -EFAULT;

#ifdef CONFIG_COMPAT
	if (ctx->compat)
		return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
						iovec, iter);
#endif

	return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
}

static ssize_t io_read(struct io_kiocb *req, const struct sqe_submit *s,
		       bool force_nonblock)
{
	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
	struct kiocb *kiocb = &req->rw;
	struct iov_iter iter;
	struct file *file;
	ssize_t ret;

	ret = io_prep_rw(req, s->sqe, force_nonblock);
	if (ret)
		return ret;
	file = kiocb->ki_filp;

	ret = -EBADF;
	if (unlikely(!(file->f_mode & FMODE_READ)))
		goto out_fput;
	ret = -EINVAL;
	if (unlikely(!file->f_op->read_iter))
		goto out_fput;

	ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter);
	if (ret)
		goto out_fput;

	ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_iter_count(&iter));
	if (!ret) {
		ssize_t ret2;

		/* Catch -EAGAIN return for forced non-blocking submission */
		ret2 = call_read_iter(file, kiocb, &iter);
		if (!force_nonblock || ret2 != -EAGAIN)
			io_rw_done(kiocb, ret2);
		else
			ret = -EAGAIN;
	}
	kfree(iovec);
out_fput:
	/* Hold on to the file for -EAGAIN */
	if (unlikely(ret && ret != -EAGAIN))
		fput(file);
	return ret;
}

static ssize_t io_write(struct io_kiocb *req, const struct sqe_submit *s,
			bool force_nonblock)
{
	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
	struct kiocb *kiocb = &req->rw;
	struct iov_iter iter;
	struct file *file;
	ssize_t ret;

	ret = io_prep_rw(req, s->sqe, force_nonblock);
	if (ret)
		return ret;
	/* Hold on to the file for -EAGAIN */
	if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT))
		return -EAGAIN;

	ret = -EBADF;
	file = kiocb->ki_filp;
	if (unlikely(!(file->f_mode & FMODE_WRITE)))
		goto out_fput;
	ret = -EINVAL;
	if (unlikely(!file->f_op->write_iter))
		goto out_fput;

	ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter);
	if (ret)
		goto out_fput;

	ret = rw_verify_area(WRITE, file, &kiocb->ki_pos,
				iov_iter_count(&iter));
	if (!ret) {
		/*
		 * Open-code file_start_write here to grab freeze protection,
		 * which will be released by another thread in
		 * io_complete_rw(). Fool lockdep by telling it the lock got
		 * released so that it doesn't complain about the held lock when
		 * we return to userspace.
		 */
		if (S_ISREG(file_inode(file)->i_mode)) {
			__sb_start_write(file_inode(file)->i_sb,
						SB_FREEZE_WRITE, true);
			__sb_writers_release(file_inode(file)->i_sb,
						SB_FREEZE_WRITE);
		}
		kiocb->ki_flags |= IOCB_WRITE;
		io_rw_done(kiocb, call_write_iter(file, kiocb, &iter));
	}
	kfree(iovec);
out_fput:
	if (unlikely(ret))
		fput(file);
	return ret;
}

/*
 * IORING_OP_NOP just posts a completion event, nothing else.
 */
static int io_nop(struct io_kiocb *req, u64 user_data)
{
	struct io_ring_ctx *ctx = req->ctx;
	long err = 0;

	/*
	 * Twilight zone - it's possible that someone issued an opcode that
	 * has a file attached, then got -EAGAIN on submission, and changed
	 * the sqe before we retried it from async context. Avoid dropping
	 * a file reference for this malicious case, and flag the error.
	 */
	if (req->rw.ki_filp) {
		err = -EBADF;
		fput(req->rw.ki_filp);
	}
	io_cqring_add_event(ctx, user_data, err, 0);
	io_free_req(req);
	return 0;
}

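/*
 * Dispatch a single sqe based on its opcode. Called both for the inline
 * (possibly non-blocking) submission attempt and for the async retry from
 * workqueue context.
 */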
static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
			   const struct sqe_submit *s, bool force_nonblock)
{
	ssize_t ret;
	int opcode;

	if (unlikely(s->index >= ctx->sq_entries))
		return -EINVAL;
	req->user_data = READ_ONCE(s->sqe->user_data);

	opcode = READ_ONCE(s->sqe->opcode);
	switch (opcode) {
	case IORING_OP_NOP:
		ret = io_nop(req, req->user_data);
		break;
	case IORING_OP_READV:
		ret = io_read(req, s, force_nonblock);
		break;
	case IORING_OP_WRITEV:
		ret = io_write(req, s, force_nonblock);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

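/*
 * Async retry path: runs from workqueue context with the submitting task's
 * mm attached, so user memory referenced by the (copied) sqe can be reached.
 * Any failure is reported to the application as a completion event here.
 */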
static void io_sq_wq_submit_work(struct work_struct *work)
{
	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
	struct sqe_submit *s = &req->submit;
	const struct io_uring_sqe *sqe = s->sqe;
	struct io_ring_ctx *ctx = req->ctx;
	mm_segment_t old_fs = get_fs();
	int ret;

	/* Ensure we clear previously set forced non-block flag */
	req->flags &= ~REQ_F_FORCE_NONBLOCK;
	req->rw.ki_flags &= ~IOCB_NOWAIT;

	if (!mmget_not_zero(ctx->sqo_mm)) {
		ret = -EFAULT;
		goto err;
	}

	use_mm(ctx->sqo_mm);
	set_fs(USER_DS);
	s->has_user = true;

	ret = __io_submit_sqe(ctx, req, s, false);

	set_fs(old_fs);
	unuse_mm(ctx->sqo_mm);
	mmput(ctx->sqo_mm);
err:
	if (ret) {
		io_cqring_add_event(ctx, sqe->user_data, ret, 0);
		io_free_req(req);
	}

	/* async context always uses a copy of the sqe */
	kfree(sqe);
}

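/*
 * Inline submission attempt. If the operation would block (-EAGAIN), the sqe
 * is copied (the ring entry may be reused by the application) and the request
 * is punted to the workqueue for async processing.
 */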
static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s)
{
	struct io_kiocb *req;
	ssize_t ret;

	/* enforce forwards compatibility on users */
	if (unlikely(s->sqe->flags))
		return -EINVAL;

	req = io_get_req(ctx);
	if (unlikely(!req))
		return -EAGAIN;

	req->rw.ki_filp = NULL;

	ret = __io_submit_sqe(ctx, req, s, true);
	if (ret == -EAGAIN) {
		struct io_uring_sqe *sqe_copy;

		sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL);
		if (sqe_copy) {
			memcpy(sqe_copy, s->sqe, sizeof(*sqe_copy));
			s->sqe = sqe_copy;

			memcpy(&req->submit, s, sizeof(*s));
			INIT_WORK(&req->work, io_sq_wq_submit_work);
			queue_work(ctx->sqo_wq, &req->work);
			ret = 0;
		}
	}
	if (ret)
		io_free_req(req);

	return ret;
}

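/*
 * Publish the updated SQ head to the shared ring, letting the application
 * know those entries have been consumed and may be reused.
 */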
static void io_commit_sqring(struct io_ring_ctx *ctx)
{
	struct io_sq_ring *ring = ctx->sq_ring;

	if (ctx->cached_sq_head != READ_ONCE(ring->r.head)) {
		/*
		 * Ensure any loads from the SQEs are done at this point,
		 * since once we write the new head, the application could
		 * write new data to them.
		 */
		smp_store_release(&ring->r.head, ctx->cached_sq_head);

		/*
		 * Write side barrier of head update, app has read side. See
		 * comment at the top of this file.
		 */
		smp_wmb();
	}
}

/*
 * Undo last io_get_sqring()
 */
static void io_drop_sqring(struct io_ring_ctx *ctx)
{
	ctx->cached_sq_head--;
}

/*
 * Fetch an sqe, if one is available. Note that s->sqe will point to memory
 * that is mapped by userspace. This means that care needs to be taken to
 * ensure that reads are stable, as we cannot rely on userspace always
 * being a good citizen. If members of the sqe are validated and then later
 * used, it's important that those reads are done through READ_ONCE() to
 * prevent a re-load down the line.
 */
static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
{
	struct io_sq_ring *ring = ctx->sq_ring;
	unsigned head;

	/*
	 * The cached sq head (or cq tail) serves two purposes:
	 *
	 * 1) allows us to batch the cost of updating the user visible
	 *    head.
	 * 2) allows the kernel side to track the head on its own, even
	 *    though the application is the one updating it.
	 */
	head = ctx->cached_sq_head;
	/* See comment at the top of this file */
	smp_rmb();
	if (head == READ_ONCE(ring->r.tail))
		return false;

	head = READ_ONCE(ring->array[head & ctx->sq_mask]);
	if (head < ctx->sq_entries) {
		s->index = head;
		s->sqe = &ctx->sq_sqes[head];
		ctx->cached_sq_head++;
		return true;
	}

	/* drop invalid entries */
	ctx->cached_sq_head++;
	ring->dropped++;
	/* See comment at the top of this file */
	smp_wmb();
	return false;
}

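/*
 * Consume and submit up to @to_submit sqes from the ring, using block layer
 * plugging when enough requests are queued in one go.
 */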
static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
{
	int i, ret = 0, submit = 0;
	struct blk_plug plug;

	if (to_submit > IO_PLUG_THRESHOLD)
		blk_start_plug(&plug);

	for (i = 0; i < to_submit; i++) {
		struct sqe_submit s;

		if (!io_get_sqring(ctx, &s))
			break;

		s.has_user = true;
		ret = io_submit_sqe(ctx, &s);
		if (ret) {
			io_drop_sqring(ctx);
			break;
		}

		submit++;
	}
	io_commit_sqring(ctx);

	if (to_submit > IO_PLUG_THRESHOLD)
		blk_finish_plug(&plug);

	return submit ? submit : ret;
}

static unsigned io_cqring_events(struct io_cq_ring *ring)
{
	return READ_ONCE(ring->r.tail) - READ_ONCE(ring->r.head);
}

/*
 * Wait until events become available, if we don't already have some. The
 * application must reap them itself, as they reside on the shared cq ring.
 */
static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
			  const sigset_t __user *sig, size_t sigsz)
{
	struct io_cq_ring *ring = ctx->cq_ring;
	sigset_t ksigmask, sigsaved;
	DEFINE_WAIT(wait);
	int ret;

	/* See comment at the top of this file */
	smp_rmb();
	if (io_cqring_events(ring) >= min_events)
		return 0;

	if (sig) {
		ret = set_user_sigmask(sig, &ksigmask, &sigsaved, sigsz);
		if (ret)
			return ret;
	}

	do {
		prepare_to_wait(&ctx->wait, &wait, TASK_INTERRUPTIBLE);

		ret = 0;
		/* See comment at the top of this file */
		smp_rmb();
		if (io_cqring_events(ring) >= min_events)
			break;

		schedule();

		ret = -EINTR;
		if (signal_pending(current))
			break;
	} while (1);

	finish_wait(&ctx->wait, &wait);

	if (sig)
		restore_user_sigmask(sig, &sigsaved);

	return READ_ONCE(ring->r.head) == READ_ONCE(ring->r.tail) ? ret : 0;
}

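/*
 * Set up async offload: pin the submitter's mm for later use_mm() from the
 * workqueue, and create the unbound workqueue requests are punted to.
 */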
static int io_sq_offload_start(struct io_ring_ctx *ctx)
{
	int ret;

	mmgrab(current->mm);
	ctx->sqo_mm = current->mm;

	/* Do QD, or 2 * CPUS, whatever is smallest */
	ctx->sqo_wq = alloc_workqueue("io_ring-wq", WQ_UNBOUND | WQ_FREEZABLE,
			min(ctx->sq_entries - 1, 2 * num_online_cpus()));
	if (!ctx->sqo_wq) {
		ret = -ENOMEM;
		goto err;
	}

	return 0;
err:
	mmdrop(ctx->sqo_mm);
	ctx->sqo_mm = NULL;
	return ret;
}

static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages)
{
	atomic_long_sub(nr_pages, &user->locked_vm);
}

static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
	unsigned long page_limit, cur_pages, new_pages;

	/* Don't allow more pages than we can safely lock */
	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	do {
		cur_pages = atomic_long_read(&user->locked_vm);
		new_pages = cur_pages + nr_pages;
		if (new_pages > page_limit)
			return -ENOMEM;
	} while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
					new_pages) != cur_pages);

	return 0;
}

static void io_mem_free(void *ptr)
{
	struct page *page = virt_to_head_page(ptr);

	if (put_page_testzero(page))
		free_compound_page(page);
}

static void *io_mem_alloc(size_t size)
{
	gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
				__GFP_NORETRY;

	return (void *) __get_free_pages(gfp_flags, get_order(size));
}

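/*
 * Number of pages needed for the SQ ring, the sqe array, and the CQ ring at
 * the given entry counts; used for RLIMIT_MEMLOCK accounting.
 */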
static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
{
	struct io_sq_ring *sq_ring;
	struct io_cq_ring *cq_ring;
	size_t bytes;

	bytes = struct_size(sq_ring, array, sq_entries);
	bytes += array_size(sizeof(struct io_uring_sqe), sq_entries);
	bytes += struct_size(cq_ring, cqes, cq_entries);

	return (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
}

static void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
	if (ctx->sqo_wq)
		destroy_workqueue(ctx->sqo_wq);
	if (ctx->sqo_mm)
		mmdrop(ctx->sqo_mm);
#if defined(CONFIG_UNIX)
	if (ctx->ring_sock)
		sock_release(ctx->ring_sock);
#endif

	io_mem_free(ctx->sq_ring);
	io_mem_free(ctx->sq_sqes);
	io_mem_free(ctx->cq_ring);

	percpu_ref_exit(&ctx->refs);
	if (ctx->account_mem)
		io_unaccount_mem(ctx->user,
				ring_pages(ctx->sq_entries, ctx->cq_entries));
	free_uid(ctx->user);
	kfree(ctx);
}

static __poll_t io_uring_poll(struct file *file, poll_table *wait)
{
	struct io_ring_ctx *ctx = file->private_data;
	__poll_t mask = 0;

	poll_wait(file, &ctx->cq_wait, wait);
	/* See comment at the top of this file */
	smp_rmb();
	if (READ_ONCE(ctx->sq_ring->r.tail) + 1 != ctx->cached_sq_head)
		mask |= EPOLLOUT | EPOLLWRNORM;
	if (READ_ONCE(ctx->cq_ring->r.head) != ctx->cached_cq_tail)
		mask |= EPOLLIN | EPOLLRDNORM;

	return mask;
}

static int io_uring_fasync(int fd, struct file *file, int on)
{
	struct io_ring_ctx *ctx = file->private_data;

	return fasync_helper(fd, file, on, &ctx->cq_fasync);
}

static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
{
	mutex_lock(&ctx->uring_lock);
	percpu_ref_kill(&ctx->refs);
	mutex_unlock(&ctx->uring_lock);

	wait_for_completion(&ctx->ctx_done);
	io_ring_ctx_free(ctx);
}

static int io_uring_release(struct inode *inode, struct file *file)
{
	struct io_ring_ctx *ctx = file->private_data;

	file->private_data = NULL;
	io_ring_ctx_wait_and_kill(ctx);
	return 0;
}

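/*
 * Map the SQ ring, CQ ring, or sqe array into the application's address
 * space, selected by the mmap offset the application passed in.
 */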
static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
{
	loff_t offset = (loff_t) vma->vm_pgoff << PAGE_SHIFT;
	unsigned long sz = vma->vm_end - vma->vm_start;
	struct io_ring_ctx *ctx = file->private_data;
	unsigned long pfn;
	struct page *page;
	void *ptr;

	switch (offset) {
	case IORING_OFF_SQ_RING:
		ptr = ctx->sq_ring;
		break;
	case IORING_OFF_SQES:
		ptr = ctx->sq_sqes;
		break;
	case IORING_OFF_CQ_RING:
		ptr = ctx->cq_ring;
		break;
	default:
		return -EINVAL;
	}

	page = virt_to_head_page(ptr);
	if (sz > (PAGE_SIZE << compound_order(page)))
		return -EINVAL;

	pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
}

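/*
 * io_uring_enter(2): submit up to @to_submit sqes and, if
 * IORING_ENTER_GETEVENTS is set, wait for at least @min_complete completion
 * events, optionally with a temporary signal mask installed.
 */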
SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
		u32, min_complete, u32, flags, const sigset_t __user *, sig,
		size_t, sigsz)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	int submitted = 0;
	struct fd f;

	if (flags & ~IORING_ENTER_GETEVENTS)
		return -EINVAL;

	f = fdget(fd);
	if (!f.file)
		return -EBADF;

	ret = -EOPNOTSUPP;
	if (f.file->f_op != &io_uring_fops)
		goto out_fput;

	ret = -ENXIO;
	ctx = f.file->private_data;
	if (!percpu_ref_tryget(&ctx->refs))
		goto out_fput;

	ret = 0;
	if (to_submit) {
		to_submit = min(to_submit, ctx->sq_entries);

		mutex_lock(&ctx->uring_lock);
		submitted = io_ring_submit(ctx, to_submit);
		mutex_unlock(&ctx->uring_lock);

		if (submitted < 0)
			goto out_ctx;
	}
	if (flags & IORING_ENTER_GETEVENTS) {
		min_complete = min(min_complete, ctx->cq_entries);

		/*
		 * The application could have included the 'to_submit' count
		 * in how many events it wanted to wait for. If we failed to
		 * submit the desired count, we may need to adjust the number
		 * of events to poll/wait for.
		 */
		if (submitted < to_submit)
			min_complete = min_t(unsigned, submitted, min_complete);

		ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
	}

out_ctx:
	io_ring_drop_ctx_refs(ctx, 1);
out_fput:
	fdput(f);
	return submitted ? submitted : ret;
}

static const struct file_operations io_uring_fops = {
	.release	= io_uring_release,
	.mmap		= io_uring_mmap,
	.poll		= io_uring_poll,
	.fasync		= io_uring_fasync,
};

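/*
 * Allocate and initialize the SQ ring, the sqe array, and the CQ ring, and
 * record the masks and entry counts in the context.
 */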
static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
				  struct io_uring_params *p)
{
	struct io_sq_ring *sq_ring;
	struct io_cq_ring *cq_ring;
	size_t size;

	sq_ring = io_mem_alloc(struct_size(sq_ring, array, p->sq_entries));
	if (!sq_ring)
		return -ENOMEM;

	ctx->sq_ring = sq_ring;
	sq_ring->ring_mask = p->sq_entries - 1;
	sq_ring->ring_entries = p->sq_entries;
	ctx->sq_mask = sq_ring->ring_mask;
	ctx->sq_entries = sq_ring->ring_entries;

	size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	ctx->sq_sqes = io_mem_alloc(size);
	if (!ctx->sq_sqes) {
		io_mem_free(ctx->sq_ring);
		return -ENOMEM;
	}

	cq_ring = io_mem_alloc(struct_size(cq_ring, cqes, p->cq_entries));
	if (!cq_ring) {
		io_mem_free(ctx->sq_ring);
		io_mem_free(ctx->sq_sqes);
		return -ENOMEM;
	}

	ctx->cq_ring = cq_ring;
	cq_ring->ring_mask = p->cq_entries - 1;
	cq_ring->ring_entries = p->cq_entries;
	ctx->cq_mask = cq_ring->ring_mask;
	ctx->cq_entries = cq_ring->ring_entries;
	return 0;
}

/*
 * Allocate an anonymous fd; this is what constitutes the application-visible
 * backing of an io_uring instance. The application mmaps this fd to gain
 * access to the SQ/CQ ring details. If UNIX sockets are enabled, we have to
 * tie this fd to a socket for file garbage collection purposes.
 */
static int io_uring_get_fd(struct io_ring_ctx *ctx)
{
	struct file *file;
	int ret;

#if defined(CONFIG_UNIX)
	ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
				&ctx->ring_sock);
	if (ret)
		return ret;
#endif

	ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
	if (ret < 0)
		goto err;

	file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
					O_RDWR | O_CLOEXEC);
	if (IS_ERR(file)) {
		put_unused_fd(ret);
		ret = PTR_ERR(file);
		goto err;
	}

#if defined(CONFIG_UNIX)
	ctx->ring_sock->file = file;
#endif
	fd_install(ret, file);
	return ret;
err:
#if defined(CONFIG_UNIX)
	sock_release(ctx->ring_sock);
	ctx->ring_sock = NULL;
#endif
	return ret;
}

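/*
 * Core of io_uring_setup(): validate the requested queue depth, account the
 * ring memory against RLIMIT_MEMLOCK, allocate the context and rings, start
 * the async offload machinery, and fill in the ring offsets for userspace.
 */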
static int io_uring_create(unsigned entries, struct io_uring_params *p)
{
	struct user_struct *user = NULL;
	struct io_ring_ctx *ctx;
	bool account_mem;
	int ret;

	if (!entries || entries > IORING_MAX_ENTRIES)
		return -EINVAL;

	/*
	 * Use twice as many entries for the CQ ring. It's possible for the
	 * application to drive a higher depth than the size of the SQ ring,
	 * since the sqes are only used at submission time. This allows for
	 * some flexibility in overcommitting a bit.
	 */
	p->sq_entries = roundup_pow_of_two(entries);
	p->cq_entries = 2 * p->sq_entries;

	user = get_uid(current_user());
	account_mem = !capable(CAP_IPC_LOCK);

	if (account_mem) {
		ret = io_account_mem(user,
				ring_pages(p->sq_entries, p->cq_entries));
		if (ret) {
			free_uid(user);
			return ret;
		}
	}

	ctx = io_ring_ctx_alloc(p);
	if (!ctx) {
		if (account_mem)
			io_unaccount_mem(user, ring_pages(p->sq_entries,
								p->cq_entries));
		free_uid(user);
		return -ENOMEM;
	}
	ctx->compat = in_compat_syscall();
	ctx->account_mem = account_mem;
	ctx->user = user;

	ret = io_allocate_scq_urings(ctx, p);
	if (ret)
		goto err;

	ret = io_sq_offload_start(ctx);
	if (ret)
		goto err;

	ret = io_uring_get_fd(ctx);
	if (ret < 0)
		goto err;

	memset(&p->sq_off, 0, sizeof(p->sq_off));
	p->sq_off.head = offsetof(struct io_sq_ring, r.head);
	p->sq_off.tail = offsetof(struct io_sq_ring, r.tail);
	p->sq_off.ring_mask = offsetof(struct io_sq_ring, ring_mask);
	p->sq_off.ring_entries = offsetof(struct io_sq_ring, ring_entries);
	p->sq_off.flags = offsetof(struct io_sq_ring, flags);
	p->sq_off.dropped = offsetof(struct io_sq_ring, dropped);
	p->sq_off.array = offsetof(struct io_sq_ring, array);

	memset(&p->cq_off, 0, sizeof(p->cq_off));
	p->cq_off.head = offsetof(struct io_cq_ring, r.head);
	p->cq_off.tail = offsetof(struct io_cq_ring, r.tail);
	p->cq_off.ring_mask = offsetof(struct io_cq_ring, ring_mask);
	p->cq_off.ring_entries = offsetof(struct io_cq_ring, ring_entries);
	p->cq_off.overflow = offsetof(struct io_cq_ring, overflow);
	p->cq_off.cqes = offsetof(struct io_cq_ring, cqes);
	return ret;
err:
	io_ring_ctx_wait_and_kill(ctx);
	return ret;
}

/*
 * Sets up an io_uring context, and returns the fd. The application asks for
 * a ring size; we return the actual sq/cq ring sizes (among other things) in
 * the params structure passed in.
 */
static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
{
	struct io_uring_params p;
	long ret;
	int i;

	if (copy_from_user(&p, params, sizeof(p)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
		if (p.resv[i])
			return -EINVAL;
	}

	if (p.flags)
		return -EINVAL;

	ret = io_uring_create(entries, &p);
	if (ret < 0)
		return ret;

	if (copy_to_user(params, &p, sizeof(p)))
		return -EFAULT;

	return ret;
}

SYSCALL_DEFINE2(io_uring_setup, u32, entries,
		struct io_uring_params __user *, params)
{
	return io_uring_setup(entries, params);
}

static int __init io_uring_init(void)
{
	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
	return 0;
}
__initcall(io_uring_init);