// SPDX-License-Identifier: GPL-2.0
/*
 * linux/fs/pipe.c
 *
 * Copyright (C) 1991, 1992, 1999 Linus Torvalds
 */

#include <linux/mm.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/log2.h>
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/magic.h>
#include <linux/pipe_fs_i.h>
#include <linux/uio.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/audit.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
#include <linux/memcontrol.h>
#include <linux/watch_queue.h>

#include <linux/uaccess.h>
#include <asm/ioctls.h>

#include "internal.h"

/*
 * The max size that a non-root user is allowed to grow the pipe. Can
 * be set by root in /proc/sys/fs/pipe-max-size
 */
unsigned int pipe_max_size = 1048576;

/* Maximum allocatable pages per user. Hard limit is unset by default, soft
 * matches default values.
 */
unsigned long pipe_user_pages_hard;
unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;

/*
 * We use head and tail indices that aren't masked off, except at the point of
 * dereference, but rather they're allowed to wrap naturally. This means there
 * isn't a dead spot in the buffer, but the ring has to be a power of two and
 * <= 2^31.
 * -- David Howells 2019-09-23.
 *
 * Reads with count = 0 should always return 0.
 * -- Julian Bradfield 1999-06-07.
 *
 * FIFOs and Pipes now generate SIGIO for both readers and writers.
 * -- Jeremy Elson <[email protected]> 2001-08-16
 *
 * pipe_read & write cleanup
 * -- Manfred Spraul <[email protected]> 2002-05-09
 */

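/*
 * Example of the unmasked-index scheme described above: occupancy is
 * simply head - tail, which stays correct across unsigned wraparound
 * as long as the ring size is a power of two, e.g.:
 *
 *	head = 0x80000002, tail = 0x7ffffffe
 *	pipe_occupancy(head, tail) == 4
 *	slot to dereference = tail & (pipe->ring_size - 1)
 */
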
static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass)
{
	if (pipe->files)
		mutex_lock_nested(&pipe->mutex, subclass);
}

void pipe_lock(struct pipe_inode_info *pipe)
{
	/*
	 * pipe_lock() nests non-pipe inode locks (for writing to a file)
	 */
	pipe_lock_nested(pipe, I_MUTEX_PARENT);
}
EXPORT_SYMBOL(pipe_lock);

void pipe_unlock(struct pipe_inode_info *pipe)
{
	if (pipe->files)
		mutex_unlock(&pipe->mutex);
}
EXPORT_SYMBOL(pipe_unlock);

static inline void __pipe_lock(struct pipe_inode_info *pipe)
{
	mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT);
}

static inline void __pipe_unlock(struct pipe_inode_info *pipe)
{
	mutex_unlock(&pipe->mutex);
}

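/*
 * Lock both ends in a single global order (ascending address), so two
 * tasks locking the same pair of pipes from opposite directions - as
 * tee() does when duplicating between two pipes - cannot ABBA-deadlock.
 */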
void pipe_double_lock(struct pipe_inode_info *pipe1,
		      struct pipe_inode_info *pipe2)
{
	BUG_ON(pipe1 == pipe2);

	if (pipe1 < pipe2) {
		pipe_lock_nested(pipe1, I_MUTEX_PARENT);
		pipe_lock_nested(pipe2, I_MUTEX_CHILD);
	} else {
		pipe_lock_nested(pipe2, I_MUTEX_PARENT);
		pipe_lock_nested(pipe1, I_MUTEX_CHILD);
	}
}

/* Drop the inode semaphore and wait for a pipe event, atomically */
void pipe_wait(struct pipe_inode_info *pipe)
{
	DEFINE_WAIT(rdwait);
	DEFINE_WAIT(wrwait);

	/*
	 * Pipes are system-local resources, so sleeping on them
	 * is considered a noninteractive wait:
	 */
	prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE);
	prepare_to_wait(&pipe->wr_wait, &wrwait, TASK_INTERRUPTIBLE);
	pipe_unlock(pipe);
	schedule();
	finish_wait(&pipe->rd_wait, &rdwait);
	finish_wait(&pipe->wr_wait, &wrwait);
	pipe_lock(pipe);
}

static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
				  struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	/*
	 * If nobody else uses this page, and we don't already have a
	 * temporary page, let's keep track of it as a one-deep
	 * allocation cache. (Otherwise just release our reference to it)
	 */
	if (page_count(page) == 1 && !pipe->tmp_page)
		pipe->tmp_page = page;
	else
		put_page(page);
}

static int anon_pipe_buf_steal(struct pipe_inode_info *pipe,
			       struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	if (page_count(page) == 1) {
		memcg_kmem_uncharge_page(page, 0);
		__SetPageLocked(page);
		return 0;
	}
	return 1;
}

/**
 * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to attempt to steal
 *
 * Description:
 *	This function attempts to steal the &struct page attached to
 *	@buf. If successful, this function returns 0 and returns with
 *	the page locked. The caller may then reuse the page for whatever
 *	he wishes; the typical use is insertion into a different file
 *	page cache.
 */
int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
			   struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	/*
	 * A reference of one is golden, that means that the owner of this
	 * page is the only one holding a reference to it. lock the page
	 * and return OK.
	 */
	if (page_count(page) == 1) {
		lock_page(page);
		return 0;
	}

	return 1;
}
EXPORT_SYMBOL(generic_pipe_buf_steal);

/**
 * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to get a reference to
 *
 * Description:
 *	This function grabs an extra reference to @buf. It's used in
 *	the tee() system call, when we duplicate the buffers in one
 *	pipe into another.
 */
bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
	return try_get_page(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_get);

/**
 * generic_pipe_buf_confirm - verify contents of the pipe buffer
 * @info:	the pipe that the buffer belongs to
 * @buf:	the buffer to confirm
 *
 * Description:
 *	This function does nothing, because the generic pipe code uses
 *	pages that are always good when inserted into the pipe.
 */
int generic_pipe_buf_confirm(struct pipe_inode_info *info,
			     struct pipe_buffer *buf)
{
	return 0;
}
EXPORT_SYMBOL(generic_pipe_buf_confirm);

/**
 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to put a reference to
 *
 * Description:
 *	This function releases a reference to @buf.
 */
void generic_pipe_buf_release(struct pipe_inode_info *pipe,
			      struct pipe_buffer *buf)
{
	put_page(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_release);

/* New data written to a pipe may be appended to a buffer with this type. */
static const struct pipe_buf_operations anon_pipe_buf_ops = {
	.confirm = generic_pipe_buf_confirm,
	.release = anon_pipe_buf_release,
	.steal = anon_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

static const struct pipe_buf_operations anon_pipe_buf_nomerge_ops = {
	.confirm = generic_pipe_buf_confirm,
	.release = anon_pipe_buf_release,
	.steal = anon_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

static const struct pipe_buf_operations packet_pipe_buf_ops = {
	.confirm = generic_pipe_buf_confirm,
	.release = anon_pipe_buf_release,
	.steal = anon_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

/**
 * pipe_buf_mark_unmergeable - mark a &struct pipe_buffer as unmergeable
 * @buf:	the buffer to mark
 *
 * Description:
 *	This function ensures that no future writes will be merged into the
 *	given &struct pipe_buffer. This is necessary when multiple pipe buffers
 *	share the same backing page.
 */
void pipe_buf_mark_unmergeable(struct pipe_buffer *buf)
{
	if (buf->ops == &anon_pipe_buf_ops)
		buf->ops = &anon_pipe_buf_nomerge_ops;
}

static bool pipe_buf_can_merge(struct pipe_buffer *buf)
{
	return buf->ops == &anon_pipe_buf_ops;
}

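/*
 * Note that anon_pipe_buf_nomerge_ops is field-for-field identical to
 * anon_pipe_buf_ops; mergeability is encoded purely in which ops table
 * a buffer points at, so marking a buffer unmergeable is one pointer
 * assignment and pipe_buf_can_merge() is one pointer comparison.
 */
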
/* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
static inline bool pipe_readable(const struct pipe_inode_info *pipe)
{
	unsigned int head = READ_ONCE(pipe->head);
	unsigned int tail = READ_ONCE(pipe->tail);
	unsigned int writers = READ_ONCE(pipe->writers);

	return !pipe_empty(head, tail) || !writers;
}

static ssize_t
pipe_read(struct kiocb *iocb, struct iov_iter *to)
{
	size_t total_len = iov_iter_count(to);
	struct file *filp = iocb->ki_filp;
	struct pipe_inode_info *pipe = filp->private_data;
	bool was_full, wake_next_reader = false;
	ssize_t ret;

	/* Null read succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	ret = 0;
	__pipe_lock(pipe);

	/*
	 * We only wake up writers if the pipe was full when we started
	 * reading in order to avoid unnecessary wakeups.
	 *
	 * But when we do wake up writers, we do so using a sync wakeup
	 * (WF_SYNC), because we want them to get going and generate more
	 * data for us.
	 */
	was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
	for (;;) {
		unsigned int head = pipe->head;
		unsigned int tail = pipe->tail;
		unsigned int mask = pipe->ring_size - 1;

#ifdef CONFIG_WATCH_QUEUE
		if (pipe->note_loss) {
			struct watch_notification n;

			if (total_len < 8) {
				if (ret == 0)
					ret = -ENOBUFS;
				break;
			}

			n.type = WATCH_TYPE_META;
			n.subtype = WATCH_META_LOSS_NOTIFICATION;
			n.info = watch_sizeof(n);
			if (copy_to_iter(&n, sizeof(n), to) != sizeof(n)) {
				if (ret == 0)
					ret = -EFAULT;
				break;
			}
			ret += sizeof(n);
			total_len -= sizeof(n);
			pipe->note_loss = false;
		}
#endif

		if (!pipe_empty(head, tail)) {
			struct pipe_buffer *buf = &pipe->bufs[tail & mask];
			size_t chars = buf->len;
			size_t written;
			int error;

			if (chars > total_len) {
				if (buf->flags & PIPE_BUF_FLAG_WHOLE) {
					if (ret == 0)
						ret = -ENOBUFS;
					break;
				}
				chars = total_len;
			}

			error = pipe_buf_confirm(pipe, buf);
			if (error) {
				if (!ret)
					ret = error;
				break;
			}

			written = copy_page_to_iter(buf->page, buf->offset, chars, to);
			if (unlikely(written < chars)) {
				if (!ret)
					ret = -EFAULT;
				break;
			}
			ret += chars;
			buf->offset += chars;
			buf->len -= chars;

			/* Was it a packet buffer? Clean up and exit */
			if (buf->flags & PIPE_BUF_FLAG_PACKET) {
				total_len = chars;
				buf->len = 0;
			}

			if (!buf->len) {
				pipe_buf_release(pipe, buf);
				spin_lock_irq(&pipe->rd_wait.lock);
#ifdef CONFIG_WATCH_QUEUE
				if (buf->flags & PIPE_BUF_FLAG_LOSS)
					pipe->note_loss = true;
#endif
				tail++;
				pipe->tail = tail;
				spin_unlock_irq(&pipe->rd_wait.lock);
			}
			total_len -= chars;
			if (!total_len)
				break;	/* common path: read succeeded */
			if (!pipe_empty(head, tail))	/* More to do? */
				continue;
		}

		if (!pipe->writers)
			break;
		if (ret)
			break;
		if (filp->f_flags & O_NONBLOCK) {
			ret = -EAGAIN;
			break;
		}
		__pipe_unlock(pipe);

		/*
		 * We only get here if we didn't actually read anything.
		 *
		 * However, we could have seen (and removed) a zero-sized
		 * pipe buffer, and might have made space in the buffers
		 * that way.
		 *
		 * You can't make zero-sized pipe buffers by doing an empty
		 * write (not even in packet mode), but they can happen if
		 * the writer gets an EFAULT when trying to fill a buffer
		 * that already got allocated and inserted in the buffer
		 * array.
		 *
		 * So we still need to wake up any pending writers in the
		 * _very_ unlikely case that the pipe was full, but we got
		 * no data.
		 */
		if (unlikely(was_full)) {
			wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
		}

		/*
		 * But because we didn't read anything, at this point we can
		 * just return directly with -ERESTARTSYS if we're interrupted,
		 * since we've done any required wakeups and there's no need
		 * to mark anything accessed. And we've dropped the lock.
		 */
		if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0)
			return -ERESTARTSYS;

		__pipe_lock(pipe);
		was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
		wake_next_reader = true;
	}
	if (pipe_empty(pipe->head, pipe->tail))
		wake_next_reader = false;
	__pipe_unlock(pipe);

	if (was_full) {
		wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}
	if (wake_next_reader)
		wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
	if (ret > 0)
		file_accessed(filp);
	return ret;
}

static inline int is_packetized(struct file *file)
{
	return (file->f_flags & O_DIRECT) != 0;
}

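/*
 * Packet mode is requested from userspace with pipe2(fds, O_DIRECT):
 * each write() then becomes a discrete packet, and a read() consumes
 * at most one packet (see the PIPE_BUF_FLAG_PACKET handling in
 * pipe_read() above, which discards any unread remainder).
 */
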
/* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
static inline bool pipe_writable(const struct pipe_inode_info *pipe)
{
	unsigned int head = READ_ONCE(pipe->head);
	unsigned int tail = READ_ONCE(pipe->tail);
	unsigned int max_usage = READ_ONCE(pipe->max_usage);

	return !pipe_full(head, tail, max_usage) ||
		!READ_ONCE(pipe->readers);
}

static ssize_t
pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *filp = iocb->ki_filp;
	struct pipe_inode_info *pipe = filp->private_data;
	unsigned int head;
	ssize_t ret = 0;
	size_t total_len = iov_iter_count(from);
	ssize_t chars;
	bool was_empty = false;
	bool wake_next_writer = false;

	/* Null write succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	__pipe_lock(pipe);

	if (!pipe->readers) {
		send_sig(SIGPIPE, current, 0);
		ret = -EPIPE;
		goto out;
	}

#ifdef CONFIG_WATCH_QUEUE
	if (pipe->watch_queue) {
		ret = -EXDEV;
		goto out;
	}
#endif

	/*
	 * Only wake up if the pipe started out empty, since
	 * otherwise there should be no readers waiting.
	 *
	 * If it wasn't empty we try to merge new data into
	 * the last buffer.
	 *
	 * That naturally merges small writes, but it also
	 * page-aligns the rest of the writes for large writes
	 * spanning multiple pages.
	 */
	head = pipe->head;
	was_empty = pipe_empty(head, pipe->tail);
	chars = total_len & (PAGE_SIZE-1);
	if (chars && !was_empty) {
		unsigned int mask = pipe->ring_size - 1;
		struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask];
		int offset = buf->offset + buf->len;

		if (pipe_buf_can_merge(buf) && offset + chars <= PAGE_SIZE) {
			ret = pipe_buf_confirm(pipe, buf);
			if (ret)
				goto out;

			ret = copy_page_from_iter(buf->page, offset, chars, from);
			if (unlikely(ret < chars)) {
				ret = -EFAULT;
				goto out;
			}

			buf->len += ret;
			if (!iov_iter_count(from))
				goto out;
		}
	}

	for (;;) {
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		head = pipe->head;
		if (!pipe_full(head, pipe->tail, pipe->max_usage)) {
			unsigned int mask = pipe->ring_size - 1;
			struct pipe_buffer *buf = &pipe->bufs[head & mask];
			struct page *page = pipe->tmp_page;
			int copied;

			if (!page) {
				page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
				if (unlikely(!page)) {
					ret = ret ? : -ENOMEM;
					break;
				}
				pipe->tmp_page = page;
			}

			/* Allocate a slot in the ring in advance and attach an
			 * empty buffer. If we fault or otherwise fail to use
			 * it, either the reader will consume it or it'll still
			 * be there for the next write.
			 */
			spin_lock_irq(&pipe->rd_wait.lock);

			head = pipe->head;
			if (pipe_full(head, pipe->tail, pipe->max_usage)) {
				spin_unlock_irq(&pipe->rd_wait.lock);
				continue;
			}

			pipe->head = head + 1;
			spin_unlock_irq(&pipe->rd_wait.lock);

			/* Insert it into the buffer array */
			buf = &pipe->bufs[head & mask];
			buf->page = page;
			buf->ops = &anon_pipe_buf_ops;
			buf->offset = 0;
			buf->len = 0;
			buf->flags = 0;
			if (is_packetized(filp)) {
				buf->ops = &packet_pipe_buf_ops;
				buf->flags = PIPE_BUF_FLAG_PACKET;
			}
			pipe->tmp_page = NULL;

			copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
			if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
				if (!ret)
					ret = -EFAULT;
				break;
			}
			ret += copied;
			buf->offset = 0;
			buf->len = copied;

			if (!iov_iter_count(from))
				break;
		}

		if (!pipe_full(head, pipe->tail, pipe->max_usage))
			continue;

		/* Wait for buffer space to become available. */
		if (filp->f_flags & O_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		/*
		 * We're going to release the pipe lock and wait for more
		 * space. We wake up any readers if necessary, and then
		 * after waiting we need to re-check whether the pipe
		 * became empty while we dropped the lock.
		 */
		__pipe_unlock(pipe);
		if (was_empty) {
			wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		}
		wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe));
		__pipe_lock(pipe);
		was_empty = pipe_empty(pipe->head, pipe->tail);
		wake_next_writer = true;
	}
out:
	if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
		wake_next_writer = false;
	__pipe_unlock(pipe);

	/*
	 * If we do do a wakeup event, we do a 'sync' wakeup, because we
	 * want the reader to start processing things asap, rather than
	 * leave the data pending.
	 *
	 * This is particularly important for small writes, because of
	 * how (for example) the GNU make jobserver uses small writes to
	 * wake up pending jobs
	 */
	if (was_empty) {
		wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
	}
	if (wake_next_writer)
		wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
	if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
		int err = file_update_time(filp);
		if (err)
			ret = err;
		sb_end_write(file_inode(filp)->i_sb);
	}
	return ret;
}

static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	struct pipe_inode_info *pipe = filp->private_data;
	int count, head, tail, mask;

	switch (cmd) {
	case FIONREAD:
		__pipe_lock(pipe);
		count = 0;
		head = pipe->head;
		tail = pipe->tail;
		mask = pipe->ring_size - 1;

		while (tail != head) {
			count += pipe->bufs[tail & mask].len;
			tail++;
		}
		__pipe_unlock(pipe);

		return put_user(count, (int __user *)arg);

#ifdef CONFIG_WATCH_QUEUE
	case IOC_WATCH_QUEUE_SET_SIZE: {
		int ret;
		__pipe_lock(pipe);
		ret = watch_queue_set_size(pipe, arg);
		__pipe_unlock(pipe);
		return ret;
	}

	case IOC_WATCH_QUEUE_SET_FILTER:
		return watch_queue_set_filter(
			pipe, (struct watch_notification_filter __user *)arg);
#endif

	default:
		return -ENOIOCTLCMD;
	}
}

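/*
 * FIONREAD is the standard way for userspace to query how many bytes
 * are currently buffered in a pipe, e.g.:
 *
 *	int n;
 *	if (ioctl(pipefd[0], FIONREAD, &n) == 0)
 *		...;	(n bytes can now be read without blocking)
 */
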
/* No kernel lock held - fine */
static __poll_t
pipe_poll(struct file *filp, poll_table *wait)
{
	__poll_t mask;
	struct pipe_inode_info *pipe = filp->private_data;
	unsigned int head, tail;

	/*
	 * Reading pipe state only -- no need for acquiring the semaphore.
	 *
	 * But because this is racy, the code has to add the
	 * entry to the poll table _first_ ..
	 */
	if (filp->f_mode & FMODE_READ)
		poll_wait(filp, &pipe->rd_wait, wait);
	if (filp->f_mode & FMODE_WRITE)
		poll_wait(filp, &pipe->wr_wait, wait);

	/*
	 * .. and only then can you do the racy tests. That way,
	 * if something changes and you got it wrong, the poll
	 * table entry will wake you up and fix it.
	 */
	head = READ_ONCE(pipe->head);
	tail = READ_ONCE(pipe->tail);

	mask = 0;
	if (filp->f_mode & FMODE_READ) {
		if (!pipe_empty(head, tail))
			mask |= EPOLLIN | EPOLLRDNORM;
		if (!pipe->writers && filp->f_version != pipe->w_counter)
			mask |= EPOLLHUP;
	}

	if (filp->f_mode & FMODE_WRITE) {
		if (!pipe_full(head, tail, pipe->max_usage))
			mask |= EPOLLOUT | EPOLLWRNORM;
		/*
		 * Most Unices do not set EPOLLERR for FIFOs but on Linux they
		 * behave exactly like pipes for poll().
		 */
		if (!pipe->readers)
			mask |= EPOLLERR;
	}

	return mask;
}

static void put_pipe_info(struct inode *inode, struct pipe_inode_info *pipe)
{
	int kill = 0;

	spin_lock(&inode->i_lock);
	if (!--pipe->files) {
		inode->i_pipe = NULL;
		kill = 1;
	}
	spin_unlock(&inode->i_lock);

	if (kill)
		free_pipe_info(pipe);
}

static int
pipe_release(struct inode *inode, struct file *file)
{
	struct pipe_inode_info *pipe = file->private_data;

	__pipe_lock(pipe);
	if (file->f_mode & FMODE_READ)
		pipe->readers--;
	if (file->f_mode & FMODE_WRITE)
		pipe->writers--;

	/* Was that the last reader or writer, but not the other side? */
	if (!pipe->readers != !pipe->writers) {
		wake_up_interruptible_all(&pipe->rd_wait);
		wake_up_interruptible_all(&pipe->wr_wait);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}
	__pipe_unlock(pipe);

	put_pipe_info(inode, pipe);
	return 0;
}

static int
pipe_fasync(int fd, struct file *filp, int on)
{
	struct pipe_inode_info *pipe = filp->private_data;
	int retval = 0;

	__pipe_lock(pipe);
	if (filp->f_mode & FMODE_READ)
		retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
	if ((filp->f_mode & FMODE_WRITE) && retval >= 0) {
		retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
		if (retval < 0 && (filp->f_mode & FMODE_READ))
			/* this can happen only if on == T */
			fasync_helper(-1, filp, 0, &pipe->fasync_readers);
	}
	__pipe_unlock(pipe);
	return retval;
}

unsigned long account_pipe_buffers(struct user_struct *user,
				   unsigned long old, unsigned long new)
{
	return atomic_long_add_return(new - old, &user->pipe_bufs);
}

bool too_many_pipe_buffers_soft(unsigned long user_bufs)
{
	unsigned long soft_limit = READ_ONCE(pipe_user_pages_soft);

	return soft_limit && user_bufs > soft_limit;
}

bool too_many_pipe_buffers_hard(unsigned long user_bufs)
{
	unsigned long hard_limit = READ_ONCE(pipe_user_pages_hard);

	return hard_limit && user_bufs > hard_limit;
}

bool pipe_is_unprivileged_user(void)
{
	return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
}

struct pipe_inode_info *alloc_pipe_info(void)
{
	struct pipe_inode_info *pipe;
	unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
	struct user_struct *user = get_current_user();
	unsigned long user_bufs;
	unsigned int max_size = READ_ONCE(pipe_max_size);

	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT);
	if (pipe == NULL)
		goto out_free_uid;

	if (pipe_bufs * PAGE_SIZE > max_size && !capable(CAP_SYS_RESOURCE))
		pipe_bufs = max_size >> PAGE_SHIFT;

	user_bufs = account_pipe_buffers(user, 0, pipe_bufs);

	if (too_many_pipe_buffers_soft(user_bufs) && pipe_is_unprivileged_user()) {
		user_bufs = account_pipe_buffers(user, pipe_bufs, 1);
		pipe_bufs = 1;
	}

	if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user())
		goto out_revert_acct;

	pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer),
			     GFP_KERNEL_ACCOUNT);

	if (pipe->bufs) {
		init_waitqueue_head(&pipe->rd_wait);
		init_waitqueue_head(&pipe->wr_wait);
		pipe->r_counter = pipe->w_counter = 1;
		pipe->max_usage = pipe_bufs;
		pipe->ring_size = pipe_bufs;
		pipe->nr_accounted = pipe_bufs;
		pipe->user = user;
		mutex_init(&pipe->mutex);
		return pipe;
	}

out_revert_acct:
	(void) account_pipe_buffers(user, pipe_bufs, 0);
	kfree(pipe);
out_free_uid:
	free_uid(user);
	return NULL;
}

void free_pipe_info(struct pipe_inode_info *pipe)
{
	int i;

#ifdef CONFIG_WATCH_QUEUE
	if (pipe->watch_queue) {
		watch_queue_clear(pipe->watch_queue);
		put_watch_queue(pipe->watch_queue);
	}
#endif

	(void) account_pipe_buffers(pipe->user, pipe->nr_accounted, 0);
	free_uid(pipe->user);
	for (i = 0; i < pipe->ring_size; i++) {
		struct pipe_buffer *buf = pipe->bufs + i;
		if (buf->ops)
			pipe_buf_release(pipe, buf);
	}
	if (pipe->tmp_page)
		__free_page(pipe->tmp_page);
	kfree(pipe->bufs);
	kfree(pipe);
}

static struct vfsmount *pipe_mnt __read_mostly;

/*
 * pipefs_dname() is called from d_path().
 */
static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
{
	return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
				d_inode(dentry)->i_ino);
}

static const struct dentry_operations pipefs_dentry_operations = {
	.d_dname	= pipefs_dname,
};

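/*
 * This generated name is what userspace sees for anonymous pipe ends,
 * e.g. "ls -l /proc/self/fd" lists them as "pipe:[<inode number>]".
 */
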
static struct inode * get_pipe_inode(void)
{
	struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
	struct pipe_inode_info *pipe;

	if (!inode)
		goto fail_inode;

	inode->i_ino = get_next_ino();

	pipe = alloc_pipe_info();
	if (!pipe)
		goto fail_iput;

	inode->i_pipe = pipe;
	pipe->files = 2;
	pipe->readers = pipe->writers = 1;
	inode->i_fop = &pipefifo_fops;

	/*
	 * Mark the inode dirty from the very beginning,
	 * that way it will never be moved to the dirty
	 * list because "mark_inode_dirty()" will think
	 * that it already _is_ on the dirty list.
	 */
	inode->i_state = I_DIRTY;
	inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
	inode->i_uid = current_fsuid();
	inode->i_gid = current_fsgid();
	inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);

	return inode;

fail_iput:
	iput(inode);

fail_inode:
	return NULL;
}

int create_pipe_files(struct file **res, int flags)
{
	struct inode *inode = get_pipe_inode();
	struct file *f;

	if (!inode)
		return -ENFILE;

	if (flags & O_NOTIFICATION_PIPE) {
#ifdef CONFIG_WATCH_QUEUE
		if (watch_queue_init(inode->i_pipe) < 0) {
			iput(inode);
			return -ENOMEM;
		}
#else
		/* don't leak the new inode when notification pipes are unsupported */
		iput(inode);
		return -ENOPKG;
#endif
	}

	f = alloc_file_pseudo(inode, pipe_mnt, "",
				O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)),
				&pipefifo_fops);
	if (IS_ERR(f)) {
		free_pipe_info(inode->i_pipe);
		iput(inode);
		return PTR_ERR(f);
	}

	f->private_data = inode->i_pipe;

	res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK),
				  &pipefifo_fops);
	if (IS_ERR(res[0])) {
		put_pipe_info(inode, inode->i_pipe);
		fput(f);
		return PTR_ERR(res[0]);
	}
	res[0]->private_data = inode->i_pipe;
	res[1] = f;
	stream_open(inode, res[0]);
	stream_open(inode, res[1]);
	return 0;
}

static int __do_pipe_flags(int *fd, struct file **files, int flags)
{
	int error;
	int fdw, fdr;

	if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT | O_NOTIFICATION_PIPE))
		return -EINVAL;

	error = create_pipe_files(files, flags);
	if (error)
		return error;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_read_pipe;
	fdr = error;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_fdr;
	fdw = error;

	audit_fd_pair(fdr, fdw);
	fd[0] = fdr;
	fd[1] = fdw;
	return 0;

 err_fdr:
	put_unused_fd(fdr);
 err_read_pipe:
	fput(files[0]);
	fput(files[1]);
	return error;
}

int do_pipe_flags(int *fd, int flags)
{
	struct file *files[2];
	int error = __do_pipe_flags(fd, files, flags);
	if (!error) {
		fd_install(fd[0], files[0]);
		fd_install(fd[1], files[1]);
	}
	return error;
}

/*
 * sys_pipe() is the normal C calling standard for creating
 * a pipe. It's not the way Unix traditionally does this, though.
 */
static int do_pipe2(int __user *fildes, int flags)
{
	struct file *files[2];
	int fd[2];
	int error;

	error = __do_pipe_flags(fd, files, flags);
	if (!error) {
		if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) {
			fput(files[0]);
			fput(files[1]);
			put_unused_fd(fd[0]);
			put_unused_fd(fd[1]);
			error = -EFAULT;
		} else {
			fd_install(fd[0], files[0]);
			fd_install(fd[1], files[1]);
		}
	}
	return error;
}

SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
{
	return do_pipe2(fildes, flags);
}

SYSCALL_DEFINE1(pipe, int __user *, fildes)
{
	return do_pipe2(fildes, 0);
}

static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
{
	int cur = *cnt;

	while (cur == *cnt) {
		pipe_wait(pipe);
		if (signal_pending(current))
			break;
	}
	return cur == *cnt ? -ERESTARTSYS : 0;
}

static void wake_up_partner(struct pipe_inode_info *pipe)
{
	wake_up_interruptible_all(&pipe->rd_wait);
	wake_up_interruptible_all(&pipe->wr_wait);
}

static int fifo_open(struct inode *inode, struct file *filp)
{
	struct pipe_inode_info *pipe;
	bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC;
	int ret;

	filp->f_version = 0;

	spin_lock(&inode->i_lock);
	if (inode->i_pipe) {
		pipe = inode->i_pipe;
		pipe->files++;
		spin_unlock(&inode->i_lock);
	} else {
		spin_unlock(&inode->i_lock);
		pipe = alloc_pipe_info();
		if (!pipe)
			return -ENOMEM;
		pipe->files = 1;
		spin_lock(&inode->i_lock);
		if (unlikely(inode->i_pipe)) {
			inode->i_pipe->files++;
			spin_unlock(&inode->i_lock);
			free_pipe_info(pipe);
			pipe = inode->i_pipe;
		} else {
			inode->i_pipe = pipe;
			spin_unlock(&inode->i_lock);
		}
	}
	filp->private_data = pipe;
	/* OK, we have a pipe and it's pinned down */

	__pipe_lock(pipe);

	/* We can only do regular read/write on fifos */
	stream_open(inode, filp);

	switch (filp->f_mode & (FMODE_READ | FMODE_WRITE)) {
	case FMODE_READ:
	/*
	 *  O_RDONLY
	 *  POSIX.1 says that O_NONBLOCK means return with the FIFO
	 *  opened, even when there is no process writing the FIFO.
	 */
		pipe->r_counter++;
		if (pipe->readers++ == 0)
			wake_up_partner(pipe);

		if (!is_pipe && !pipe->writers) {
			if ((filp->f_flags & O_NONBLOCK)) {
				/* suppress EPOLLHUP until we have
				 * seen a writer */
				filp->f_version = pipe->w_counter;
			} else {
				if (wait_for_partner(pipe, &pipe->w_counter))
					goto err_rd;
			}
		}
		break;

	case FMODE_WRITE:
	/*
	 *  O_WRONLY
	 *  POSIX.1 says that O_NONBLOCK means return -1 with
	 *  errno=ENXIO when there is no process reading the FIFO.
	 */
		ret = -ENXIO;
		if (!is_pipe && (filp->f_flags & O_NONBLOCK) && !pipe->readers)
			goto err;

		pipe->w_counter++;
		if (!pipe->writers++)
			wake_up_partner(pipe);

		if (!is_pipe && !pipe->readers) {
			if (wait_for_partner(pipe, &pipe->r_counter))
				goto err_wr;
		}
		break;

	case FMODE_READ | FMODE_WRITE:
	/*
	 *  O_RDWR
	 *  POSIX.1 leaves this case "undefined" when O_NONBLOCK is set.
	 *  This implementation will NEVER block on a O_RDWR open, since
	 *  the process can at least talk to itself.
	 */

		pipe->readers++;
		pipe->writers++;
		pipe->r_counter++;
		pipe->w_counter++;
		if (pipe->readers == 1 || pipe->writers == 1)
			wake_up_partner(pipe);
		break;

	default:
		ret = -EINVAL;
		goto err;
	}

	/* Ok! */
	__pipe_unlock(pipe);
	return 0;

err_rd:
	if (!--pipe->readers)
		wake_up_interruptible(&pipe->wr_wait);
	ret = -ERESTARTSYS;
	goto err;

err_wr:
	if (!--pipe->writers)
		wake_up_interruptible_all(&pipe->rd_wait);
	ret = -ERESTARTSYS;
	goto err;

err:
	__pipe_unlock(pipe);

	put_pipe_info(inode, pipe);
	return ret;
}

const struct file_operations pipefifo_fops = {
	.open		= fifo_open,
	.llseek		= no_llseek,
	.read_iter	= pipe_read,
	.write_iter	= pipe_write,
	.poll		= pipe_poll,
	.unlocked_ioctl	= pipe_ioctl,
	.release	= pipe_release,
	.fasync		= pipe_fasync,
};

/*
 * Currently we rely on the pipe array holding a power-of-2 number
 * of pages. Returns 0 on error.
 */
unsigned int round_pipe_size(unsigned long size)
{
	if (size > (1U << 31))
		return 0;

	/* Minimum pipe size, as required by POSIX */
	if (size < PAGE_SIZE)
		return PAGE_SIZE;

	return roundup_pow_of_two(size);
}

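/*
 * For example, with 4 KiB pages: round_pipe_size(1) == 4096 (the POSIX
 * minimum), round_pipe_size(5 * 4096) == 8 * 4096, and any request
 * above 2^31 yields 0 so callers can reject it, keeping the ring size
 * within the unmasked-index scheme described at the top of this file.
 */
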
/*
 * Resize the pipe ring to a number of slots.
 */
int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots)
{
	struct pipe_buffer *bufs;
	unsigned int head, tail, mask, n;

	/*
	 * We can shrink the pipe, if arg is greater than the ring occupancy.
	 * Since we don't expect a lot of shrink+grow operations, just free and
	 * allocate again like we would do for growing. If the pipe currently
	 * contains more buffers than arg, then return busy.
	 */
	mask = pipe->ring_size - 1;
	head = pipe->head;
	tail = pipe->tail;
	n = pipe_occupancy(pipe->head, pipe->tail);
	if (nr_slots < n)
		return -EBUSY;

	bufs = kcalloc(nr_slots, sizeof(*bufs),
		       GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
	if (unlikely(!bufs))
		return -ENOMEM;

	/*
	 * The pipe array wraps around, so just start the new one at zero
	 * and adjust the indices.
	 */
	if (n > 0) {
		unsigned int h = head & mask;
		unsigned int t = tail & mask;
		if (h > t) {
			memcpy(bufs, pipe->bufs + t,
			       n * sizeof(struct pipe_buffer));
		} else {
			unsigned int tsize = pipe->ring_size - t;
			if (h > 0)
				memcpy(bufs + tsize, pipe->bufs,
				       h * sizeof(struct pipe_buffer));
			memcpy(bufs, pipe->bufs + t,
			       tsize * sizeof(struct pipe_buffer));
		}
	}

	head = n;
	tail = 0;

	kfree(pipe->bufs);
	pipe->bufs = bufs;
	pipe->ring_size = nr_slots;
	if (pipe->max_usage > nr_slots)
		pipe->max_usage = nr_slots;
	pipe->tail = tail;
	pipe->head = head;

	/* This might have made more room for writers */
	wake_up_interruptible(&pipe->wr_wait);
	return 0;
}

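/*
 * Copy layout when the occupied span wraps (h <= t), e.g. ring_size = 4
 * with tail = 3 and head = 5, i.e. buffer B0 at slot 3 and B1 at slot 0:
 *
 *	old: [B1][  ][  ][B0]	->	new: [B0][B1][  ] ... [  ]
 *
 * so the new ring always restarts at tail = 0, head = n.
 */
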
/*
 * Allocate a new array of pipe buffers and copy the info over. Returns the
 * pipe size if successful, or -ERROR on error.
 */
static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
{
	unsigned long user_bufs;
	unsigned int nr_slots, size;
	long ret = 0;

#ifdef CONFIG_WATCH_QUEUE
	if (pipe->watch_queue)
		return -EBUSY;
#endif

	size = round_pipe_size(arg);
	nr_slots = size >> PAGE_SHIFT;

	if (!nr_slots)
		return -EINVAL;

	/*
	 * If trying to increase the pipe capacity, check that an
	 * unprivileged user is not trying to exceed various limits
	 * (soft limit check here, hard limit check just below).
	 * Decreasing the pipe capacity is always permitted, even
	 * if the user is currently over a limit.
	 */
	if (nr_slots > pipe->max_usage &&
			size > pipe_max_size && !capable(CAP_SYS_RESOURCE))
		return -EPERM;

	user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_slots);

	if (nr_slots > pipe->max_usage &&
			(too_many_pipe_buffers_hard(user_bufs) ||
			 too_many_pipe_buffers_soft(user_bufs)) &&
			pipe_is_unprivileged_user()) {
		ret = -EPERM;
		goto out_revert_acct;
	}

	ret = pipe_resize_ring(pipe, nr_slots);
	if (ret < 0)
		goto out_revert_acct;

	pipe->max_usage = nr_slots;
	pipe->nr_accounted = nr_slots;
	return pipe->max_usage * PAGE_SIZE;

out_revert_acct:
	(void) account_pipe_buffers(pipe->user, nr_slots, pipe->nr_accounted);
	return ret;
}

/*
 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
 * location, so checking ->i_pipe is not enough to verify that this is a
 * pipe.
 */
struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice)
{
	struct pipe_inode_info *pipe = file->private_data;

	if (file->f_op != &pipefifo_fops || !pipe)
		return NULL;
#ifdef CONFIG_WATCH_QUEUE
	if (for_splice && pipe->watch_queue)
		return NULL;
#endif
	return pipe;
}

long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct pipe_inode_info *pipe;
	long ret;

	pipe = get_pipe_info(file, false);
	if (!pipe)
		return -EBADF;

	__pipe_lock(pipe);

	switch (cmd) {
	case F_SETPIPE_SZ:
		ret = pipe_set_size(pipe, arg);
		break;
	case F_GETPIPE_SZ:
		ret = pipe->max_usage * PAGE_SIZE;
		break;
	default:
		ret = -EINVAL;
		break;
	}

	__pipe_unlock(pipe);
	return ret;
}

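/*
 * Reached via fcntl(2), e.g.:
 *
 *	long sz = fcntl(pipefd[1], F_SETPIPE_SZ, 1024 * 1024);
 *
 * where a non-negative return is the new capacity in bytes, already
 * rounded up to a power-of-two number of pages by round_pipe_size().
 */
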
static const struct super_operations pipefs_ops = {
	.destroy_inode = free_inode_nonrcu,
	.statfs = simple_statfs,
};

/*
 * pipefs should _never_ be mounted by userland - too much of security hassle,
 * no real gain from having the whole whorehouse mounted. So we don't need
 * any operations on the root directory. However, we need a non-trivial
 * d_name - pipe: will go nicely and kill the special-casing in procfs.
 */

static int pipefs_init_fs_context(struct fs_context *fc)
{
	struct pseudo_fs_context *ctx = init_pseudo(fc, PIPEFS_MAGIC);
	if (!ctx)
		return -ENOMEM;
	ctx->ops = &pipefs_ops;
	ctx->dops = &pipefs_dentry_operations;
	return 0;
}

static struct file_system_type pipe_fs_type = {
	.name		= "pipefs",
	.init_fs_context = pipefs_init_fs_context,
	.kill_sb	= kill_anon_super,
};

static int __init init_pipe_fs(void)
{
	int err = register_filesystem(&pipe_fs_type);

	if (!err) {
		pipe_mnt = kern_mount(&pipe_fs_type);
		if (IS_ERR(pipe_mnt)) {
			err = PTR_ERR(pipe_mnt);
			unregister_filesystem(&pipe_fs_type);
		}
	}
	return err;
}

fs_initcall(init_pipe_fs);