// SPDX-License-Identifier: GPL-2.0-only
/* Network filesystem high-level (buffered) writeback.
 *
 * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
 *
 * To support network filesystems with local caching, we manage a situation
 * that can be envisioned like the following:
 *
 *               +---+---+-----+-----+---+----------+
 * Folios:       |   |   |     |     |   |          |
 *               +---+---+-----+-----+---+----------+
 *
 *                 +------+------+     +----+----+
 * Upload:         |      |      |.....|    |    |
 *  (Stream 0)     +------+------+     +----+----+
 *
 *               +------+------+------+------+------+
 * Cache:        |      |      |      |      |      |
 *  (Stream 1)   +------+------+------+------+------+
 *
 * Where we have a sequence of folios of varying sizes that we need to overlay
 * with multiple parallel streams of I/O requests, where the I/O requests in a
 * stream may also be of various sizes (in cifs, for example, the sizes are
 * negotiated with the server; in something like ceph, they may represent the
 * sizes of storage objects).
 *
 * The sequence in each stream may contain gaps and noncontiguous subrequests
 * may be glued together into single vectored write RPCs.
 */

#include <linux/export.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include "internal.h"
/*
 * Kill all dirty folios in the event of an unrecoverable error, starting with
 * a locked folio we've already obtained from writeback_iter().
 */
static void netfs_kill_dirty_pages(struct address_space *mapping,
				   struct writeback_control *wbc,
				   struct folio *folio)
{
	int error = 0;

	do {
		enum netfs_folio_trace why = netfs_folio_trace_kill;
		struct netfs_group *group = NULL;
		struct netfs_folio *finfo = NULL;
		void *priv;

		priv = folio_detach_private(folio);
		if (priv) {
			finfo = __netfs_folio_info(priv);
			if (finfo) {
				/* Kill folio from streaming write. */
				group = finfo->netfs_group;
				why = netfs_folio_trace_kill_s;
			} else {
				group = priv;
				if (group == NETFS_FOLIO_COPY_TO_CACHE) {
					/* Kill copy-to-cache folio */
					why = netfs_folio_trace_kill_cc;
					group = NULL;
				} else {
					/* Kill folio with group */
					why = netfs_folio_trace_kill_g;
				}
			}
		}

		trace_netfs_folio(folio, why);

		folio_start_writeback(folio);
		folio_unlock(folio);
		folio_end_writeback(folio);

		netfs_put_group(group);
		kfree(finfo);
	} while ((folio = writeback_iter(mapping, wbc, folio, &error)));
}
/*
 * Create a write request and set it up appropriately for the origin type.
 */
struct netfs_io_request *netfs_create_write_req(struct address_space *mapping,
						struct file *file,
						loff_t start,
						enum netfs_io_origin origin)
{
	struct netfs_io_request *wreq;
	struct netfs_inode *ictx;

	wreq = netfs_alloc_request(mapping, file, start, 0, origin);
	if (IS_ERR(wreq))
		return wreq;

	_enter("R=%x", wreq->debug_id);

	ictx = netfs_inode(wreq->inode);
	if (test_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags))
		fscache_begin_write_operation(&wreq->cache_resources, netfs_i_cookie(ictx));

	wreq->contiguity = wreq->start;
	wreq->cleaned_to = wreq->start;
	INIT_WORK(&wreq->work, netfs_write_collection_worker);

	/* Stream 0: upload to the server. */
	wreq->io_streams[0].stream_nr = 0;
	wreq->io_streams[0].source = NETFS_UPLOAD_TO_SERVER;
	wreq->io_streams[0].prepare_write = ictx->ops->prepare_write;
	wreq->io_streams[0].issue_write = ictx->ops->issue_write;
	wreq->io_streams[0].collected_to = start;
	wreq->io_streams[0].transferred = LONG_MAX;

	/* Stream 1: write to the local cache, if one is available. */
	wreq->io_streams[1].stream_nr = 1;
	wreq->io_streams[1].source = NETFS_WRITE_TO_CACHE;
	wreq->io_streams[1].collected_to = start;
	wreq->io_streams[1].transferred = LONG_MAX;
	if (fscache_resources_valid(&wreq->cache_resources)) {
		wreq->io_streams[1].avail = true;
		wreq->io_streams[1].prepare_write = wreq->cache_resources.ops->prepare_write_subreq;
		wreq->io_streams[1].issue_write = wreq->cache_resources.ops->issue_write;
	}

	return wreq;
}
/**
 * netfs_prepare_write_failed - Note write preparation failed
 * @subreq: The subrequest to mark
 *
 * Mark a subrequest to note that preparation for write failed.
 */
void netfs_prepare_write_failed(struct netfs_io_subrequest *subreq)
{
	__set_bit(NETFS_SREQ_FAILED, &subreq->flags);
	trace_netfs_sreq(subreq, netfs_sreq_trace_prep_failed);
}
EXPORT_SYMBOL(netfs_prepare_write_failed);
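
/* A filesystem's ->prepare_write() hook (wired up to the upload stream in
 * netfs_create_write_req() above) is expected to call this if it can't set
 * the subrequest up.  An illustrative sketch only - the myfs_* names are
 * hypothetical:
 *
 *	static void myfs_prepare_write(struct netfs_io_subrequest *subreq)
 *	{
 *		struct myfs_server *server = myfs_server(subreq->rreq->inode);
 *
 *		if (!server)
 *			return netfs_prepare_write_failed(subreq);
 *		subreq->max_len = min_t(unsigned long, subreq->max_len,
 *					server->max_write_size);
 *	}
 */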
/*
 * Prepare a write subrequest.  We need to allocate a new subrequest
 * if we don't have one.
 */
static void netfs_prepare_write(struct netfs_io_request *wreq,
				struct netfs_io_stream *stream,
				loff_t start)
{
	struct netfs_io_subrequest *subreq;

	subreq = netfs_alloc_subrequest(wreq);
	subreq->source = stream->source;
	subreq->start = start;
	subreq->max_len = ULONG_MAX;
	subreq->max_nr_segs = INT_MAX;
	subreq->stream_nr = stream->stream_nr;

	_enter("R=%x[%x]", wreq->debug_id, subreq->debug_index);

	trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index,
			     refcount_read(&subreq->ref),
			     netfs_sreq_trace_new);

	trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);

	switch (stream->source) {
	case NETFS_UPLOAD_TO_SERVER:
		netfs_stat(&netfs_n_wh_upload);
		subreq->max_len = wreq->wsize;
		break;
	case NETFS_WRITE_TO_CACHE:
		netfs_stat(&netfs_n_wh_write);
		break;
	}

	if (stream->prepare_write)
		stream->prepare_write(subreq);

	__set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);

	/* We add to the end of the list whilst the collector may be walking
	 * the list.  The collector only goes nextwards and uses the lock to
	 * remove entries off of the front.
	 */
	spin_lock(&wreq->lock);
	list_add_tail(&subreq->rreq_link, &stream->subrequests);
	if (list_is_first(&subreq->rreq_link, &stream->subrequests)) {
		stream->front = subreq;
		if (!stream->active) {
			stream->collected_to = stream->front->start;
			/* Write list pointers before active flag */
			smp_store_release(&stream->active, true);
		}
	}
	spin_unlock(&wreq->lock);

	stream->construct = subreq;
}
/*
 * Set the I/O iterator for the filesystem/cache to use and dispatch the I/O
 * operation.  The operation may be asynchronous and should call
 * netfs_write_subrequest_terminated() when complete.
 */
static void netfs_do_issue_write(struct netfs_io_stream *stream,
				 struct netfs_io_subrequest *subreq)
{
	struct netfs_io_request *wreq = subreq->rreq;

	_enter("R=%x[%x],%zx", wreq->debug_id, subreq->debug_index, subreq->len);

	if (test_bit(NETFS_SREQ_FAILED, &subreq->flags))
		return netfs_write_subrequest_terminated(subreq, subreq->error, false);

	// TODO: Use encrypted buffer
	if (test_bit(NETFS_RREQ_USE_IO_ITER, &wreq->flags)) {
		subreq->io_iter = wreq->io_iter;
		iov_iter_advance(&subreq->io_iter,
				 subreq->start + subreq->transferred - wreq->start);
		iov_iter_truncate(&subreq->io_iter,
				  subreq->len - subreq->transferred);
	} else {
		iov_iter_xarray(&subreq->io_iter, ITER_SOURCE, &wreq->mapping->i_pages,
				subreq->start + subreq->transferred,
				subreq->len - subreq->transferred);
	}

	trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
	stream->issue_write(subreq);
}
void netfs_reissue_write(struct netfs_io_stream *stream,
			 struct netfs_io_subrequest *subreq)
{
	__set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
	netfs_do_issue_write(stream, subreq);
}

/*
 * Dispatch the subrequest, if any, that's currently under construction on
 * @stream, detaching it from the construction point first.
 */
static void netfs_issue_write(struct netfs_io_request *wreq,
			      struct netfs_io_stream *stream)
{
	struct netfs_io_subrequest *subreq = stream->construct;

	if (!subreq)
		return;
	stream->construct = NULL;

	if (subreq->start + subreq->len > wreq->start + wreq->submitted)
		wreq->len = wreq->submitted = subreq->start + subreq->len - wreq->start;
	netfs_do_issue_write(stream, subreq);
}
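
/* The subrequest being built up for a stream is held in stream->construct:
 * netfs_prepare_write() creates it, netfs_advance_write() below grows it and
 * netfs_issue_write() detaches and dispatches it.
 */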
/*
 * Add data to the write subrequest, dispatching each as we fill it up or if it
 * is discontiguous with the previous.  We only fill one part at a time so that
 * we can avoid overrunning the credits obtained (cifs) and try to parallelise
 * content-crypto preparation with network writes.
 */
int netfs_advance_write(struct netfs_io_request *wreq,
			struct netfs_io_stream *stream,
			loff_t start, size_t len, bool to_eof)
{
	struct netfs_io_subrequest *subreq = stream->construct;
	size_t part;

	if (!stream->avail) {
		_leave("no write");
		return len;
	}

	_enter("R=%x[%x]", wreq->debug_id, subreq ? subreq->debug_index : 0);

	if (subreq && start != subreq->start + subreq->len) {
		netfs_issue_write(wreq, stream);
		subreq = NULL;
	}

	if (!stream->construct)
		netfs_prepare_write(wreq, stream, start);
	subreq = stream->construct;

	part = min(subreq->max_len - subreq->len, len);
	_debug("part %zx/%zx %zx/%zx", subreq->len, subreq->max_len, part, len);
	subreq->len += part;
	subreq->nr_segs++;

	if (subreq->len >= subreq->max_len ||
	    subreq->nr_segs >= subreq->max_nr_segs ||
	    to_eof) {
		netfs_issue_write(wreq, stream);
		subreq = NULL;
	}

	return part;
}
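
/* Callers feed a region into netfs_advance_write() repeatedly until all of it
 * has been added to subrequests, much as netfs_unbuffered_write() below does
 * (illustrative sketch only):
 *
 *	while (len) {
 *		part = netfs_advance_write(wreq, upload, start, len, false);
 *		start += part;
 *		len -= part;
 *	}
 */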
/*
 * Write some of a pending folio's data back to the server.
 */
static int netfs_write_folio(struct netfs_io_request *wreq,
			     struct writeback_control *wbc,
			     struct folio *folio)
{
	struct netfs_io_stream *upload = &wreq->io_streams[0];
	struct netfs_io_stream *cache = &wreq->io_streams[1];
	struct netfs_io_stream *stream;
	struct netfs_group *fgroup; /* TODO: Use this with ceph */
	struct netfs_folio *finfo;
	size_t fsize = folio_size(folio), flen = fsize, foff = 0;
	loff_t fpos = folio_pos(folio), i_size;
	bool to_eof = false, streamw = false;
	bool debug = false;

	_enter("");

	/* netfs_perform_write() may shift i_size around the page or from out
	 * of the page to beyond it, but cannot move i_size into or through the
	 * page since we have it locked.
	 */
	i_size = i_size_read(wreq->inode);

	if (fpos >= i_size) {
		/* mmap beyond eof. */
		_debug("beyond eof");
		folio_start_writeback(folio);
		folio_unlock(folio);
		wreq->nr_group_rel += netfs_folio_written_back(folio);
		netfs_put_group_many(wreq->group, wreq->nr_group_rel);
		wreq->nr_group_rel = 0;
		folio_end_writeback(folio);
		return 0;
	}

	if (fpos + fsize > wreq->i_size)
		wreq->i_size = i_size;
	fgroup = netfs_folio_group(folio);
	finfo = netfs_folio_info(folio);
	if (finfo) {
		foff = finfo->dirty_offset;
		flen = foff + finfo->dirty_len;
		streamw = true;
	}

	if (wreq->origin == NETFS_WRITETHROUGH) {
		to_eof = false;
		if (flen > i_size - fpos)
			flen = i_size - fpos;
	} else if (flen > i_size - fpos) {
		flen = i_size - fpos;
		if (!streamw)
			folio_zero_segment(folio, flen, fsize);
		to_eof = true;
	} else if (flen == i_size - fpos) {
		to_eof = true;
	}
	flen -= foff;

	_debug("folio %zx %zx %zx", foff, flen, fsize);
	/* Deal with discontinuities in the stream of dirty pages.  These can
	 * arise from a number of sources:
	 *
	 * (1) Intervening non-dirty pages from random-access writes, multiple
	 *     flushers writing back different parts simultaneously and manual
	 *     syncing.
	 *
	 * (2) Partially-written pages from write-streaming.
	 *
	 * (3) Pages that belong to a different write-back group (eg. Ceph
	 *     snaps).
	 *
	 * (4) Actually-clean pages that were marked for write to the cache
	 *     when they were read.  Note that these appear as a special
	 *     write-back group.
	 */
	if (fgroup == NETFS_FOLIO_COPY_TO_CACHE) {
		netfs_issue_write(wreq, upload);
	} else if (fgroup != wreq->group) {
		/* We can't write this page to the server yet. */
		kdebug("wrong group");
		folio_redirty_for_writepage(wbc, folio);
		netfs_issue_write(wreq, upload);
		netfs_issue_write(wreq, cache);
		return 0;
	}

	if (foff > 0)
		netfs_issue_write(wreq, upload);
	if (streamw)
		netfs_issue_write(wreq, cache);
	/* Flip the page to the writeback state and unlock.  If we're called
	 * from write-through, then the page has already been put into the wb
	 * state.
	 */
	if (wreq->origin == NETFS_WRITEBACK)
		folio_start_writeback(folio);
	folio_unlock(folio);

	if (fgroup == NETFS_FOLIO_COPY_TO_CACHE) {
		if (!fscache_resources_valid(&wreq->cache_resources)) {
			trace_netfs_folio(folio, netfs_folio_trace_cancel_copy);
			netfs_issue_write(wreq, upload);
			netfs_folio_written_back(folio);
			return 0;
		}
		trace_netfs_folio(folio, netfs_folio_trace_store_copy);
	} else if (!upload->construct) {
		trace_netfs_folio(folio, netfs_folio_trace_store);
	} else {
		trace_netfs_folio(folio, netfs_folio_trace_store_plus);
	}
	/* Move the submission point forward to allow for write-streaming data
	 * not starting at the front of the page.  We don't do write-streaming
	 * with the cache as the cache requires DIO alignment.
	 *
	 * Also skip uploading for data that's been read and just needs copying
	 * to the cache.
	 */
	for (int s = 0; s < NR_IO_STREAMS; s++) {
		stream = &wreq->io_streams[s];
		stream->submit_max_len = fsize;
		stream->submit_off = foff;
		stream->submit_len = flen;
		if ((stream->source == NETFS_WRITE_TO_CACHE && streamw) ||
		    (stream->source == NETFS_UPLOAD_TO_SERVER &&
		     fgroup == NETFS_FOLIO_COPY_TO_CACHE)) {
			stream->submit_off = UINT_MAX;
			stream->submit_len = 0;
			stream->submit_max_len = 0;
		}
	}
	/* Attach the folio to one or more subrequests.  For a big folio, we
	 * could end up with thousands of subrequests if the wsize is small -
	 * but we might need to wait during the creation of subrequests for
	 * network resources (eg. SMB credits).
	 */
	for (;;) {
		ssize_t part;
		size_t lowest_off = ULONG_MAX;
		int choose_s = -1;

		/* Always add to the lowest-submitted stream first. */
		for (int s = 0; s < NR_IO_STREAMS; s++) {
			stream = &wreq->io_streams[s];
			if (stream->submit_len > 0 &&
			    stream->submit_off < lowest_off) {
				lowest_off = stream->submit_off;
				choose_s = s;
			}
		}

		if (choose_s < 0)
			break;
		stream = &wreq->io_streams[choose_s];

		part = netfs_advance_write(wreq, stream, fpos + stream->submit_off,
					   stream->submit_len, to_eof);
		atomic64_set(&wreq->issued_to, fpos + stream->submit_off);
		stream->submit_off += part;
		stream->submit_max_len -= part;
		if (part > stream->submit_len)
			stream->submit_len = 0;
		else
			stream->submit_len -= part;
		if (part > 0)
			debug = true;
	}

	atomic64_set(&wreq->issued_to, fpos + fsize);

	if (!debug)
		kdebug("R=%x: No submit", wreq->debug_id);

	if (foff + flen < fsize)
		for (int s = 0; s < NR_IO_STREAMS; s++)
			netfs_issue_write(wreq, &wreq->io_streams[s]);

	_leave(" = 0");
	return 0;
}
/*
 * Write some of the pending data back to the server.
 */
int netfs_writepages(struct address_space *mapping,
		     struct writeback_control *wbc)
{
	struct netfs_inode *ictx = netfs_inode(mapping->host);
	struct netfs_io_request *wreq = NULL;
	struct folio *folio;
	int error = 0;

	if (wbc->sync_mode == WB_SYNC_ALL)
		mutex_lock(&ictx->wb_lock);
	else if (!mutex_trylock(&ictx->wb_lock))
		return 0;

	/* Need the first folio to be able to set up the op. */
	folio = writeback_iter(mapping, wbc, NULL, &error);
	if (!folio)
		goto out;

	wreq = netfs_create_write_req(mapping, NULL, folio_pos(folio), NETFS_WRITEBACK);
	if (IS_ERR(wreq)) {
		error = PTR_ERR(wreq);
		goto couldnt_start;
	}

	trace_netfs_write(wreq, netfs_write_trace_writeback);
	netfs_stat(&netfs_n_wh_writepages);

	do {
		_debug("wbiter %lx %llx", folio->index, wreq->start + wreq->submitted);

		/* It appears we don't have to handle cyclic writeback wrapping. */
		WARN_ON_ONCE(wreq && folio_pos(folio) < wreq->start + wreq->submitted);

		if (netfs_folio_group(folio) != NETFS_FOLIO_COPY_TO_CACHE &&
		    unlikely(!test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags))) {
			set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
			wreq->netfs_ops->begin_writeback(wreq);
		}

		error = netfs_write_folio(wreq, wbc, folio);
		if (error < 0)
			break;
	} while ((folio = writeback_iter(mapping, wbc, folio, &error)));

	for (int s = 0; s < NR_IO_STREAMS; s++)
		netfs_issue_write(wreq, &wreq->io_streams[s]);
	smp_wmb(); /* Write lists before ALL_QUEUED. */
	set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags);

	mutex_unlock(&ictx->wb_lock);

	netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
	_leave(" = %d", error);
	return error;

couldnt_start:
	netfs_kill_dirty_pages(mapping, wbc, folio);
out:
	mutex_unlock(&ictx->wb_lock);
	_leave(" = %d", error);
	return error;
}
EXPORT_SYMBOL(netfs_writepages);
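
/* A network filesystem opts into this writeback path by pointing its
 * address_space_operations at it, for example (illustrative only - myfs_aops
 * is a hypothetical name):
 *
 *	const struct address_space_operations myfs_aops = {
 *		.writepages	= netfs_writepages,
 *		...
 *	};
 */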
/*
 * Begin a write operation for writing through the pagecache.
 */
struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len)
{
	struct netfs_io_request *wreq = NULL;
	struct netfs_inode *ictx = netfs_inode(file_inode(iocb->ki_filp));

	mutex_lock(&ictx->wb_lock);

	wreq = netfs_create_write_req(iocb->ki_filp->f_mapping, iocb->ki_filp,
				      iocb->ki_pos, NETFS_WRITETHROUGH);
	if (IS_ERR(wreq)) {
		mutex_unlock(&ictx->wb_lock);
		return wreq;
	}

	wreq->io_streams[0].avail = true;
	trace_netfs_write(wreq, netfs_write_trace_writethrough);
	return wreq;
}
/*
 * Advance the state of the write operation used when writing through the
 * pagecache.  Data has been copied into the pagecache that we need to append
 * to the request.  If we've added more than wsize then we need to create a new
 * subrequest.
 */
int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
			       struct folio *folio, size_t copied, bool to_page_end,
			       struct folio **writethrough_cache)
{
	_enter("R=%x ic=%zu ws=%u cp=%zu tp=%u",
	       wreq->debug_id, wreq->iter.count, wreq->wsize, copied, to_page_end);

	if (!*writethrough_cache) {
		if (folio_test_dirty(folio))
			/* Sigh.  mmap. */
			folio_clear_dirty_for_io(folio);

		/* We can make multiple writes to the folio... */
		folio_start_writeback(folio);
		if (wreq->len == 0)
			trace_netfs_folio(folio, netfs_folio_trace_wthru);
		else
			trace_netfs_folio(folio, netfs_folio_trace_wthru_plus);
		*writethrough_cache = folio;
	}

	wreq->len += copied;
	if (!to_page_end)
		return 0;

	*writethrough_cache = NULL;
	return netfs_write_folio(wreq, wbc, folio);
}
/*
 * End a write operation used when writing through the pagecache.
 */
int netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
			   struct folio *writethrough_cache)
{
	struct netfs_inode *ictx = netfs_inode(wreq->inode);
	int ret;

	_enter("R=%x", wreq->debug_id);

	if (writethrough_cache)
		netfs_write_folio(wreq, wbc, writethrough_cache);

	netfs_issue_write(wreq, &wreq->io_streams[0]);
	netfs_issue_write(wreq, &wreq->io_streams[1]);
	smp_wmb(); /* Write lists before ALL_QUEUED. */
	set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags);

	mutex_unlock(&ictx->wb_lock);

	ret = wreq->error;
	netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
	return ret;
}
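
/* netfs_begin_writethrough(), netfs_advance_writethrough() and
 * netfs_end_writethrough() above are driven by the netfs buffered-write path
 * (netfs_perform_write()) when data copied into the pagecache must also be
 * written to the server immediately rather than just left dirty.
 */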
/*
 * Write data to the server without going through the pagecache and without
 * writing it to the local cache.
 */
int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t len)
{
	struct netfs_io_stream *upload = &wreq->io_streams[0];
	ssize_t part;
	loff_t start = wreq->start;
	int error = 0;

	_enter("%zx", len);

	if (wreq->origin == NETFS_DIO_WRITE)
		inode_dio_begin(wreq->inode);

	while (len) {
		// TODO: Prepare content encryption

		_debug("unbuffered %zx", len);
		part = netfs_advance_write(wreq, upload, start, len, false);
		start += part;
		len -= part;
		if (test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) {
			trace_netfs_rreq(wreq, netfs_rreq_trace_wait_pause);
			wait_on_bit(&wreq->flags, NETFS_RREQ_PAUSE, TASK_UNINTERRUPTIBLE);
		}
		if (test_bit(NETFS_RREQ_FAILED, &wreq->flags))
			break;
	}

	netfs_issue_write(wreq, upload);

	smp_wmb(); /* Write lists before ALL_QUEUED. */
	set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags);
	if (list_empty(&upload->subrequests))
		netfs_wake_write_collector(wreq, false);

	_leave(" = %d", error);
	return error;
}