forked from torvalds/linux
-
Notifications
You must be signed in to change notification settings - Fork 1
/
addr.c
2223 lines (1935 loc) · 59.9 KB
/
addr.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>
#include <linux/backing-dev.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/pagevec.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/signal.h>
#include <linux/iversion.h>
#include <linux/ktime.h>
#include <linux/netfs.h>
#include "super.h"
#include "mds_client.h"
#include "cache.h"
#include "metric.h"
#include "crypto.h"
#include <linux/ceph/osd_client.h>
#include <linux/ceph/striper.h>
/*
* Ceph address space ops.
*
* There are a few funny things going on here.
*
* The page->private field is used to reference a struct
* ceph_snap_context for _every_ dirty page. This indicates which
* snapshot the page was logically dirtied in, and thus which snap
* context needs to be associated with the osd write during writeback.
*
* Similarly, struct ceph_inode_info maintains a set of counters to
* count dirty pages on the inode. In the absence of snapshots,
* i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count.
*
* When a snapshot is taken (that is, when the client receives
* notification that a snapshot was taken), each inode with caps and
* with dirty pages (dirty pages implies there is a cap) gets a new
* ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending
* order, new snaps go to the tail). The i_wrbuffer_ref_head count is
* moved to capsnap->dirty. (Unless a sync write is currently in
* progress. In that case, the capsnap is said to be "pending", new
* writes cannot start, and the capsnap isn't "finalized" until the
* write completes (or fails) and a final size/mtime for the inode for
* that snap can be settled upon.) i_wrbuffer_ref_head is reset to 0.
*
* On writeback, we must submit writes to the osd IN SNAP ORDER. So,
* we look for the first capsnap in i_cap_snaps and write out pages in
* that snap context _only_. Then we move on to the next capsnap,
* eventually reaching the "live" or "head" context (i.e., pages that
* are not yet snapped) and are writing the most recently dirtied
* pages.
*
* Invalidate and so forth must take care to ensure the dirty page
* accounting is preserved.
*/
/*
 * Writeback congestion thresholds.
 *
 * congestion_kb is a size in KiB; shifting it right by (PAGE_SHIFT - 10)
 * converts it to a number of pages.  The "off" threshold is the "on"
 * threshold minus a quarter of it, which gives some hysteresis between
 * setting and clearing the congested state.
 *
 * The argument is parenthesized so that expressions built from operators
 * with lower precedence than '>>' (e.g. 'a | b') expand correctly.
 */
#define CONGESTION_ON_THRESH(congestion_kb) \
	((congestion_kb) >> (PAGE_SHIFT - 10))
#define CONGESTION_OFF_THRESH(congestion_kb)			\
	(CONGESTION_ON_THRESH(congestion_kb) -			\
	 (CONGESTION_ON_THRESH(congestion_kb) >> 2))
/*
 * Forward declaration: ceph_netfs_ops (below) installs this function as its
 * ->check_write_begin hook, so the prototype has to be visible first.
 */
static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len,
struct folio **foliop, void **_fsdata);
/*
 * Return the snap context stored in page->private, or NULL when the page
 * has no private data attached.
 */
static inline struct ceph_snap_context *page_snap_context(struct page *page)
{
	return PagePrivate(page) ? (void *)page->private : NULL;
}
/*
 * Dirty a page.  Optimistically adjust accounting, on the assumption
 * that we won't race with invalidate.  If we do, readjust.
 *
 * Returns false, without touching any counter, when the folio is already
 * dirty.  Otherwise a snap context reference is taken, the inode's dirty
 * buffer counters are bumped under i_ceph_lock, the snap context is attached
 * as the folio's private data, and the result of ceph_fscache_dirty_folio()
 * is returned.
 */
static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio)
{
struct inode *inode = mapping->host;
struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci;
struct ceph_snap_context *snapc;
if (folio_test_dirty(folio)) {
doutc(cl, "%llx.%llx %p idx %lu -- already dirty\n",
ceph_vinop(inode), folio, folio->index);
/* an already-dirty folio must already carry its snap context */
VM_BUG_ON_FOLIO(!folio_test_private(folio), folio);
return false;
}
ci = ceph_inode(inode);
/* dirty the head */
spin_lock(&ci->i_ceph_lock);
BUG_ON(ci->i_wr_ref == 0); // caller should hold Fw reference
if (__ceph_have_pending_cap_snap(ci)) {
/*
 * A cap snap is still pending: account the page against the last
 * entry of i_cap_snaps instead of the head context.
 */
struct ceph_cap_snap *capsnap =
list_last_entry(&ci->i_cap_snaps,
struct ceph_cap_snap,
ci_item);
snapc = ceph_get_snap_context(capsnap->context);
capsnap->dirty_pages++;
} else {
BUG_ON(!ci->i_head_snapc);
snapc = ceph_get_snap_context(ci->i_head_snapc);
++ci->i_wrbuffer_ref_head;
}
/* the first dirty buffer takes an extra reference on the inode */
if (ci->i_wrbuffer_ref == 0)
ihold(inode);
++ci->i_wrbuffer_ref;
doutc(cl, "%llx.%llx %p idx %lu head %d/%d -> %d/%d "
"snapc %p seq %lld (%d snaps)\n",
ceph_vinop(inode), folio, folio->index,
ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
snapc, snapc->seq, snapc->num_snaps);
spin_unlock(&ci->i_ceph_lock);
/*
 * Reference snap context in folio->private.  Also set
 * PagePrivate so that we get invalidate_folio callback.
 */
VM_WARN_ON_FOLIO(folio->private, folio);
folio_attach_private(folio, snapc);
return ceph_fscache_dirty_folio(mapping, folio);
}
/*
 * Invalidate a folio.
 *
 * Only a whole-folio invalidation (offset 0, length equal to the folio
 * size) is acted upon: if the folio carries a snap context as private data,
 * it is detached and the matching dirty-buffer cap reference and snap
 * context reference are dropped, then the netfs layer is told about the
 * invalidation.  A partial invalidation is only logged.
 */
static void ceph_invalidate_folio(struct folio *folio, size_t offset,
				  size_t length)
{
	struct inode *inode = folio->mapping->host;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_client *cl = ceph_inode_to_client(inode);
	bool whole_folio = offset == 0 && length == folio_size(folio);

	if (!whole_folio) {
		doutc(cl, "%llx.%llx idx %lu partial dirty page %zu~%zu\n",
		      ceph_vinop(inode), folio->index, offset, length);
		return;
	}

	WARN_ON(!folio_test_locked(folio));

	if (folio_test_private(folio)) {
		struct ceph_snap_context *snapc;

		doutc(cl, "%llx.%llx idx %lu full dirty page\n",
		      ceph_vinop(inode), folio->index);

		snapc = folio_detach_private(folio);
		ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
		ceph_put_snap_context(snapc);
	}

	netfs_invalidate_folio(folio, offset, length);
}
/*
 * Expand a readahead request towards stripe-unit boundaries.
 *
 * The readahead budget is the backing device's ra_pages, overridden by the
 * per-request private data when present (file_ra_pages, or zero when
 * readahead was disabled on the file).  Within that budget the end of the
 * request is rounded up to the next stripe unit (capped at rreq->i_size) and
 * the start is rounded down to the previous stripe unit.  rreq->start and
 * rreq->len are updated in place; nothing is returned.
 */
static void ceph_netfs_expand_readahead(struct netfs_io_request *rreq)
{
	struct inode *inode = rreq->inode;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_file_layout *lo = &ci->i_layout;
	unsigned long max_pages = inode->i_sb->s_bdi->ra_pages;
	loff_t end = rreq->start + rreq->len, new_end;
	struct ceph_netfs_request_data *priv = rreq->netfs_priv;
	unsigned long max_len;
	u32 blockoff;

	if (priv) {
		/* Readahead is disabled by posix_fadvise POSIX_FADV_RANDOM */
		if (priv->file_ra_disabled)
			max_pages = 0;
		else
			max_pages = priv->file_ra_pages;
	}

	/* Readahead is disabled */
	if (!max_pages)
		return;

	max_len = max_pages << PAGE_SHIFT;

	/*
	 * Try to expand the length forward by rounding it up to the next
	 * block, but do not exceed the file size, unless the original
	 * request already exceeds it.
	 *
	 * The rounding is done with div_u64_rem() rather than round_up():
	 * round_up() is only correct for power-of-two multiples, and nothing
	 * here guarantees that for the stripe unit.  This matches the
	 * downward expansion below, which already uses div_u64_rem().
	 */
	div_u64_rem(end, lo->stripe_unit, &blockoff);
	new_end = end;
	if (blockoff)
		new_end += lo->stripe_unit - blockoff;
	new_end = umin(new_end, rreq->i_size);
	if (new_end > end && new_end <= rreq->start + max_len)
		rreq->len = new_end - rreq->start;

	/* Try to expand the start downward */
	div_u64_rem(rreq->start, lo->stripe_unit, &blockoff);
	if (rreq->len + blockoff <= max_len) {
		rreq->start -= blockoff;
		rreq->len += blockoff;
	}
}
/*
 * Limit a read subrequest so that it does not run past the extent returned
 * by the file-to-object mapping for its start offset, nor past the rsize
 * mount option.  Always returns true.
 */
static bool ceph_netfs_clamp_length(struct netfs_io_subrequest *subreq)
{
	struct inode *inode = subreq->rreq->inode;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
	u32 extent_len;
	u64 objno, objoff;

	/* Truncate the extent at the end of the current block */
	ceph_calc_file_object_mapping(&ci->i_layout, subreq->start,
				      subreq->len, &objno, &objoff,
				      &extent_len);

	subreq->len = min(extent_len, fsc->mount_options->rsize);
	return true;
}
static void finish_netfs_read(struct ceph_osd_request *req)
{
struct inode *inode = req->r_inode;
struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
struct ceph_client *cl = fsc->client;
struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
struct netfs_io_subrequest *subreq = req->r_priv;
struct ceph_osd_req_op *op = &req->r_ops[0];
int err = req->r_result;
bool sparse = (op->op == CEPH_OSD_OP_SPARSE_READ);
ceph_update_read_metrics(&fsc->mdsc->metric, req->r_start_latency,
req->r_end_latency, osd_data->length, err);
doutc(cl, "result %d subreq->len=%zu i_size=%lld\n", req->r_result,
subreq->len, i_size_read(req->r_inode));
/* no object means success but no data */
if (err == -ENOENT)
err = 0;
else if (err == -EBLOCKLISTED)
fsc->blocklisted = true;
if (err >= 0) {
if (sparse && err > 0)
err = ceph_sparse_ext_map_end(op);
if (err < subreq->len)
__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
if (IS_ENCRYPTED(inode) && err > 0) {
err = ceph_fscrypt_decrypt_extents(inode,
osd_data->pages, subreq->start,
op->extent.sparse_ext,
op->extent.sparse_ext_cnt);
if (err > subreq->len)
err = subreq->len;
}
}
if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) {
ceph_put_page_vector(osd_data->pages,
calc_pages_for(osd_data->alignment,
osd_data->length), false);
}
netfs_subreq_terminated(subreq, err, false);
iput(req->r_inode);
ceph_dec_osd_stopping_blocker(fsc->mdsc);
}
/*
 * Satisfy a read subrequest from the inode's inline data.
 *
 * Fetches the inline data from the MDS with a GETATTR request and copies
 * the requested range into the page cache.
 *
 * Returns true when the subrequest has been terminated here (successfully
 * or with an error).  Returns false, leaving the subrequest untouched, when
 * the MDS reports that the data is no longer inline; the caller then has to
 * issue a regular read.
 *
 * The MDS request reference is dropped on every path once the request has
 * been created, including when ceph_mdsc_do_request() fails.
 */
static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)
{
	struct netfs_io_request *rreq = subreq->rreq;
	struct inode *inode = rreq->inode;
	struct ceph_mds_reply_info_parsed *rinfo;
	struct ceph_mds_reply_info_in *iinfo;
	struct ceph_mds_request *req;
	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct iov_iter iter;
	ssize_t err = 0;
	size_t len;
	int mode;

	__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
	__clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);

	/* nothing to copy past EOF; terminate with 0 bytes */
	if (subreq->start >= inode->i_size)
		goto out;

	/* We need to fetch the inline data. */
	mode = ceph_try_to_choose_auth_mds(inode, CEPH_STAT_CAP_INLINE_DATA);
	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		goto out;
	}
	req->r_ino1 = ci->i_vino;
	req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INLINE_DATA);
	req->r_num_caps = 2;

	err = ceph_mdsc_do_request(mdsc, NULL, req);
	if (err < 0)
		goto out_put;	/* still owns a request reference */

	rinfo = &req->r_reply_info;
	iinfo = &rinfo->targeti;
	if (iinfo->inline_version == CEPH_INLINE_NONE) {
		/* The data got uninlined */
		ceph_mdsc_put_request(req);
		return false;
	}

	/*
	 * NOTE(review): assumes subreq->start <= iinfo->inline_len; if not,
	 * the unsigned subtraction below wraps - confirm against callers.
	 */
	len = min_t(size_t, iinfo->inline_len - subreq->start, subreq->len);
	iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len);
	err = copy_to_iter(iinfo->inline_data + subreq->start, len, &iter);
	if (err == 0)
		err = -EFAULT;

out_put:
	ceph_mdsc_put_request(req);
out:
	netfs_subreq_terminated(subreq, err, false);
	return true;
}
/*
 * Issue the read for one netfs subrequest.
 *
 * Inline data is handed to ceph_netfs_issue_op_inline(); otherwise an OSD
 * read (a sparse read for encrypted inodes or with the SPARSEREAD mount
 * option) is built and started, with finish_netfs_read() as its completion
 * callback.  On any failure before the request is started, the subrequest
 * is terminated here with the error.
 */
static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
{
struct netfs_io_request *rreq = subreq->rreq;
struct inode *inode = rreq->inode;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
struct ceph_client *cl = fsc->client;
struct ceph_osd_request *req = NULL;
struct ceph_vino vino = ceph_vino(inode);
struct iov_iter iter;
int err = 0;
u64 len = subreq->len;
bool sparse = IS_ENCRYPTED(inode) || ceph_test_mount_opt(fsc, SPARSEREAD);
u64 off = subreq->start;
int extent_cnt;
if (ceph_inode_is_shutdown(inode)) {
err = -EIO;
goto out;
}
/*
 * A true return means the inline helper already terminated the
 * subrequest; false means the data is no longer inline, so fall
 * through to a regular OSD read.
 */
if (ceph_has_inline_data(ci) && ceph_netfs_issue_op_inline(subreq))
return;
ceph_fscrypt_adjust_off_and_len(inode, &off, &len);
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino,
off, &len, 0, 1, sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ,
CEPH_OSD_FLAG_READ, NULL, ci->i_truncate_seq,
ci->i_truncate_size, false);
if (IS_ERR(req)) {
err = PTR_ERR(req);
/* keep the put at "out" from seeing an error pointer */
req = NULL;
goto out;
}
if (sparse) {
extent_cnt = __ceph_sparse_read_ext_count(inode, len);
err = ceph_alloc_sparse_ext_map(&req->r_ops[0], extent_cnt);
if (err)
goto out;
}
doutc(cl, "%llx.%llx pos=%llu orig_len=%zu len=%llu\n",
ceph_vinop(inode), subreq->start, subreq->len, len);
iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len);
/*
 * FIXME: For now, use CEPH_OSD_DATA_TYPE_PAGES instead of _ITER for
 * encrypted inodes. We'd need infrastructure that handles an iov_iter
 * instead of page arrays, and we don't have that as of yet. Once the
 * dust settles on the write helpers and encrypt/decrypt routines for
 * netfs, we should be able to rework this.
 */
if (IS_ENCRYPTED(inode)) {
struct page **pages;
size_t page_off;
err = iov_iter_get_pages_alloc2(&iter, &pages, len, &page_off);
if (err < 0) {
doutc(cl, "%llx.%llx failed to allocate pages, %d\n",
ceph_vinop(inode), err);
goto out;
}
/* should always give us a page-aligned read */
WARN_ON_ONCE(page_off);
/* a non-negative return is the number of bytes covered */
len = err;
err = 0;
osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false,
false);
} else {
osd_req_op_extent_osd_iter(req, 0, &iter);
}
/*
 * NOTE(review): if this fails for an encrypted inode, the page array
 * obtained above is not visibly released in this function - confirm
 * whether it is freed elsewhere.
 */
if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) {
err = -EIO;
goto out;
}
req->r_callback = finish_netfs_read;
req->r_priv = subreq;
req->r_inode = inode;
/* this inode reference is dropped by finish_netfs_read() */
ihold(inode);
ceph_osdc_start_request(req->r_osdc, req);
out:
/*
 * Drop this function's request reference (req is NULL when we bailed
 * out before or during allocation).  The subrequest is terminated here
 * only on error; on success finish_netfs_read() terminates it.
 */
ceph_osdc_put_request(req);
if (err)
netfs_subreq_terminated(subreq, err, false);
doutc(cl, "%llx.%llx result %d\n", ceph_vinop(inode), err);
}
/*
 * Set up ceph's private data for a netfs request.
 *
 * Only readahead requests get private data; every other origin returns 0
 * immediately.  For readahead the file's readahead settings are recorded
 * and, unless the file already has an rw context, read/cache caps are
 * requested with ceph_try_get_caps().
 *
 * Returns a negative errno on failure (-ENOMEM, the cap error, or -EACCES
 * when the cache cap could not be obtained), 0 when no caps had to be
 * taken, or the positive ceph_try_get_caps() result on success.
 */
static int ceph_init_request(struct netfs_io_request *rreq, struct file *file)
{
	struct inode *inode = rreq->inode;
	struct ceph_client *cl = ceph_inode_to_client(inode);
	struct ceph_netfs_request_data *priv;
	const int want = CEPH_CAP_FILE_CACHE;
	int got = 0;
	int ret;

	if (rreq->origin != NETFS_READAHEAD)
		return 0;

	priv = kzalloc(sizeof(*priv), GFP_NOFS);
	if (!priv)
		return -ENOMEM;

	if (file) {
		struct ceph_file_info *fi = file->private_data;

		priv->file_ra_pages = file->f_ra.ra_pages;
		priv->file_ra_disabled = file->f_mode & FMODE_RANDOM;

		/* an rw context exists: no caps are taken here */
		if (ceph_find_rw_context(fi)) {
			rreq->netfs_priv = priv;
			return 0;
		}
	}

	/*
	 * readahead callers do not necessarily hold Fcb caps
	 * (e.g. fadvise, madvise).
	 */
	ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, true, &got);
	if (ret < 0) {
		doutc(cl, "%llx.%llx, error getting cap\n", ceph_vinop(inode));
		goto free_priv;
	}

	if (!(got & want)) {
		doutc(cl, "%llx.%llx, no cache cap\n", ceph_vinop(inode));
		ret = -EACCES;
		goto free_priv;
	}

	if (ret == 0) {
		ret = -EACCES;
		goto free_priv;
	}

	/* remember the caps so the free_request hook can release them */
	priv->caps = got;
	rreq->netfs_priv = priv;
	return ret;

free_priv:
	kfree(priv);
	return ret;
}
/*
 * Release the private data attached to a netfs request, dropping any cap
 * references recorded in it.  Does nothing when no private data is set.
 */
static void ceph_netfs_free_request(struct netfs_io_request *rreq)
{
	struct ceph_netfs_request_data *req_data = rreq->netfs_priv;

	if (req_data) {
		if (req_data->caps)
			ceph_put_cap_refs(ceph_inode(rreq->inode),
					  req_data->caps);
		kfree(req_data);
		rreq->netfs_priv = NULL;
	}
}
/*
 * Table of netfs request operations for ceph: request setup/teardown, read
 * issuing, readahead expansion, subrequest length clamping and the
 * write_begin check.
 */
const struct netfs_request_ops ceph_netfs_ops = {
.init_request = ceph_init_request,
.free_request = ceph_netfs_free_request,
.issue_read = ceph_netfs_issue_read,
.expand_readahead = ceph_netfs_expand_readahead,
.clamp_length = ceph_netfs_clamp_length,
.check_write_begin = ceph_netfs_check_write_begin,
};
#ifdef CONFIG_CEPH_FSCACHE
/*
 * Completion callback for a cache write: any error other than -ENOBUFS
 * invalidates the inode's cache.  @priv is the inode.
 */
static void ceph_fscache_write_terminated(void *priv, ssize_t error, bool was_async)
{
	struct inode *inode = priv;

	if (!IS_ERR_VALUE(error) || error == -ENOBUFS)
		return;

	ceph_fscache_invalidate(inode, false);
}

/*
 * Write the range [off, off + len) of the inode's page cache to the cache
 * identified by the inode's fscache cookie.
 */
static void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, bool caching)
{
	struct fscache_cookie *cookie = ceph_fscache_cookie(ceph_inode(inode));

	fscache_write_to_cache(cookie, inode->i_mapping, off, len,
			       i_size_read(inode),
			       ceph_fscache_write_terminated, inode,
			       true, caching);
}
#else
/* Without CONFIG_CEPH_FSCACHE there is no cache to write to. */
static inline void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, bool caching)
{
}
#endif /* CONFIG_CEPH_FSCACHE */
/*
 * Per-writeback parameters filled in by get_oldest_context() for the snap
 * context that is being written back.
 */
struct ceph_writeback_ctl
{
/* size to write against: capsnap->size, or the live i_size */
loff_t i_size;
/* truncate_size/truncate_seq of the chosen capsnap, or of the inode */
u64 truncate_size;
u32 truncate_seq;
/* true only when i_size came from a capsnap that is not being written */
bool size_stable;
/* true when the chosen context is the inode's head snap context */
bool head_snapc;
};
/*
 * Get ref for the oldest snapc for an inode with dirty data... that is, the
 * only snap context we are allowed to write back.
 *
 * @ctl, when non-NULL, is filled with the size and truncate parameters that
 * go with the chosen context.  @page_snapc, when non-NULL, is the snap
 * context of the page the caller is interested in.
 *
 * Returns a referenced snap context, or NULL when neither a cap snap nor the
 * head has dirty pages.  The lookup runs under i_ceph_lock.
 */
static struct ceph_snap_context *
get_oldest_context(struct inode *inode, struct ceph_writeback_ctl *ctl,
struct ceph_snap_context *page_snapc)
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_snap_context *snapc = NULL;
struct ceph_cap_snap *capsnap = NULL;
spin_lock(&ci->i_ceph_lock);
list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
doutc(cl, " capsnap %p snapc %p has %d dirty pages\n",
capsnap, capsnap->context, capsnap->dirty_pages);
if (!capsnap->dirty_pages)
continue;
/* get i_size, truncate_{seq,size} for page_snapc? */
/*
 * Once the oldest context has been picked, keep walking only to
 * find the capsnap that matches page_snapc.
 */
if (snapc && capsnap->context != page_snapc)
continue;
if (ctl) {
if (capsnap->writing) {
/* capsnap still being written: use the live size */
ctl->i_size = i_size_read(inode);
ctl->size_stable = false;
} else {
ctl->i_size = capsnap->size;
ctl->size_stable = true;
}
ctl->truncate_size = capsnap->truncate_size;
ctl->truncate_seq = capsnap->truncate_seq;
ctl->head_snapc = false;
}
/* second hit: ctl now describes page_snapc's capsnap, done */
if (snapc)
break;
/* first capsnap with dirty pages: this is the oldest context */
snapc = ceph_get_snap_context(capsnap->context);
/*
 * Stop here unless page_snapc is a different context whose seq is
 * not greater than the oldest one's; in that case continue so that
 * ctl can be refilled from page_snapc's own capsnap.
 */
if (!page_snapc ||
page_snapc == snapc ||
page_snapc->seq > snapc->seq)
break;
}
/* no capsnap has dirty pages: fall back to the head context */
if (!snapc && ci->i_wrbuffer_ref_head) {
snapc = ceph_get_snap_context(ci->i_head_snapc);
doutc(cl, " head snapc %p has %d dirty pages\n", snapc,
ci->i_wrbuffer_ref_head);
if (ctl) {
ctl->i_size = i_size_read(inode);
ctl->truncate_size = ci->i_truncate_size;
ctl->truncate_seq = ci->i_truncate_seq;
ctl->size_stable = false;
ctl->head_snapc = true;
}
}
spin_unlock(&ci->i_ceph_lock);
return snapc;
}
/*
 * Compute how many bytes of @page, counted from file offset @start, should
 * be written back.
 *
 * The end of the data is the inode size, or the size recorded in the page's
 * capsnap when the page belongs to a non-head snap context whose capsnap is
 * not being written, capped at the end of the page.  For fscrypt bounce
 * pages a non-zero length is rounded up to CEPH_FSCRYPT_BLOCK_SIZE.
 * Returns 0 when the end does not lie beyond @start.
 */
static u64 get_writepages_data_length(struct inode *inode,
				      struct page *page, u64 start)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_snap_context *snapc;
	struct ceph_cap_snap *capsnap = NULL;
	u64 end = i_size_read(inode);
	u64 page_end;
	u64 ret;

	snapc = page_snap_context(ceph_fscrypt_pagecache_page(page));
	if (snapc != ci->i_head_snapc) {
		bool found = false;

		spin_lock(&ci->i_ceph_lock);
		list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
			if (capsnap->context != snapc)
				continue;
			if (!capsnap->writing)
				end = capsnap->size;
			found = true;
			break;
		}
		spin_unlock(&ci->i_ceph_lock);
		WARN_ON(!found);
	}

	page_end = ceph_fscrypt_page_offset(page) + thp_size(page);
	if (end > page_end)
		end = page_end;

	if (end <= start)
		return 0;

	ret = end - start;
	if (fscrypt_is_bounce_page(page))
		ret = round_up(ret, CEPH_FSCRYPT_BLOCK_SIZE);
	return ret;
}
/*
* Write a single page, but leave the page locked.
*
* If we get a write error, mark the mapping for error, but still adjust the
* dirty page accounting (i.e., page is no longer dirty).
*/
static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
{
struct folio *folio = page_folio(page);
struct inode *inode = page->mapping->host;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
struct ceph_client *cl = fsc->client;
struct ceph_snap_context *snapc, *oldest;
loff_t page_off = page_offset(page);
int err;
loff_t len = thp_size(page);
loff_t wlen;
struct ceph_writeback_ctl ceph_wbc;
struct ceph_osd_client *osdc = &fsc->client->osdc;
struct ceph_osd_request *req;
bool caching = ceph_is_cache_enabled(inode);
struct page *bounce_page = NULL;
doutc(cl, "%llx.%llx page %p idx %lu\n", ceph_vinop(inode), page,
page->index);
if (ceph_inode_is_shutdown(inode))
return -EIO;
/* verify this is a writeable snap context */
snapc = page_snap_context(page);
if (!snapc) {
doutc(cl, "%llx.%llx page %p not dirty?\n", ceph_vinop(inode),
page);
return 0;
}
oldest = get_oldest_context(inode, &ceph_wbc, snapc);
if (snapc->seq > oldest->seq) {
doutc(cl, "%llx.%llx page %p snapc %p not writeable - noop\n",
ceph_vinop(inode), page, snapc);
/* we should only noop if called by kswapd */
WARN_ON(!(current->flags & PF_MEMALLOC));
ceph_put_snap_context(oldest);
redirty_page_for_writepage(wbc, page);
return 0;
}
ceph_put_snap_context(oldest);
/* is this a partial page at end of file? */
if (page_off >= ceph_wbc.i_size) {
doutc(cl, "%llx.%llx folio at %lu beyond eof %llu\n",
ceph_vinop(inode), folio->index, ceph_wbc.i_size);
folio_invalidate(folio, 0, folio_size(folio));
return 0;
}
if (ceph_wbc.i_size < page_off + len)
len = ceph_wbc.i_size - page_off;
wlen = IS_ENCRYPTED(inode) ? round_up(len, CEPH_FSCRYPT_BLOCK_SIZE) : len;
doutc(cl, "%llx.%llx page %p index %lu on %llu~%llu snapc %p seq %lld\n",
ceph_vinop(inode), page, page->index, page_off, wlen, snapc,
snapc->seq);
if (atomic_long_inc_return(&fsc->writeback_count) >
CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
fsc->write_congested = true;
req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode),
page_off, &wlen, 0, 1, CEPH_OSD_OP_WRITE,
CEPH_OSD_FLAG_WRITE, snapc,
ceph_wbc.truncate_seq,
ceph_wbc.truncate_size, true);
if (IS_ERR(req)) {
redirty_page_for_writepage(wbc, page);
return PTR_ERR(req);
}
if (wlen < len)
len = wlen;
set_page_writeback(page);
ceph_fscache_write_to_cache(inode, page_off, len, caching);
if (IS_ENCRYPTED(inode)) {
bounce_page = fscrypt_encrypt_pagecache_blocks(page,
CEPH_FSCRYPT_BLOCK_SIZE, 0,
GFP_NOFS);
if (IS_ERR(bounce_page)) {
redirty_page_for_writepage(wbc, page);
end_page_writeback(page);
ceph_osdc_put_request(req);
return PTR_ERR(bounce_page);
}
}
/* it may be a short write due to an object boundary */
WARN_ON_ONCE(len > thp_size(page));
osd_req_op_extent_osd_data_pages(req, 0,
bounce_page ? &bounce_page : &page, wlen, 0,
false, false);
doutc(cl, "%llx.%llx %llu~%llu (%llu bytes, %sencrypted)\n",
ceph_vinop(inode), page_off, len, wlen,
IS_ENCRYPTED(inode) ? "" : "not ");
req->r_mtime = inode_get_mtime(inode);
ceph_osdc_start_request(osdc, req);
err = ceph_osdc_wait_request(osdc, req);
ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
req->r_end_latency, len, err);
fscrypt_free_bounce_page(bounce_page);
ceph_osdc_put_request(req);
if (err == 0)
err = len;
if (err < 0) {
struct writeback_control tmp_wbc;
if (!wbc)
wbc = &tmp_wbc;
if (err == -ERESTARTSYS) {
/* killed by SIGKILL */
doutc(cl, "%llx.%llx interrupted page %p\n",
ceph_vinop(inode), page);
redirty_page_for_writepage(wbc, page);
end_page_writeback(page);
return err;
}
if (err == -EBLOCKLISTED)
fsc->blocklisted = true;
doutc(cl, "%llx.%llx setting page/mapping error %d %p\n",
ceph_vinop(inode), err, page);
mapping_set_error(&inode->i_data, err);
wbc->pages_skipped++;
} else {
doutc(cl, "%llx.%llx cleaned page %p\n",
ceph_vinop(inode), page);
err = 0; /* vfs expects us to return 0 */
}
oldest = detach_page_private(page);
WARN_ON_ONCE(oldest != snapc);
end_page_writeback(page);
ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
ceph_put_snap_context(snapc); /* page's reference */
if (atomic_long_dec_return(&fsc->writeback_count) <
CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
fsc->write_congested = false;
return err;
}
/*
 * ->writepage: write back a single locked page.
 *
 * For WB_SYNC_NONE writeback while the client is write-congested, the page
 * is redirtied and AOP_WRITEPAGE_ACTIVATE is returned with the page still
 * locked.  Otherwise the page is written with writepage_nounlock() and then
 * unlocked.
 *
 * The inode reference is taken only once we are committed to writing, so
 * the congestion early return does not leave a reference behind.
 *
 * Returns 0, AOP_WRITEPAGE_ACTIVATE, or a negative errno.
 */
static int ceph_writepage(struct page *page, struct writeback_control *wbc)
{
	int err;
	struct inode *inode = page->mapping->host;

	BUG_ON(!inode);

	if (wbc->sync_mode == WB_SYNC_NONE &&
	    ceph_inode_to_fs_client(inode)->write_congested) {
		redirty_page_for_writepage(wbc, page);
		return AOP_WRITEPAGE_ACTIVATE;
	}

	/* hold the inode across the write; released after the page unlock */
	ihold(inode);

	err = writepage_nounlock(page, wbc);
	if (err == -ERESTARTSYS) {
		/* direct memory reclaimer was killed by SIGKILL. return 0
		 * to prevent caller from setting mapping/page error */
		err = 0;
	}
	unlock_page(page);
	iput(inode);
	return err;
}
/*
 * async writeback completion handler.
 *
 * If we get an error, set the mapping error bit, but not the individual
 * page error bits.
 *
 * For every write op in the request, each page has its snap context
 * reference dropped, its writeback ended and is unlocked; the page vector is
 * then released, write metrics are recorded, the dirty-buffer cap references
 * for all pages are dropped, and the request reference and OSD stopping
 * blocker are released.
 */
static void writepages_finish(struct ceph_osd_request *req)
{
struct inode *inode = req->r_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_osd_data *osd_data;
struct page *page;
int num_pages, total_pages = 0;
int i, j;
int rc = req->r_result;
struct ceph_snap_context *snapc = req->r_snapc;
struct address_space *mapping = inode->i_mapping;
struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
unsigned int len = 0;
bool remove_page;
doutc(cl, "%llx.%llx rc %d\n", ceph_vinop(inode), rc);
if (rc < 0) {
mapping_set_error(mapping, rc);
ceph_set_error_write(ci);
if (rc == -EBLOCKLISTED)
fsc->blocklisted = true;
} else {
ceph_clear_error_write(ci);
}
/*
 * We lost the cache cap, need to truncate the page before
 * it is unlocked, otherwise we'd truncate it later in the
 * page truncation thread, possibly losing some data that
 * raced its way in
 */
remove_page = !(ceph_caps_issued(ci) &
(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO));
/* clean all pages */
for (i = 0; i < req->r_num_ops; i++) {
/* only write ops are expected; stop at the first other op */
if (req->r_ops[i].op != CEPH_OSD_OP_WRITE) {
pr_warn_client(cl,
"%llx.%llx incorrect op %d req %p index %d tid %llu\n",
ceph_vinop(inode), req->r_ops[i].op, req, i,
req->r_tid);
break;
}
osd_data = osd_req_op_extent_osd_data(req, i);
BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
len += osd_data->length;
num_pages = calc_pages_for((u64)osd_data->alignment,
(u64)osd_data->length);
total_pages += num_pages;
for (j = 0; j < num_pages; j++) {
page = osd_data->pages[j];
/*
 * Encrypted writes carry bounce pages: free the bounce page and
 * put the pagecache page it was made from back into the vector.
 */
if (fscrypt_is_bounce_page(page)) {
page = fscrypt_pagecache_page(page);
fscrypt_free_bounce_page(osd_data->pages[j]);
osd_data->pages[j] = page;
}
BUG_ON(!page);
WARN_ON(!PageUptodate(page));
/* one fewer page in flight; maybe leave the congested state */
if (atomic_long_dec_return(&fsc->writeback_count) <
CONGESTION_OFF_THRESH(
fsc->mount_options->congestion_kb))
fsc->write_congested = false;
ceph_put_snap_context(detach_page_private(page));
end_page_writeback(page);
doutc(cl, "unlocking %p\n", page);
if (remove_page)
generic_error_remove_folio(inode->i_mapping,
page_folio(page));
unlock_page(page);
}
doutc(cl, "%llx.%llx wrote %llu bytes cleaned %d pages\n",
ceph_vinop(inode), osd_data->length,
rc >= 0 ? num_pages : 0);
release_pages(osd_data->pages, num_pages);
}
ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
req->r_end_latency, len, rc);
ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc);
/*
 * Free the page pointer array of op 0 the way it was allocated: back to
 * the writeback mempool, or with kfree().
 */
osd_data = osd_req_op_extent_osd_data(req, 0);
if (osd_data->pages_from_pool)
mempool_free(osd_data->pages, ceph_wb_pagevec_pool);
else
kfree(osd_data->pages);
ceph_osdc_put_request(req);
ceph_dec_osd_stopping_blocker(fsc->mdsc);
}
/*
* initiate async writeback
*/
static int ceph_writepages_start(struct address_space *mapping,
struct writeback_control *wbc)
{
struct inode *inode = mapping->host;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
struct ceph_client *cl = fsc->client;
struct ceph_vino vino = ceph_vino(inode);
pgoff_t index, start_index, end = -1;
struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc;
struct folio_batch fbatch;
int rc = 0;
unsigned int wsize = i_blocksize(inode);
struct ceph_osd_request *req = NULL;
struct ceph_writeback_ctl ceph_wbc;
bool should_loop, range_whole = false;
bool done = false;
bool caching = ceph_is_cache_enabled(inode);
xa_mark_t tag;
if (wbc->sync_mode == WB_SYNC_NONE &&
fsc->write_congested)
return 0;
doutc(cl, "%llx.%llx (mode=%s)\n", ceph_vinop(inode),
wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
(wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
if (ceph_inode_is_shutdown(inode)) {
if (ci->i_wrbuffer_ref > 0) {
pr_warn_ratelimited_client(cl,
"%llx.%llx %lld forced umount\n",
ceph_vinop(inode), ceph_ino(inode));
}
mapping_set_error(mapping, -EIO);
return -EIO; /* we're in a forced umount, don't write! */
}
if (fsc->mount_options->wsize < wsize)
wsize = fsc->mount_options->wsize;
folio_batch_init(&fbatch);
start_index = wbc->range_cyclic ? mapping->writeback_index : 0;
index = start_index;
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) {
tag = PAGECACHE_TAG_TOWRITE;
} else {
tag = PAGECACHE_TAG_DIRTY;
}
retry:
/* find oldest snap context with dirty data */
snapc = get_oldest_context(inode, &ceph_wbc, NULL);
if (!snapc) {
/* hmm, why does writepages get called when there
is no dirty data? */
doutc(cl, " no snap context with dirty data?\n");
goto out;
}
doutc(cl, " oldest snapc is %p seq %lld (%d snaps)\n", snapc,
snapc->seq, snapc->num_snaps);
should_loop = false;
if (ceph_wbc.head_snapc && snapc != last_snapc) {
/* where to start/end? */
if (wbc->range_cyclic) {
index = start_index;
end = -1;
if (index > 0)
should_loop = true;
doutc(cl, " cyclic, start at %lu\n", index);
} else {
index = wbc->range_start >> PAGE_SHIFT;
end = wbc->range_end >> PAGE_SHIFT;
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = true;
doutc(cl, " not cyclic, %lu to %lu\n", index, end);
}
} else if (!ceph_wbc.head_snapc) {
/* Do not respect wbc->range_{start,end}. Dirty pages
* in that range can be associated with newer snapc.
* They are not writeable until we write all dirty pages
* associated with 'snapc' get written */
if (index > 0)
should_loop = true;
doutc(cl, " non-head snapc, range whole\n");
}
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag_pages_for_writeback(mapping, index, end);
ceph_put_snap_context(last_snapc);