forked from torvalds/linux
-
Notifications
You must be signed in to change notification settings - Fork 0
/
aops.c
1638 lines (1563 loc) · 48 KB
/
aops.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/**
* aops.c - NTFS kernel address space operations and page cache handling.
* Part of the Linux-NTFS project.
*
* Copyright (c) 2001-2007 Anton Altaparmakov
* Copyright (c) 2002 Richard Russon
*
* This program/include file is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as published
* by the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program/include file is distributed in the hope that it will be
* useful, but WITHOUT ANY WARRANTY; without even the implied warranty
* of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program (in the main directory of the Linux-NTFS
* distribution in the file COPYING); if not, write to the Free Software
* Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/bit_spinlock.h>
#include "aops.h"
#include "attrib.h"
#include "debug.h"
#include "inode.h"
#include "mft.h"
#include "runlist.h"
#include "types.h"
#include "ntfs.h"
/**
* ntfs_end_buffer_async_read - async io completion for reading attributes
* @bh: buffer head on which io is completed
* @uptodate: whether @bh is now uptodate or not
*
* Asynchronous I/O completion handler for reading pages belonging to the
* attribute address space of an inode. The inodes can either be files or
* directories or they can be fake inodes describing some attribute.
*
* If NInoMstProtected(), perform the post read mst fixups when all IO on the
* page has been completed and mark the page uptodate or set the error bit on
* the page. To determine the size of the records that need fixing up, we
* cheat a little bit by setting the index_block_size in ntfs_inode to the ntfs
* record size, and index_block_size_bits, to the log(base 2) of the ntfs
* record size.
*/
static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
{
unsigned long flags;
struct buffer_head *first, *tmp;
struct page *page;
struct inode *vi;
ntfs_inode *ni;
int page_uptodate = 1;
page = bh->b_page;
vi = page->mapping->host;
ni = NTFS_I(vi);
if (likely(uptodate)) {
loff_t i_size;
s64 file_ofs, init_size;
set_buffer_uptodate(bh);
file_ofs = ((s64)page->index << PAGE_CACHE_SHIFT) +
bh_offset(bh);
read_lock_irqsave(&ni->size_lock, flags);
init_size = ni->initialized_size;
i_size = i_size_read(vi);
read_unlock_irqrestore(&ni->size_lock, flags);
if (unlikely(init_size > i_size)) {
/* Race with shrinking truncate. */
init_size = i_size;
}
/* Check for the current buffer head overflowing. */
if (unlikely(file_ofs + bh->b_size > init_size)) {
int ofs;
void *kaddr;
ofs = 0;
if (file_ofs < init_size)
ofs = init_size - file_ofs;
local_irq_save(flags);
kaddr = kmap_atomic(page);
memset(kaddr + bh_offset(bh) + ofs, 0,
bh->b_size - ofs);
flush_dcache_page(page);
kunmap_atomic(kaddr);
local_irq_restore(flags);
}
} else {
clear_buffer_uptodate(bh);
SetPageError(page);
ntfs_error(ni->vol->sb, "Buffer I/O error, logical block "
"0x%llx.", (unsigned long long)bh->b_blocknr);
}
first = page_buffers(page);
local_irq_save(flags);
bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
clear_buffer_async_read(bh);
unlock_buffer(bh);
tmp = bh;
do {
if (!buffer_uptodate(tmp))
page_uptodate = 0;
if (buffer_async_read(tmp)) {
if (likely(buffer_locked(tmp)))
goto still_busy;
/* Async buffers must be locked. */
BUG();
}
tmp = tmp->b_this_page;
} while (tmp != bh);
bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
local_irq_restore(flags);
/*
* If none of the buffers had errors then we can set the page uptodate,
* but we first have to perform the post read mst fixups, if the
* attribute is mst protected, i.e. if NInoMstProteced(ni) is true.
* Note we ignore fixup errors as those are detected when
* map_mft_record() is called which gives us per record granularity
* rather than per page granularity.
*/
if (!NInoMstProtected(ni)) {
if (likely(page_uptodate && !PageError(page)))
SetPageUptodate(page);
} else {
u8 *kaddr;
unsigned int i, recs;
u32 rec_size;
rec_size = ni->itype.index.block_size;
recs = PAGE_CACHE_SIZE / rec_size;
/* Should have been verified before we got here... */
BUG_ON(!recs);
local_irq_save(flags);
kaddr = kmap_atomic(page);
for (i = 0; i < recs; i++)
post_read_mst_fixup((NTFS_RECORD*)(kaddr +
i * rec_size), rec_size);
kunmap_atomic(kaddr);
local_irq_restore(flags);
flush_dcache_page(page);
if (likely(page_uptodate && !PageError(page)))
SetPageUptodate(page);
}
unlock_page(page);
return;
still_busy:
bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
local_irq_restore(flags);
return;
}
/**
* ntfs_read_block - fill a @page of an address space with data
* @page: page cache page to fill with data
*
* Fill the page @page of the address space belonging to the
* @page->mapping->host inode.
* We read each buffer asynchronously and when all buffers are read in, our io
* completion handler ntfs_end_buffer_async_read(), if required, automatically
* applies the mst fixups to the page before finally marking it uptodate and
* unlocking it.
*
* We only enforce allocated_size limit because i_size is checked for in
* generic_file_read().
*
* The page is unlocked in this function on every path that does not start
* i/o (failure to attach buffers, or no buffer needing a read). When i/o is
* started, unlocking the page is left to the completion handler.
*
* Return 0 on success and -errno on error.
*
* Contains an adapted version of fs/buffer.c::block_read_full_page().
*/
static int ntfs_read_block(struct page *page)
{
loff_t i_size;
VCN vcn;
LCN lcn;
s64 init_size;
struct inode *vi;
ntfs_inode *ni;
ntfs_volume *vol;
runlist_element *rl;
/* arr[] collects the buffers that need to be read from disk. */
struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
/*
* iblock: block currently being processed, lblock: first block beyond
* allocated_size, zblock: first block beyond the initialized size.
*/
sector_t iblock, lblock, zblock;
unsigned long flags;
unsigned int blocksize, vcn_ofs;
int i, nr;
unsigned char blocksize_bits;
vi = page->mapping->host;
ni = NTFS_I(vi);
vol = ni->vol;
/* $MFT/$DATA must have its complete runlist in memory at all times. */
BUG_ON(!ni->runlist.rl && !ni->mft_no && !NInoAttr(ni));
blocksize = vol->sb->s_blocksize;
blocksize_bits = vol->sb->s_blocksize_bits;
/* Attach buffers to the page if it has none yet. */
if (!page_has_buffers(page)) {
create_empty_buffers(page, blocksize, 0);
if (unlikely(!page_has_buffers(page))) {
unlock_page(page);
return -ENOMEM;
}
}
bh = head = page_buffers(page);
BUG_ON(!bh);
/*
* We may be racing with truncate. To avoid some of the problems we
* now take a snapshot of the various sizes and use those for the whole
* of the function. In case of an extending truncate it just means we
* may leave some buffers unmapped which are now allocated. This is
* not a problem since these buffers will just get mapped when a write
* occurs. In case of a shrinking truncate, we will detect this later
* on due to the runlist being incomplete and if the page is being
* fully truncated, truncate will throw it away as soon as we unlock
* it so no need to worry what we do with it.
*/
iblock = (s64)page->index << (PAGE_CACHE_SHIFT - blocksize_bits);
read_lock_irqsave(&ni->size_lock, flags);
lblock = (ni->allocated_size + blocksize - 1) >> blocksize_bits;
init_size = ni->initialized_size;
i_size = i_size_read(vi);
read_unlock_irqrestore(&ni->size_lock, flags);
if (unlikely(init_size > i_size)) {
/* Race with shrinking truncate. */
init_size = i_size;
}
zblock = (init_size + blocksize - 1) >> blocksize_bits;
/*
* Loop through all the buffers in the page. Note that "continue" jumps
* to the loop condition, so i, iblock and bh are still advanced.
*/
rl = NULL;
nr = i = 0;
do {
/* Stays zero unless the block location cannot be determined. */
int err = 0;
if (unlikely(buffer_uptodate(bh)))
continue;
/* Already mapped but not uptodate: just queue it for reading. */
if (unlikely(buffer_mapped(bh))) {
arr[nr++] = bh;
continue;
}
bh->b_bdev = vol->sb->s_bdev;
/* Is the block within the allowed limits? */
if (iblock < lblock) {
bool is_retry = false;
/* Convert iblock into corresponding vcn and offset. */
vcn = (VCN)iblock << blocksize_bits >>
vol->cluster_size_bits;
vcn_ofs = ((VCN)iblock << blocksize_bits) &
vol->cluster_size_mask;
/*
* The runlist lock is taken on first use and, as long as rl is
* non-NULL, kept across loop iterations; it is dropped after the loop.
*/
if (!rl) {
lock_retry_remap:
down_read(&ni->runlist.lock);
rl = ni->runlist.rl;
}
if (likely(rl != NULL)) {
/* Seek to element containing target vcn. */
while (rl->length && rl[1].vcn <= vcn)
rl++;
lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
} else
lcn = LCN_RL_NOT_MAPPED;
/* Successful remap. */
if (lcn >= 0) {
/* Setup buffer head to correct block. */
bh->b_blocknr = ((lcn << vol->cluster_size_bits)
+ vcn_ofs) >> blocksize_bits;
set_buffer_mapped(bh);
/* Only read initialized data blocks. */
if (iblock < zblock) {
arr[nr++] = bh;
continue;
}
/* Fully non-initialized data block, zero it. */
goto handle_zblock;
}
/* It is a hole, need to zero it. */
if (lcn == LCN_HOLE)
goto handle_hole;
/* If first try and runlist unmapped, map and retry. */
if (!is_retry && lcn == LCN_RL_NOT_MAPPED) {
is_retry = true;
/*
* Attempt to map runlist, dropping lock for
* the duration.
*/
up_read(&ni->runlist.lock);
err = ntfs_map_runlist(ni, vcn);
if (likely(!err))
goto lock_retry_remap;
/* Mapping failed and the lock is no longer held. */
rl = NULL;
} else if (!rl)
/* Lock was taken above but there is no runlist; drop it now. */
up_read(&ni->runlist.lock);
/*
* If buffer is outside the runlist, treat it as a
* hole. This can happen due to concurrent truncate
* for example.
*/
if (err == -ENOENT || lcn == LCN_ENOENT) {
err = 0;
goto handle_hole;
}
/* Hard error, zero out region. */
if (!err)
err = -EIO;
bh->b_blocknr = -1;
SetPageError(page);
ntfs_error(vol->sb, "Failed to read from inode 0x%lx, "
"attribute type 0x%x, vcn 0x%llx, "
"offset 0x%x because its location on "
"disk could not be determined%s "
"(error code %i).", ni->mft_no,
ni->type, (unsigned long long)vcn,
vcn_ofs, is_retry ? " even after "
"retrying" : "", err);
}
/*
* Either iblock was outside lblock limits or
* ntfs_rl_vcn_to_lcn() returned error. Just zero that portion
* of the page and set the buffer uptodate.
*/
handle_hole:
bh->b_blocknr = -1UL;
clear_buffer_mapped(bh);
handle_zblock:
zero_user(page, i * blocksize, blocksize);
/* After a hard error the zeroed buffer is not marked uptodate. */
if (likely(!err))
set_buffer_uptodate(bh);
} while (i++, iblock++, (bh = bh->b_this_page) != head);
/* Release the lock if we took it. */
if (rl)
up_read(&ni->runlist.lock);
/* Check we have at least one buffer ready for i/o. */
if (nr) {
struct buffer_head *tbh;
/*
* Lock the buffers. All of them are locked and flagged async-read
* before any i/o is started.
*/
for (i = 0; i < nr; i++) {
tbh = arr[i];
lock_buffer(tbh);
tbh->b_end_io = ntfs_end_buffer_async_read;
set_buffer_async_read(tbh);
}
/* Finally, start i/o on the buffers. */
for (i = 0; i < nr; i++) {
tbh = arr[i];
if (likely(!buffer_uptodate(tbh)))
submit_bh(READ, tbh);
else
/* Already uptodate: complete it by hand instead of reading. */
ntfs_end_buffer_async_read(tbh, 1);
}
return 0;
}
/* No i/o was scheduled on any of the buffers. */
if (likely(!PageError(page)))
SetPageUptodate(page);
else /* Signal synchronous i/o error. */
nr = -EIO;
unlock_page(page);
/* nr is 0 here unless it was set to -EIO just above. */
return nr;
}
/**
 * ntfs_readpage - fill a @page of a @file with data from the device
 * @file:	open file to which the page @page belongs or NULL
 * @page:	page cache page to fill with data
 *
 * For non-resident attributes, ntfs_readpage() fills the @page of the open
 * file @file by calling the ntfs version of the generic block_read_full_page()
 * function, ntfs_read_block(), which in turn creates and reads in the buffers
 * associated with the page asynchronously.  Compressed non-resident data is
 * handed to ntfs_read_compressed_block() instead.
 *
 * For resident attributes, OTOH, ntfs_readpage() fills @page by copying the
 * data from the mft record (which at this stage is most likely in memory) and
 * fills the remainder with zeroes.  Thus, in this case, I/O is synchronous, as
 * even if the mft record is not cached at this point in time, we need to wait
 * for it to be read in before we can do the copy.
 *
 * @page must be locked on entry.  On the paths handled directly in this
 * function the page is unlocked before returning, and it is marked uptodate
 * only if its contents were actually established (copied or zeroed).  On
 * failure the page is left not uptodate so that stale contents are never
 * served from the page cache.
 *
 * Return 0 on success and -errno on error.
 */
static int ntfs_readpage(struct file *file, struct page *page)
{
	loff_t i_size;
	struct inode *vi;
	ntfs_inode *ni, *base_ni;
	u8 *addr;
	ntfs_attr_search_ctx *ctx;
	MFT_RECORD *mrec;
	unsigned long flags;
	u32 attr_len;
	int err = 0;

retry_readpage:
	BUG_ON(!PageLocked(page));
	vi = page->mapping->host;
	i_size = i_size_read(vi);
	/* Is the page fully outside i_size? (truncate in progress) */
	if (unlikely(page->index >= (i_size + PAGE_CACHE_SIZE - 1) >>
			PAGE_CACHE_SHIFT)) {
		zero_user(page, 0, PAGE_CACHE_SIZE);
		ntfs_debug("Read outside i_size - truncated?");
		goto done;
	}
	/*
	 * This can potentially happen because we clear PageUptodate() during
	 * ntfs_writepage() of MstProtected() attributes.
	 */
	if (PageUptodate(page)) {
		unlock_page(page);
		return 0;
	}
	ni = NTFS_I(vi);
	/*
	 * Only $DATA attributes can be encrypted and only unnamed $DATA
	 * attributes can be compressed.  Index root can have the flags set but
	 * this means to create compressed/encrypted files, not that the
	 * attribute is compressed/encrypted.  Note we need to check for
	 * AT_INDEX_ALLOCATION since this is the type of both directory and
	 * index inodes.
	 */
	if (ni->type != AT_INDEX_ALLOCATION) {
		/* If attribute is encrypted, deny access, just like NT4. */
		if (NInoEncrypted(ni)) {
			BUG_ON(ni->type != AT_DATA);
			err = -EACCES;
			goto err_out;
		}
		/* Compressed data streams are handled in compress.c. */
		if (NInoNonResident(ni) && NInoCompressed(ni)) {
			BUG_ON(ni->type != AT_DATA);
			BUG_ON(ni->name_len);
			return ntfs_read_compressed_block(page);
		}
	}
	/* NInoNonResident() == NInoIndexAllocPresent() */
	if (NInoNonResident(ni)) {
		/* Normal, non-resident data stream. */
		return ntfs_read_block(page);
	}
	/*
	 * Attribute is resident, implying it is not compressed or encrypted.
	 * This also means the attribute is smaller than an mft record and
	 * hence smaller than a page, so can simply zero out any pages with
	 * index above 0.  Note the attribute can actually be marked compressed
	 * but if it is resident the actual data is not compressed so we are
	 * ok to ignore the compressed flag here.
	 */
	if (unlikely(page->index > 0)) {
		zero_user(page, 0, PAGE_CACHE_SIZE);
		goto done;
	}
	if (!NInoAttr(ni))
		base_ni = ni;
	else
		base_ni = ni->ext.base_ntfs_ino;
	/* Map, pin, and lock the mft record. */
	mrec = map_mft_record(base_ni);
	if (IS_ERR(mrec)) {
		err = PTR_ERR(mrec);
		goto err_out;
	}
	/*
	 * If a parallel write made the attribute non-resident, drop the mft
	 * record and retry the readpage.
	 */
	if (unlikely(NInoNonResident(ni))) {
		unmap_mft_record(base_ni);
		goto retry_readpage;
	}
	ctx = ntfs_attr_get_search_ctx(base_ni, mrec);
	if (unlikely(!ctx)) {
		err = -ENOMEM;
		goto unm_err_out;
	}
	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
			CASE_SENSITIVE, 0, NULL, 0, ctx);
	if (unlikely(err))
		goto put_unm_err_out;
	attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);
	read_lock_irqsave(&ni->size_lock, flags);
	if (unlikely(attr_len > ni->initialized_size))
		attr_len = ni->initialized_size;
	i_size = i_size_read(vi);
	read_unlock_irqrestore(&ni->size_lock, flags);
	if (unlikely(attr_len > i_size)) {
		/* Race with shrinking truncate. */
		attr_len = i_size;
	}
	addr = kmap_atomic(page);
	/* Copy the data to the page. */
	memcpy(addr, (u8*)ctx->attr +
			le16_to_cpu(ctx->attr->data.resident.value_offset),
			attr_len);
	/* Zero the remainder of the page. */
	memset(addr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
	flush_dcache_page(page);
	kunmap_atomic(addr);
put_unm_err_out:
	ntfs_attr_put_search_ctx(ctx);
unm_err_out:
	unmap_mft_record(base_ni);
	/*
	 * The two error labels above are reached without the page having been
	 * filled in.  Do not fall through to marking it uptodate in that case,
	 * otherwise later readers would treat the unfilled page as valid data.
	 */
	if (unlikely(err))
		goto err_out;
done:
	SetPageUptodate(page);
err_out:
	unlock_page(page);
	return err;
}
#ifdef NTFS_RW
/**
* ntfs_write_block - write a @page to the backing store
* @page: page cache page to write out
* @wbc: writeback control structure
*
* This function is for writing pages belonging to non-resident, non-mst
* protected attributes to their backing store.
*
* For a page with buffers, map and write the dirty buffers asynchronously
* under page writeback. For a page without buffers, create buffers for the
* page, then proceed as above.
*
* If a page doesn't have buffers the page dirty state is definitive. If a page
* does have buffers, the page dirty state is just a hint, and the buffer dirty
* state is definitive. (A hint which has rules: dirty buffers against a clean
* page is illegal. Other combinations are legal and need to be handled. In
* particular a dirty page containing clean buffers for example.)
*
* The page is unlocked before returning. Writes that are not supported yet
* (beyond the initialized size, or of non-zero data into a sparse region) are
* logged and then reported as success (0). An -ENOMEM failure redirties the
* page and is also reported as success.
*
* Return 0 on success and -errno on error.
*
* Based on ntfs_read_block() and __block_write_full_page().
*/
static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
{
VCN vcn;
LCN lcn;
s64 initialized_size;
loff_t i_size;
/*
* block: block currently being processed, dblock: first block beyond
* i_size, iblock: block containing the initialized size.
*/
sector_t block, dblock, iblock;
struct inode *vi;
ntfs_inode *ni;
ntfs_volume *vol;
runlist_element *rl;
struct buffer_head *bh, *head;
unsigned long flags;
unsigned int blocksize, vcn_ofs;
int err;
bool need_end_writeback;
unsigned char blocksize_bits;
vi = page->mapping->host;
ni = NTFS_I(vi);
vol = ni->vol;
ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
"0x%lx.", ni->mft_no, ni->type, page->index);
BUG_ON(!NInoNonResident(ni));
BUG_ON(NInoMstProtected(ni));
blocksize = vol->sb->s_blocksize;
blocksize_bits = vol->sb->s_blocksize_bits;
if (!page_has_buffers(page)) {
/* Without buffers the page itself must already be uptodate. */
BUG_ON(!PageUptodate(page));
create_empty_buffers(page, blocksize,
(1 << BH_Uptodate) | (1 << BH_Dirty));
if (unlikely(!page_has_buffers(page))) {
ntfs_warning(vol->sb, "Error allocating page "
"buffers. Redirtying page so we try "
"again later.");
/*
* Put the page back on mapping->dirty_pages, but leave
* its buffers' dirty state as-is.
*/
redirty_page_for_writepage(wbc, page);
unlock_page(page);
return 0;
}
}
bh = head = page_buffers(page);
BUG_ON(!bh);
/* NOTE: Different naming scheme to ntfs_read_block()! */
/* The first block in the page. */
block = (s64)page->index << (PAGE_CACHE_SHIFT - blocksize_bits);
/* Snapshot the sizes under the size lock. */
read_lock_irqsave(&ni->size_lock, flags);
i_size = i_size_read(vi);
initialized_size = ni->initialized_size;
read_unlock_irqrestore(&ni->size_lock, flags);
/* The first out of bounds block for the data size. */
dblock = (i_size + blocksize - 1) >> blocksize_bits;
/* The last (fully or partially) initialized block. */
iblock = initialized_size >> blocksize_bits;
/*
* Be very careful. We have no exclusion from __set_page_dirty_buffers
* here, and the (potentially unmapped) buffers may become dirty at
* any time. If a buffer becomes dirty here after we've inspected it
* then we just miss that fact, and the page stays dirty.
*
* Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
* handle that here by just cleaning them.
*/
/*
* Loop through all the buffers in the page, mapping all the dirty
* buffers to disk addresses and handling any aliases from the
* underlying block device's mapping.
*
* Note that "continue" jumps to the loop condition, so block and bh are
* still advanced, whereas "break" leaves the loop with err set.
*/
rl = NULL;
err = 0;
do {
bool is_retry = false;
if (unlikely(block >= dblock)) {
/*
* Mapped buffers outside i_size will occur, because
* this page can be outside i_size when there is a
* truncate in progress. The contents of such buffers
* were zeroed by ntfs_writepage().
*
* FIXME: What about the small race window where
* ntfs_writepage() has not done any clearing because
* the page was within i_size but before we get here,
* vmtruncate() modifies i_size?
*/
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
continue;
}
/* Clean buffers are not written out, so no need to map them. */
if (!buffer_dirty(bh))
continue;
/* Make sure we have enough initialized size. */
if (unlikely((block >= iblock) &&
(initialized_size < i_size))) {
/*
* If this page is fully outside initialized size, zero
* out all pages between the current initialized size
* and the current page. Just use ntfs_readpage() to do
* the zeroing transparently.
*/
if (block > iblock) {
// TODO:
// For each page do:
// - read_cache_page()
// Again for each page do:
// - wait_on_page_locked()
// - Check (PageUptodate(page) &&
// !PageError(page))
// Update initialized size in the attribute and
// in the inode.
// Again, for each page do:
// __set_page_dirty_buffers();
// page_cache_release()
// We don't need to wait on the writes.
// Update iblock.
}
/*
* The current page straddles initialized size. Zero
* all non-uptodate buffers and set them uptodate (and
* dirty?). Note, there aren't any non-uptodate buffers
* if the page is uptodate.
* FIXME: For an uptodate page, the buffers may need to
* be written out because they were not initialized on
* disk before.
*/
if (!PageUptodate(page)) {
// TODO:
// Zero any non-uptodate buffers up to i_size.
// Set them uptodate and dirty.
}
// TODO:
// Update initialized size in the attribute and in the
// inode (up to i_size).
// Update iblock.
// FIXME: This is inefficient. Try to batch the two
// size changes to happen in one go.
/* Not implemented: log, record the error and stop mapping. */
ntfs_error(vol->sb, "Writing beyond initialized size "
"is not supported yet. Sorry.");
err = -EOPNOTSUPP;
break;
// Do NOT set_buffer_new() BUT DO clear buffer range
// outside write request range.
// set_buffer_uptodate() on complete buffers as well as
// set_buffer_dirty().
}
/* No need to map buffers that are already mapped. */
if (buffer_mapped(bh))
continue;
/* Unmapped, dirty buffer. Need to map it. */
bh->b_bdev = vol->sb->s_bdev;
/* Convert block into corresponding vcn and offset. */
vcn = (VCN)block << blocksize_bits;
vcn_ofs = vcn & vol->cluster_size_mask;
vcn >>= vol->cluster_size_bits;
/*
* The runlist lock is taken on first use and, as long as rl is
* non-NULL, kept across loop iterations; it is dropped after the loop.
*/
if (!rl) {
lock_retry_remap:
down_read(&ni->runlist.lock);
rl = ni->runlist.rl;
}
if (likely(rl != NULL)) {
/* Seek to element containing target vcn. */
while (rl->length && rl[1].vcn <= vcn)
rl++;
lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
} else
lcn = LCN_RL_NOT_MAPPED;
/* Successful remap. */
if (lcn >= 0) {
/* Setup buffer head to point to correct block. */
bh->b_blocknr = ((lcn << vol->cluster_size_bits) +
vcn_ofs) >> blocksize_bits;
set_buffer_mapped(bh);
continue;
}
/* It is a hole, need to instantiate it. */
if (lcn == LCN_HOLE) {
u8 *kaddr;
unsigned long *bpos, *bend;
/* Check if the buffer is zero. */
kaddr = kmap_atomic(page);
bpos = (unsigned long *)(kaddr + bh_offset(bh));
bend = (unsigned long *)((u8*)bpos + blocksize);
/* Scan one unsigned long at a time; stop at the first non-zero word. */
do {
if (unlikely(*bpos))
break;
} while (likely(++bpos < bend));
kunmap_atomic(kaddr);
if (bpos == bend) {
/*
* Buffer is zero and sparse, no need to write
* it.
*/
bh->b_blocknr = -1;
clear_buffer_dirty(bh);
continue;
}
// TODO: Instantiate the hole.
// clear_buffer_new(bh);
// unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
ntfs_error(vol->sb, "Writing into sparse regions is "
"not supported yet. Sorry.");
err = -EOPNOTSUPP;
break;
}
/* If first try and runlist unmapped, map and retry. */
if (!is_retry && lcn == LCN_RL_NOT_MAPPED) {
is_retry = true;
/*
* Attempt to map runlist, dropping lock for
* the duration.
*/
up_read(&ni->runlist.lock);
err = ntfs_map_runlist(ni, vcn);
if (likely(!err))
goto lock_retry_remap;
/* Mapping failed and the lock is no longer held. */
rl = NULL;
} else if (!rl)
/* Lock was taken above but there is no runlist; drop it now. */
up_read(&ni->runlist.lock);
/*
* If buffer is outside the runlist, truncate has cut it out
* of the runlist. Just clean and clear the buffer and set it
* uptodate so it can get discarded by the VM.
*/
if (err == -ENOENT || lcn == LCN_ENOENT) {
bh->b_blocknr = -1;
clear_buffer_dirty(bh);
zero_user(page, bh_offset(bh), blocksize);
set_buffer_uptodate(bh);
err = 0;
continue;
}
/* Failed to map the buffer, even after retrying. */
if (!err)
err = -EIO;
bh->b_blocknr = -1;
ntfs_error(vol->sb, "Failed to write to inode 0x%lx, "
"attribute type 0x%x, vcn 0x%llx, offset 0x%x "
"because its location on disk could not be "
"determined%s (error code %i).", ni->mft_no,
ni->type, (unsigned long long)vcn,
vcn_ofs, is_retry ? " even after "
"retrying" : "", err);
break;
} while (block++, (bh = bh->b_this_page) != head);
/* Release the lock if we took it. */
if (rl)
up_read(&ni->runlist.lock);
/* For the error case, need to reset bh to the beginning. */
bh = head;
/* Just an optimization, so ->readpage() is not called later. */
if (unlikely(!PageUptodate(page))) {
int uptodate = 1;
do {
if (!buffer_uptodate(bh)) {
uptodate = 0;
/* Restart the following passes from the first buffer. */
bh = head;
break;
}
} while ((bh = bh->b_this_page) != head);
if (uptodate)
SetPageUptodate(page);
}
/* Setup all mapped, dirty buffers for async write i/o. */
do {
if (buffer_mapped(bh) && buffer_dirty(bh)) {
lock_buffer(bh);
/* Re-test the dirty bit now that the buffer is locked. */
if (test_clear_buffer_dirty(bh)) {
BUG_ON(!buffer_uptodate(bh));
mark_buffer_async_write(bh);
} else
unlock_buffer(bh);
} else if (unlikely(err)) {
/*
* For the error case. The buffer may have been set
* dirty during attachment to a dirty page.
*/
if (err != -ENOMEM)
clear_buffer_dirty(bh);
}
} while ((bh = bh->b_this_page) != head);
/* Translate the error: only hard errors are returned to the caller. */
if (unlikely(err)) {
// TODO: Remove the -EOPNOTSUPP check later on...
if (unlikely(err == -EOPNOTSUPP))
err = 0;
else if (err == -ENOMEM) {
ntfs_warning(vol->sb, "Error allocating memory. "
"Redirtying page so we try again "
"later.");
/*
* Put the page back on mapping->dirty_pages, but
* leave its buffer's dirty state as-is.
*/
redirty_page_for_writepage(wbc, page);
err = 0;
} else
SetPageError(page);
}
BUG_ON(PageWriteback(page));
set_page_writeback(page); /* Keeps try_to_free_buffers() away. */
/* Submit the prepared buffers for i/o. */
need_end_writeback = true;
do {
/* Fetch the next buffer before submitting the current one. */
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
submit_bh(WRITE, bh);
need_end_writeback = false;
}
bh = next;
} while (bh != head);
unlock_page(page);
/* If no i/o was started, need to end_page_writeback(). */
if (unlikely(need_end_writeback))
end_page_writeback(page);
ntfs_debug("Done.");
return err;
}
/**
* ntfs_write_mst_block - write a @page to the backing store
* @page: page cache page to write out
* @wbc: writeback control structure
*
* This function is for writing pages belonging to non-resident, mst protected
* attributes to their backing store. The only supported attributes are index
* allocation and $MFT/$DATA. Both directory inodes and index inodes are
* supported for the index allocation case.
*
* The page must remain locked for the duration of the write because we apply
* the mst fixups, write, and then undo the fixups, so if we were to unlock the
* page before undoing the fixups, any other user of the page will see the
* page contents as corrupt.
*
* We clear the page uptodate flag for the duration of the function to ensure
* exclusion for the $MFT/$DATA case against someone mapping an mft record we
* are about to apply the mst fixups to.
*
* Return 0 on success and -errno on error.
*
* Based on ntfs_write_block(), ntfs_mft_writepage(), and
* write_mft_record_nolock().
*/
static int ntfs_write_mst_block(struct page *page,
struct writeback_control *wbc)
{
sector_t block, dblock, rec_block;
struct inode *vi = page->mapping->host;
ntfs_inode *ni = NTFS_I(vi);
ntfs_volume *vol = ni->vol;
u8 *kaddr;
unsigned int rec_size = ni->itype.index.block_size;
ntfs_inode *locked_nis[PAGE_CACHE_SIZE / rec_size];
struct buffer_head *bh, *head, *tbh, *rec_start_bh;
struct buffer_head *bhs[MAX_BUF_PER_PAGE];
runlist_element *rl;
int i, nr_locked_nis, nr_recs, nr_bhs, max_bhs, bhs_per_rec, err, err2;
unsigned bh_size, rec_size_bits;
bool sync, is_mft, page_is_dirty, rec_is_dirty;
unsigned char bh_size_bits;
ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
"0x%lx.", vi->i_ino, ni->type, page->index);
BUG_ON(!NInoNonResident(ni));
BUG_ON(!NInoMstProtected(ni));
is_mft = (S_ISREG(vi->i_mode) && !vi->i_ino);
/*
* NOTE: ntfs_write_mst_block() would be called for $MFTMirr if a page
* in its page cache were to be marked dirty. However this should
* never happen with the current driver and considering we do not
* handle this case here we do want to BUG(), at least for now.
*/
BUG_ON(!(is_mft || S_ISDIR(vi->i_mode) ||
(NInoAttr(ni) && ni->type == AT_INDEX_ALLOCATION)));
bh_size = vol->sb->s_blocksize;
bh_size_bits = vol->sb->s_blocksize_bits;
max_bhs = PAGE_CACHE_SIZE / bh_size;
BUG_ON(!max_bhs);
BUG_ON(max_bhs > MAX_BUF_PER_PAGE);
/* Were we called for sync purposes? */
sync = (wbc->sync_mode == WB_SYNC_ALL);
/* Make sure we have mapped buffers. */
bh = head = page_buffers(page);
BUG_ON(!bh);
rec_size_bits = ni->itype.index.block_size_bits;
BUG_ON(!(PAGE_CACHE_SIZE >> rec_size_bits));
bhs_per_rec = rec_size >> bh_size_bits;
BUG_ON(!bhs_per_rec);
/* The first block in the page. */
rec_block = block = (sector_t)page->index <<
(PAGE_CACHE_SHIFT - bh_size_bits);
/* The first out of bounds block for the data size. */
dblock = (i_size_read(vi) + bh_size - 1) >> bh_size_bits;
rl = NULL;
err = err2 = nr_bhs = nr_recs = nr_locked_nis = 0;
page_is_dirty = rec_is_dirty = false;
rec_start_bh = NULL;
do {
bool is_retry = false;
if (likely(block < rec_block)) {
if (unlikely(block >= dblock)) {
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
continue;
}
/*
* This block is not the first one in the record. We
* ignore the buffer's dirty state because we could
* have raced with a parallel mark_ntfs_record_dirty().
*/
if (!rec_is_dirty)
continue;
if (unlikely(err2)) {
if (err2 != -ENOMEM)
clear_buffer_dirty(bh);
continue;
}