forked from torvalds/linux
-
Notifications
You must be signed in to change notification settings - Fork 0
/
mft.c
2917 lines (2836 loc) · 99.5 KB
/
mft.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/**
* mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project.
*
* Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc.
* Copyright (c) 2002 Richard Russon
*
* This program/include file is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as published
* by the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program/include file is distributed in the hope that it will be
* useful, but WITHOUT ANY WARRANTY; without even the implied warranty
* of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program (in the main directory of the Linux-NTFS
* distribution in the file COPYING); if not, write to the Free Software
* Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/buffer_head.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include "attrib.h"
#include "aops.h"
#include "bitmap.h"
#include "debug.h"
#include "dir.h"
#include "lcnalloc.h"
#include "malloc.h"
#include "mft.h"
#include "ntfs.h"
/**
* map_mft_record_page - map the page in which a specific mft record resides
* @ni: ntfs inode whose mft record page to map
*
* This maps the page in which the mft record of the ntfs inode @ni is situated
* and returns a pointer to the mft record within the mapped page.
*
* Return value needs to be checked with IS_ERR() and if that is true PTR_ERR()
* contains the negative error code returned.
*/
static inline MFT_RECORD *map_mft_record_page(ntfs_inode *ni)
{
loff_t i_size;
ntfs_volume *vol = ni->vol;
struct inode *mft_vi = vol->mft_ino;
struct page *page;
unsigned long index, end_index;
unsigned ofs;
BUG_ON(ni->page);
/*
* The index into the page cache and the offset within the page cache
* page of the wanted mft record. FIXME: We need to check for
* overflowing the unsigned long, but I don't think we would ever get
* here if the volume was that big...
*/
index = (u64)ni->mft_no << vol->mft_record_size_bits >>
PAGE_CACHE_SHIFT;
ofs = (ni->mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK;
i_size = i_size_read(mft_vi);
/* The maximum valid index into the page cache for $MFT's data. */
end_index = i_size >> PAGE_CACHE_SHIFT;
/* If the wanted index is out of bounds the mft record doesn't exist. */
if (unlikely(index >= end_index)) {
if (index > end_index || (i_size & ~PAGE_CACHE_MASK) < ofs +
vol->mft_record_size) {
page = ERR_PTR(-ENOENT);
ntfs_error(vol->sb, "Attempt to read mft record 0x%lx, "
"which is beyond the end of the mft. "
"This is probably a bug in the ntfs "
"driver.", ni->mft_no);
goto err_out;
}
}
/* Read, map, and pin the page. */
page = ntfs_map_page(mft_vi->i_mapping, index);
if (likely(!IS_ERR(page))) {
/* Catch multi sector transfer fixup errors. */
if (likely(ntfs_is_mft_recordp((le32*)(page_address(page) +
ofs)))) {
ni->page = page;
ni->page_ofs = ofs;
return page_address(page) + ofs;
}
ntfs_error(vol->sb, "Mft record 0x%lx is corrupt. "
"Run chkdsk.", ni->mft_no);
ntfs_unmap_page(page);
page = ERR_PTR(-EIO);
NVolSetErrors(vol);
}
err_out:
ni->page = NULL;
ni->page_ofs = 0;
return (void*)page;
}
/**
* map_mft_record - map, pin and lock an mft record
* @ni: ntfs inode whose MFT record to map
*
* First, take the mrec_lock mutex. We might now be sleeping, while waiting
* for the mutex if it was already locked by someone else.
*
* The page of the record is mapped using map_mft_record_page() before being
* returned to the caller.
*
* This in turn uses ntfs_map_page() to get the page containing the wanted mft
* record (it in turn calls read_cache_page() which reads it in from disk if
* necessary, increments the use count on the page so that it cannot disappear
* under us and returns a reference to the page cache page).
*
* If read_cache_page() invokes ntfs_readpage() to load the page from disk, it
* sets PG_locked and clears PG_uptodate on the page. Once I/O has completed
* and the post-read mst fixups on each mft record in the page have been
* performed, the page gets PG_uptodate set and PG_locked cleared (this is done
* in our asynchronous I/O completion handler end_buffer_read_mft_async()).
* ntfs_map_page() waits for PG_locked to become clear and checks if
* PG_uptodate is set and returns an error code if not. This provides
* sufficient protection against races when reading/using the page.
*
* However there is the write mapping to think about. Doing the above described
* checking here will be fine, because when initiating the write we will set
* PG_locked and clear PG_uptodate making sure nobody is touching the page
* contents. Doing the locking this way means that the commit to disk code in
* the page cache code paths is automatically sufficiently locked with us as
* we will not touch a page that has been locked or is not uptodate. The only
* locking problem then is them locking the page while we are accessing it.
*
* So that code will end up having to own the mrec_lock of all mft
* records/inodes present in the page before I/O can proceed. In that case we
* wouldn't need to bother with PG_locked and PG_uptodate as nobody will be
* accessing anything without owning the mrec_lock mutex. But we do need to
* use them because of the read_cache_page() invocation and the code becomes so
* much simpler this way that it is well worth it.
*
* The mft record is now ours and we return a pointer to it. You need to check
* the returned pointer with IS_ERR() and if that is true, PTR_ERR() will return
* the error code.
*
* NOTE: Caller is responsible for setting the mft record dirty before calling
* unmap_mft_record(). This is obviously only necessary if the caller really
* modified the mft record...
* Q: Do we want to recycle one of the VFS inode state bits instead?
* A: No, the inode ones mean we want to change the mft record, not we want to
* write it out.
*/
MFT_RECORD *map_mft_record(ntfs_inode *ni)
{
MFT_RECORD *m;
ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no);
/* Make sure the ntfs inode doesn't go away. */
atomic_inc(&ni->count);
/* Serialize access to this mft record. */
mutex_lock(&ni->mrec_lock);
m = map_mft_record_page(ni);
if (likely(!IS_ERR(m)))
return m;
mutex_unlock(&ni->mrec_lock);
atomic_dec(&ni->count);
ntfs_error(ni->vol->sb, "Failed with error code %lu.", -PTR_ERR(m));
return m;
}
/**
* unmap_mft_record_page - unmap the page in which a specific mft record resides
* @ni: ntfs inode whose mft record page to unmap
*
* This unmaps the page in which the mft record of the ntfs inode @ni is
* situated and returns. This is a NOOP if highmem is not configured.
*
* The unmap happens via ntfs_unmap_page() which in turn decrements the use
* count on the page thus releasing it from the pinned state.
*
* We do not actually unmap the page from memory of course, as that will be
* done by the page cache code itself when memory pressure increases or
* whatever.
*/
static inline void unmap_mft_record_page(ntfs_inode *ni)
{
BUG_ON(!ni->page);
// TODO: If dirty, blah...
ntfs_unmap_page(ni->page);
ni->page = NULL;
ni->page_ofs = 0;
return;
}
/**
* unmap_mft_record - release a mapped mft record
* @ni: ntfs inode whose MFT record to unmap
*
* We release the page mapping and the mrec_lock mutex which unmaps the mft
* record and releases it for others to get hold of. We also release the ntfs
* inode by decrementing the ntfs inode reference count.
*
* NOTE: If caller has modified the mft record, it is imperative to set the mft
* record dirty BEFORE calling unmap_mft_record().
*/
void unmap_mft_record(ntfs_inode *ni)
{
struct page *page = ni->page;
BUG_ON(!page);
ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no);
unmap_mft_record_page(ni);
mutex_unlock(&ni->mrec_lock);
atomic_dec(&ni->count);
/*
* If pure ntfs_inode, i.e. no vfs inode attached, we leave it to
* ntfs_clear_extent_inode() in the extent inode case, and to the
* caller in the non-extent, yet pure ntfs inode case, to do the actual
* tear down of all structures and freeing of all allocated memory.
*/
return;
}
/**
* map_extent_mft_record - load an extent inode and attach it to its base
* @base_ni: base ntfs inode
* @mref: mft reference of the extent inode to load
* @ntfs_ino: on successful return, pointer to the ntfs_inode structure
*
* Load the extent mft record @mref and attach it to its base inode @base_ni.
* Return the mapped extent mft record if IS_ERR(result) is false. Otherwise
* PTR_ERR(result) gives the negative error code.
*
* On successful return, @ntfs_ino contains a pointer to the ntfs_inode
* structure of the mapped extent inode.
*/
MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref,
ntfs_inode **ntfs_ino)
{
MFT_RECORD *m;
ntfs_inode *ni = NULL;
ntfs_inode **extent_nis = NULL;
int i;
unsigned long mft_no = MREF(mref);
u16 seq_no = MSEQNO(mref);
bool destroy_ni = false;
ntfs_debug("Mapping extent mft record 0x%lx (base mft record 0x%lx).",
mft_no, base_ni->mft_no);
/* Make sure the base ntfs inode doesn't go away. */
atomic_inc(&base_ni->count);
/*
* Check if this extent inode has already been added to the base inode,
* in which case just return it. If not found, add it to the base
* inode before returning it.
*/
mutex_lock(&base_ni->extent_lock);
if (base_ni->nr_extents > 0) {
extent_nis = base_ni->ext.extent_ntfs_inos;
for (i = 0; i < base_ni->nr_extents; i++) {
if (mft_no != extent_nis[i]->mft_no)
continue;
ni = extent_nis[i];
/* Make sure the ntfs inode doesn't go away. */
atomic_inc(&ni->count);
break;
}
}
if (likely(ni != NULL)) {
mutex_unlock(&base_ni->extent_lock);
atomic_dec(&base_ni->count);
/* We found the record; just have to map and return it. */
m = map_mft_record(ni);
/* map_mft_record() has incremented this on success. */
atomic_dec(&ni->count);
if (likely(!IS_ERR(m))) {
/* Verify the sequence number. */
if (likely(le16_to_cpu(m->sequence_number) == seq_no)) {
ntfs_debug("Done 1.");
*ntfs_ino = ni;
return m;
}
unmap_mft_record(ni);
ntfs_error(base_ni->vol->sb, "Found stale extent mft "
"reference! Corrupt filesystem. "
"Run chkdsk.");
return ERR_PTR(-EIO);
}
map_err_out:
ntfs_error(base_ni->vol->sb, "Failed to map extent "
"mft record, error code %ld.", -PTR_ERR(m));
return m;
}
/* Record wasn't there. Get a new ntfs inode and initialize it. */
ni = ntfs_new_extent_inode(base_ni->vol->sb, mft_no);
if (unlikely(!ni)) {
mutex_unlock(&base_ni->extent_lock);
atomic_dec(&base_ni->count);
return ERR_PTR(-ENOMEM);
}
ni->vol = base_ni->vol;
ni->seq_no = seq_no;
ni->nr_extents = -1;
ni->ext.base_ntfs_ino = base_ni;
/* Now map the record. */
m = map_mft_record(ni);
if (IS_ERR(m)) {
mutex_unlock(&base_ni->extent_lock);
atomic_dec(&base_ni->count);
ntfs_clear_extent_inode(ni);
goto map_err_out;
}
/* Verify the sequence number if it is present. */
if (seq_no && (le16_to_cpu(m->sequence_number) != seq_no)) {
ntfs_error(base_ni->vol->sb, "Found stale extent mft "
"reference! Corrupt filesystem. Run chkdsk.");
destroy_ni = true;
m = ERR_PTR(-EIO);
goto unm_err_out;
}
/* Attach extent inode to base inode, reallocating memory if needed. */
if (!(base_ni->nr_extents & 3)) {
ntfs_inode **tmp;
int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode *);
tmp = kmalloc(new_size, GFP_NOFS);
if (unlikely(!tmp)) {
ntfs_error(base_ni->vol->sb, "Failed to allocate "
"internal buffer.");
destroy_ni = true;
m = ERR_PTR(-ENOMEM);
goto unm_err_out;
}
if (base_ni->nr_extents) {
BUG_ON(!base_ni->ext.extent_ntfs_inos);
memcpy(tmp, base_ni->ext.extent_ntfs_inos, new_size -
4 * sizeof(ntfs_inode *));
kfree(base_ni->ext.extent_ntfs_inos);
}
base_ni->ext.extent_ntfs_inos = tmp;
}
base_ni->ext.extent_ntfs_inos[base_ni->nr_extents++] = ni;
mutex_unlock(&base_ni->extent_lock);
atomic_dec(&base_ni->count);
ntfs_debug("Done 2.");
*ntfs_ino = ni;
return m;
unm_err_out:
unmap_mft_record(ni);
mutex_unlock(&base_ni->extent_lock);
atomic_dec(&base_ni->count);
/*
* If the extent inode was not attached to the base inode we need to
* release it or we will leak memory.
*/
if (destroy_ni)
ntfs_clear_extent_inode(ni);
return m;
}
#ifdef NTFS_RW
/**
* __mark_mft_record_dirty - set the mft record and the page containing it dirty
* @ni: ntfs inode describing the mapped mft record
*
* Internal function. Users should call mark_mft_record_dirty() instead.
*
* Set the mapped (extent) mft record of the (base or extent) ntfs inode @ni,
* as well as the page containing the mft record, dirty. Also, mark the base
* vfs inode dirty. This ensures that any changes to the mft record are
* written out to disk.
*
* NOTE: We only set I_DIRTY_SYNC and I_DIRTY_DATASYNC (and not I_DIRTY_PAGES)
* on the base vfs inode, because even though file data may have been modified,
* it is dirty in the inode meta data rather than the data page cache of the
* inode, and thus there are no data pages that need writing out. Therefore, a
* full mark_inode_dirty() is overkill. A mark_inode_dirty_sync(), on the
* other hand, is not sufficient, because ->write_inode needs to be called even
* in case of fdatasync. This needs to happen or the file data would not
* necessarily hit the device synchronously, even though the vfs inode has the
* O_SYNC flag set. Also, I_DIRTY_DATASYNC simply "feels" better than just
* I_DIRTY_SYNC, since the file data has not actually hit the block device yet,
* which is not what I_DIRTY_SYNC on its own would suggest.
*/
void __mark_mft_record_dirty(ntfs_inode *ni)
{
ntfs_inode *base_ni;
ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
BUG_ON(NInoAttr(ni));
mark_ntfs_record_dirty(ni->page, ni->page_ofs);
/* Determine the base vfs inode and mark it dirty, too. */
mutex_lock(&ni->extent_lock);
if (likely(ni->nr_extents >= 0))
base_ni = ni;
else
base_ni = ni->ext.base_ntfs_ino;
mutex_unlock(&ni->extent_lock);
__mark_inode_dirty(VFS_I(base_ni), I_DIRTY_SYNC | I_DIRTY_DATASYNC);
}
static const char *ntfs_please_email = "Please email "
"[email protected] and say that you saw "
"this message. Thank you.";
/**
* ntfs_sync_mft_mirror_umount - synchronise an mft record to the mft mirror
* @vol: ntfs volume on which the mft record to synchronize resides
* @mft_no: mft record number of mft record to synchronize
* @m: mapped, mst protected (extent) mft record to synchronize
*
* Write the mapped, mst protected (extent) mft record @m with mft record
* number @mft_no to the mft mirror ($MFTMirr) of the ntfs volume @vol,
* bypassing the page cache and the $MFTMirr inode itself.
*
* This function is only for use at umount time when the mft mirror inode has
* already been disposed off. We BUG() if we are called while the mft mirror
* inode is still attached to the volume.
*
* On success return 0. On error return -errno.
*
* NOTE: This function is not implemented yet as I am not convinced it can
* actually be triggered considering the sequence of commits we do in super.c::
* ntfs_put_super(). But just in case we provide this place holder as the
* alternative would be either to BUG() or to get a NULL pointer dereference
* and Oops.
*/
static int ntfs_sync_mft_mirror_umount(ntfs_volume *vol,
const unsigned long mft_no, MFT_RECORD *m)
{
BUG_ON(vol->mftmirr_ino);
ntfs_error(vol->sb, "Umount time mft mirror syncing is not "
"implemented yet. %s", ntfs_please_email);
return -EOPNOTSUPP;
}
/**
* ntfs_sync_mft_mirror - synchronize an mft record to the mft mirror
* @vol: ntfs volume on which the mft record to synchronize resides
* @mft_no: mft record number of mft record to synchronize
* @m: mapped, mst protected (extent) mft record to synchronize
* @sync: if true, wait for i/o completion
*
* Write the mapped, mst protected (extent) mft record @m with mft record
* number @mft_no to the mft mirror ($MFTMirr) of the ntfs volume @vol.
*
* On success return 0. On error return -errno and set the volume errors flag
* in the ntfs volume @vol.
*
* NOTE: We always perform synchronous i/o and ignore the @sync parameter.
*
* TODO: If @sync is false, want to do truly asynchronous i/o, i.e. just
* schedule i/o via ->writepage or do it via kntfsd or whatever.
*/
int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
MFT_RECORD *m, int sync)
{
struct page *page;
unsigned int blocksize = vol->sb->s_blocksize;
int max_bhs = vol->mft_record_size / blocksize;
struct buffer_head *bhs[max_bhs];
struct buffer_head *bh, *head;
u8 *kmirr;
runlist_element *rl;
unsigned int block_start, block_end, m_start, m_end, page_ofs;
int i_bhs, nr_bhs, err = 0;
unsigned char blocksize_bits = vol->sb->s_blocksize_bits;
ntfs_debug("Entering for inode 0x%lx.", mft_no);
BUG_ON(!max_bhs);
if (unlikely(!vol->mftmirr_ino)) {
/* This could happen during umount... */
err = ntfs_sync_mft_mirror_umount(vol, mft_no, m);
if (likely(!err))
return err;
goto err_out;
}
/* Get the page containing the mirror copy of the mft record @m. */
page = ntfs_map_page(vol->mftmirr_ino->i_mapping, mft_no >>
(PAGE_CACHE_SHIFT - vol->mft_record_size_bits));
if (IS_ERR(page)) {
ntfs_error(vol->sb, "Failed to map mft mirror page.");
err = PTR_ERR(page);
goto err_out;
}
lock_page(page);
BUG_ON(!PageUptodate(page));
ClearPageUptodate(page);
/* Offset of the mft mirror record inside the page. */
page_ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK;
/* The address in the page of the mirror copy of the mft record @m. */
kmirr = page_address(page) + page_ofs;
/* Copy the mst protected mft record to the mirror. */
memcpy(kmirr, m, vol->mft_record_size);
/* Create uptodate buffers if not present. */
if (unlikely(!page_has_buffers(page))) {
struct buffer_head *tail;
bh = head = alloc_page_buffers(page, blocksize, 1);
do {
set_buffer_uptodate(bh);
tail = bh;
bh = bh->b_this_page;
} while (bh);
tail->b_this_page = head;
attach_page_buffers(page, head);
}
bh = head = page_buffers(page);
BUG_ON(!bh);
rl = NULL;
nr_bhs = 0;
block_start = 0;
m_start = kmirr - (u8*)page_address(page);
m_end = m_start + vol->mft_record_size;
do {
block_end = block_start + blocksize;
/* If the buffer is outside the mft record, skip it. */
if (block_end <= m_start)
continue;
if (unlikely(block_start >= m_end))
break;
/* Need to map the buffer if it is not mapped already. */
if (unlikely(!buffer_mapped(bh))) {
VCN vcn;
LCN lcn;
unsigned int vcn_ofs;
bh->b_bdev = vol->sb->s_bdev;
/* Obtain the vcn and offset of the current block. */
vcn = ((VCN)mft_no << vol->mft_record_size_bits) +
(block_start - m_start);
vcn_ofs = vcn & vol->cluster_size_mask;
vcn >>= vol->cluster_size_bits;
if (!rl) {
down_read(&NTFS_I(vol->mftmirr_ino)->
runlist.lock);
rl = NTFS_I(vol->mftmirr_ino)->runlist.rl;
/*
* $MFTMirr always has the whole of its runlist
* in memory.
*/
BUG_ON(!rl);
}
/* Seek to element containing target vcn. */
while (rl->length && rl[1].vcn <= vcn)
rl++;
lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
/* For $MFTMirr, only lcn >= 0 is a successful remap. */
if (likely(lcn >= 0)) {
/* Setup buffer head to correct block. */
bh->b_blocknr = ((lcn <<
vol->cluster_size_bits) +
vcn_ofs) >> blocksize_bits;
set_buffer_mapped(bh);
} else {
bh->b_blocknr = -1;
ntfs_error(vol->sb, "Cannot write mft mirror "
"record 0x%lx because its "
"location on disk could not "
"be determined (error code "
"%lli).", mft_no,
(long long)lcn);
err = -EIO;
}
}
BUG_ON(!buffer_uptodate(bh));
BUG_ON(!nr_bhs && (m_start != block_start));
BUG_ON(nr_bhs >= max_bhs);
bhs[nr_bhs++] = bh;
BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end));
} while (block_start = block_end, (bh = bh->b_this_page) != head);
if (unlikely(rl))
up_read(&NTFS_I(vol->mftmirr_ino)->runlist.lock);
if (likely(!err)) {
/* Lock buffers and start synchronous write i/o on them. */
for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
struct buffer_head *tbh = bhs[i_bhs];
if (!trylock_buffer(tbh))
BUG();
BUG_ON(!buffer_uptodate(tbh));
clear_buffer_dirty(tbh);
get_bh(tbh);
tbh->b_end_io = end_buffer_write_sync;
submit_bh(WRITE, tbh);
}
/* Wait on i/o completion of buffers. */
for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
struct buffer_head *tbh = bhs[i_bhs];
wait_on_buffer(tbh);
if (unlikely(!buffer_uptodate(tbh))) {
err = -EIO;
/*
* Set the buffer uptodate so the page and
* buffer states do not become out of sync.
*/
set_buffer_uptodate(tbh);
}
}
} else /* if (unlikely(err)) */ {
/* Clean the buffers. */
for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++)
clear_buffer_dirty(bhs[i_bhs]);
}
/* Current state: all buffers are clean, unlocked, and uptodate. */
/* Remove the mst protection fixups again. */
post_write_mst_fixup((NTFS_RECORD*)kmirr);
flush_dcache_page(page);
SetPageUptodate(page);
unlock_page(page);
ntfs_unmap_page(page);
if (likely(!err)) {
ntfs_debug("Done.");
} else {
ntfs_error(vol->sb, "I/O error while writing mft mirror "
"record 0x%lx!", mft_no);
err_out:
ntfs_error(vol->sb, "Failed to synchronize $MFTMirr (error "
"code %i). Volume will be left marked dirty "
"on umount. Run ntfsfix on the partition "
"after umounting to correct this.", -err);
NVolSetErrors(vol);
}
return err;
}
/**
* write_mft_record_nolock - write out a mapped (extent) mft record
* @ni: ntfs inode describing the mapped (extent) mft record
* @m: mapped (extent) mft record to write
* @sync: if true, wait for i/o completion
*
* Write the mapped (extent) mft record @m described by the (regular or extent)
* ntfs inode @ni to backing store. If the mft record @m has a counterpart in
* the mft mirror, that is also updated.
*
* We only write the mft record if the ntfs inode @ni is dirty and the first
* buffer belonging to its mft record is dirty, too. We ignore the dirty state
* of subsequent buffers because we could have raced with
* fs/ntfs/aops.c::mark_ntfs_record_dirty().
*
* On success, clean the mft record and return 0. On error, leave the mft
* record dirty and return -errno.
*
* NOTE: We always perform synchronous i/o and ignore the @sync parameter.
* However, if the mft record has a counterpart in the mft mirror and @sync is
* true, we write the mft record, wait for i/o completion, and only then write
* the mft mirror copy. This ensures that if the system crashes either the mft
* or the mft mirror will contain a self-consistent mft record @m. If @sync is
* false on the other hand, we start i/o on both and then wait for completion
* on them. This provides a speedup but no longer guarantees that you will end
* up with a self-consistent mft record in the case of a crash but if you asked
* for asynchronous writing you probably do not care about that anyway.
*
* TODO: If @sync is false, want to do truly asynchronous i/o, i.e. just
* schedule i/o via ->writepage or do it via kntfsd or whatever.
*/
int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync)
{
ntfs_volume *vol = ni->vol;
struct page *page = ni->page;
unsigned int blocksize = vol->sb->s_blocksize;
unsigned char blocksize_bits = vol->sb->s_blocksize_bits;
int max_bhs = vol->mft_record_size / blocksize;
struct buffer_head *bhs[max_bhs];
struct buffer_head *bh, *head;
runlist_element *rl;
unsigned int block_start, block_end, m_start, m_end;
int i_bhs, nr_bhs, err = 0;
ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
BUG_ON(NInoAttr(ni));
BUG_ON(!max_bhs);
BUG_ON(!PageLocked(page));
/*
* If the ntfs_inode is clean no need to do anything. If it is dirty,
* mark it as clean now so that it can be redirtied later on if needed.
* There is no danger of races since the caller is holding the locks
* for the mft record @m and the page it is in.
*/
if (!NInoTestClearDirty(ni))
goto done;
bh = head = page_buffers(page);
BUG_ON(!bh);
rl = NULL;
nr_bhs = 0;
block_start = 0;
m_start = ni->page_ofs;
m_end = m_start + vol->mft_record_size;
do {
block_end = block_start + blocksize;
/* If the buffer is outside the mft record, skip it. */
if (block_end <= m_start)
continue;
if (unlikely(block_start >= m_end))
break;
/*
* If this block is not the first one in the record, we ignore
* the buffer's dirty state because we could have raced with a
* parallel mark_ntfs_record_dirty().
*/
if (block_start == m_start) {
/* This block is the first one in the record. */
if (!buffer_dirty(bh)) {
BUG_ON(nr_bhs);
/* Clean records are not written out. */
break;
}
}
/* Need to map the buffer if it is not mapped already. */
if (unlikely(!buffer_mapped(bh))) {
VCN vcn;
LCN lcn;
unsigned int vcn_ofs;
bh->b_bdev = vol->sb->s_bdev;
/* Obtain the vcn and offset of the current block. */
vcn = ((VCN)ni->mft_no << vol->mft_record_size_bits) +
(block_start - m_start);
vcn_ofs = vcn & vol->cluster_size_mask;
vcn >>= vol->cluster_size_bits;
if (!rl) {
down_read(&NTFS_I(vol->mft_ino)->runlist.lock);
rl = NTFS_I(vol->mft_ino)->runlist.rl;
BUG_ON(!rl);
}
/* Seek to element containing target vcn. */
while (rl->length && rl[1].vcn <= vcn)
rl++;
lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
/* For $MFT, only lcn >= 0 is a successful remap. */
if (likely(lcn >= 0)) {
/* Setup buffer head to correct block. */
bh->b_blocknr = ((lcn <<
vol->cluster_size_bits) +
vcn_ofs) >> blocksize_bits;
set_buffer_mapped(bh);
} else {
bh->b_blocknr = -1;
ntfs_error(vol->sb, "Cannot write mft record "
"0x%lx because its location "
"on disk could not be "
"determined (error code %lli).",
ni->mft_no, (long long)lcn);
err = -EIO;
}
}
BUG_ON(!buffer_uptodate(bh));
BUG_ON(!nr_bhs && (m_start != block_start));
BUG_ON(nr_bhs >= max_bhs);
bhs[nr_bhs++] = bh;
BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end));
} while (block_start = block_end, (bh = bh->b_this_page) != head);
if (unlikely(rl))
up_read(&NTFS_I(vol->mft_ino)->runlist.lock);
if (!nr_bhs)
goto done;
if (unlikely(err))
goto cleanup_out;
/* Apply the mst protection fixups. */
err = pre_write_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size);
if (err) {
ntfs_error(vol->sb, "Failed to apply mst fixups!");
goto cleanup_out;
}
flush_dcache_mft_record_page(ni);
/* Lock buffers and start synchronous write i/o on them. */
for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
struct buffer_head *tbh = bhs[i_bhs];
if (!trylock_buffer(tbh))
BUG();
BUG_ON(!buffer_uptodate(tbh));
clear_buffer_dirty(tbh);
get_bh(tbh);
tbh->b_end_io = end_buffer_write_sync;
submit_bh(WRITE, tbh);
}
/* Synchronize the mft mirror now if not @sync. */
if (!sync && ni->mft_no < vol->mftmirr_size)
ntfs_sync_mft_mirror(vol, ni->mft_no, m, sync);
/* Wait on i/o completion of buffers. */
for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
struct buffer_head *tbh = bhs[i_bhs];
wait_on_buffer(tbh);
if (unlikely(!buffer_uptodate(tbh))) {
err = -EIO;
/*
* Set the buffer uptodate so the page and buffer
* states do not become out of sync.
*/
if (PageUptodate(page))
set_buffer_uptodate(tbh);
}
}
/* If @sync, now synchronize the mft mirror. */
if (sync && ni->mft_no < vol->mftmirr_size)
ntfs_sync_mft_mirror(vol, ni->mft_no, m, sync);
/* Remove the mst protection fixups again. */
post_write_mst_fixup((NTFS_RECORD*)m);
flush_dcache_mft_record_page(ni);
if (unlikely(err)) {
/* I/O error during writing. This is really bad! */
ntfs_error(vol->sb, "I/O error while writing mft record "
"0x%lx! Marking base inode as bad. You "
"should unmount the volume and run chkdsk.",
ni->mft_no);
goto err_out;
}
done:
ntfs_debug("Done.");
return 0;
cleanup_out:
/* Clean the buffers. */
for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++)
clear_buffer_dirty(bhs[i_bhs]);
err_out:
/*
* Current state: all buffers are clean, unlocked, and uptodate.
* The caller should mark the base inode as bad so that no more i/o
* happens. ->clear_inode() will still be invoked so all extent inodes
* and other allocated memory will be freed.
*/
if (err == -ENOMEM) {
ntfs_error(vol->sb, "Not enough memory to write mft record. "
"Redirtying so the write is retried later.");
mark_mft_record_dirty(ni);
err = 0;
} else
NVolSetErrors(vol);
return err;
}
/**
* ntfs_may_write_mft_record - check if an mft record may be written out
* @vol: [IN] ntfs volume on which the mft record to check resides
* @mft_no: [IN] mft record number of the mft record to check
* @m: [IN] mapped mft record to check
* @locked_ni: [OUT] caller has to unlock this ntfs inode if one is returned
*
* Check if the mapped (base or extent) mft record @m with mft record number
* @mft_no belonging to the ntfs volume @vol may be written out. If necessary
* and possible the ntfs inode of the mft record is locked and the base vfs
* inode is pinned. The locked ntfs inode is then returned in @locked_ni. The
* caller is responsible for unlocking the ntfs inode and unpinning the base
* vfs inode.
*
* Return 'true' if the mft record may be written out and 'false' if not.
*
* The caller has locked the page and cleared the uptodate flag on it which
* means that we can safely write out any dirty mft records that do not have
* their inodes in icache as determined by ilookup5() as anyone
* opening/creating such an inode would block when attempting to map the mft
* record in read_cache_page() until we are finished with the write out.
*
* Here is a description of the tests we perform:
*
* If the inode is found in icache we know the mft record must be a base mft
* record. If it is dirty, we do not write it and return 'false' as the vfs
* inode write paths will result in the access times being updated which would
* cause the base mft record to be redirtied and written out again. (We know
* the access time update will modify the base mft record because Windows
* chkdsk complains if the standard information attribute is not in the base
* mft record.)
*
* If the inode is in icache and not dirty, we attempt to lock the mft record
* and if we find the lock was already taken, it is not safe to write the mft
* record and we return 'false'.
*
* If we manage to obtain the lock we have exclusive access to the mft record,
* which also allows us safe writeout of the mft record. We then set
* @locked_ni to the locked ntfs inode and return 'true'.
*
* Note we cannot just lock the mft record and sleep while waiting for the lock
* because this would deadlock due to lock reversal (normally the mft record is
* locked before the page is locked but we already have the page locked here
* when we try to lock the mft record).
*
* If the inode is not in icache we need to perform further checks.
*
* If the mft record is not a FILE record or it is a base mft record, we can
* safely write it and return 'true'.
*
* We now know the mft record is an extent mft record. We check if the inode
* corresponding to its base mft record is in icache and obtain a reference to
* it if it is. If it is not, we can safely write it and return 'true'.
*
* We now have the base inode for the extent mft record. We check if it has an
* ntfs inode for the extent mft record attached and if not it is safe to write
* the extent mft record and we return 'true'.
*
* The ntfs inode for the extent mft record is attached to the base inode so we
* attempt to lock the extent mft record and if we find the lock was already
* taken, it is not safe to write the extent mft record and we return 'false'.
*
* If we manage to obtain the lock we have exclusive access to the extent mft
* record, which also allows us safe writeout of the extent mft record. We
* set the ntfs inode of the extent mft record clean and then set @locked_ni to
* the now locked ntfs inode and return 'true'.
*
* Note, the reason for actually writing dirty mft records here and not just
* relying on the vfs inode dirty code paths is that we can have mft records
* modified without them ever having actual inodes in memory. Also we can have
* dirty mft records with clean ntfs inodes in memory. None of the described
* cases would result in the dirty mft records being written out if we only
* relied on the vfs inode dirty code paths. And these cases can really occur
* during allocation of new mft records and in particular when the
* initialized_size of the $MFT/$DATA attribute is extended and the new space
* is initialized using ntfs_mft_record_format(). The clean inode can then
* appear if the mft record is reused for a new inode before it got written
* out.
*/
bool ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
const MFT_RECORD *m, ntfs_inode **locked_ni)
{
struct super_block *sb = vol->sb;
struct inode *mft_vi = vol->mft_ino;
struct inode *vi;
ntfs_inode *ni, *eni, **extent_nis;
int i;
ntfs_attr na;
ntfs_debug("Entering for inode 0x%lx.", mft_no);
/*
* Normally we do not return a locked inode so set @locked_ni to NULL.
*/
BUG_ON(!locked_ni);
*locked_ni = NULL;
/*
* Check if the inode corresponding to this mft record is in the VFS
* inode cache and obtain a reference to it if it is.
*/
ntfs_debug("Looking for inode 0x%lx in icache.", mft_no);
na.mft_no = mft_no;
na.name = NULL;
na.name_len = 0;
na.type = AT_UNUSED;
/*
* Optimize inode 0, i.e. $MFT itself, since we have it in memory and
* we get here for it rather often.
*/
if (!mft_no) {
/* Balance the below iput(). */
vi = igrab(mft_vi);
BUG_ON(vi != mft_vi);
} else {
/*
* Have to use ilookup5_nowait() since ilookup5() waits for the
* inode lock which causes ntfs to deadlock when a concurrent
* inode write via the inode dirty code paths and the page
* dirty code path of the inode dirty code path when writing
* $MFT occurs.
*/
vi = ilookup5_nowait(sb, mft_no, (test_t)ntfs_test_inode, &na);
}
if (vi) {
ntfs_debug("Base inode 0x%lx is in icache.", mft_no);
/* The inode is in icache. */
ni = NTFS_I(vi);
/* Take a reference to the ntfs inode. */
atomic_inc(&ni->count);
/* If the inode is dirty, do not write this record. */
if (NInoDirty(ni)) {
ntfs_debug("Inode 0x%lx is dirty, do not write it.",
mft_no);
atomic_dec(&ni->count);
iput(vi);
return false;
}
ntfs_debug("Inode 0x%lx is not dirty.", mft_no);
/* The inode is not dirty, try to take the mft record lock. */
if (unlikely(!mutex_trylock(&ni->mrec_lock))) {
ntfs_debug("Mft record 0x%lx is already locked, do "
"not write it.", mft_no);
atomic_dec(&ni->count);
iput(vi);
return false;
}
ntfs_debug("Managed to lock mft record 0x%lx, write it.",
mft_no);
/*
* The write has to occur while we hold the mft record lock so
* return the locked ntfs inode.
*/
*locked_ni = ni;
return true;
}
ntfs_debug("Inode 0x%lx is not in icache.", mft_no);