1 #include "clusterautoconfig.h"
2
3 #include <inttypes.h>
4 #include <stdio.h>
5 #include <stdlib.h>
6 #include <string.h>
7 #include <sys/types.h>
8 #include <sys/stat.h>
9 #include <unistd.h>
10 #include <libintl.h>
11 #include <ctype.h>
12 #include <fcntl.h>
13 #define _(String) gettext(String)
14
15 #include <logging.h>
16 #include "libgfs2.h"
17 #include "link.h"
18 #include "osi_tree.h"
19 #include "fsck.h"
20 #include "util.h"
21 #include "metawalk.h"
22 #include "inode_hash.h"
23
24 #define COMFORTABLE_BLKS 5242880 /* 20GB in 4K blocks */
25
26 /* There are two bitmaps: (1) The "blockmap" that fsck uses to keep track of
27 what block type has been discovered, and (2) The rgrp bitmap. Function
28 blockmap_set is used to set the former and gfs2_set_bitmap
29 is used to set the latter. The two must be kept in sync, otherwise
30 you'll get bitmap mismatches. This function checks the status of the
31 bitmap whenever the blockmap changes, and fixes it accordingly. */
32 int check_n_fix_bitmap(struct fsck_cx *cx, struct lgfs2_rgrp_tree *rgd,
33 uint64_t blk, int error_on_dinode, int new_state)
34 {
35 struct lgfs2_sbd *sdp = cx->sdp;
36 int old_state;
37 int treat_as_inode = 0;
38 int rewrite_rgrp = 0;
39 static struct lgfs2_rgrp_tree *prevrgd = NULL;
40
41 if (prevrgd && rgrp_contains_block(prevrgd, blk)) {
42 rgd = prevrgd;
43 } else if (rgd == NULL || !rgrp_contains_block(rgd, blk)) {
44 rgd = lgfs2_blk2rgrpd(sdp, blk);
45 prevrgd = rgd;
46 }
47 old_state = lgfs2_get_bitmap(sdp, blk, rgd);
48 if (old_state < 0) {
49 log_err(_("Block %"PRIu64" (0x%"PRIx64") is not represented in the "
50 "system bitmap; part of an rgrp or superblock.\n"),
51 blk, blk);
52 return -1;
53 }
54 if (old_state == new_state)
55 return 0;
56
57 if (error_on_dinode && old_state == GFS2_BLKST_DINODE &&
58 new_state != GFS2_BLKST_FREE) {
59 log_debug(_("Reference as '%s' to block %"PRIu64" (0x%"PRIx64") which "
60 "was marked as dinode. Needs further investigation.\n"),
61 lgfs2_blkst_str(new_state), blk, blk);
62 return 1;
63 }
64 /* Keep these messages as short as possible, or the output gets to be
65 huge and unmanageable. */
66 log_err(_("Block %"PRIu64" (0x%"PRIx64") was '%s', should be %s.\n"),
67 blk, blk, lgfs2_blkst_str(old_state), lgfs2_blkst_str(new_state));
68 if (!query(cx, _("Fix the bitmap? (y/n)"))) {
69 log_err( _("The bitmap inconsistency was ignored.\n"));
70 return 0;
71 }
72 /* If the new bitmap state is free (and therefore the old state was
73 not) we have to add to the free space in the rgrp. If the old
74 bitmap state was free (and therefore it no longer is) we have to
75 subtract to the free space. If the type changed from dinode to
76 data or data to dinode, no change in free space. */
77 lgfs2_set_bitmap(rgd, blk, new_state);
78 if (new_state == GFS2_BLKST_FREE) {
79 rgd->rt_free++;
80 rewrite_rgrp = 1;
81 } else if (old_state == GFS2_BLKST_FREE) {
82 rgd->rt_free--;
83 rewrite_rgrp = 1;
84 }
85 /* If we're freeing a dinode, get rid of the data structs for it. */
86 if (old_state == GFS2_BLKST_DINODE ||
87 old_state == GFS2_BLKST_UNLINKED) {
88 struct dir_info *dt;
89 struct inode_info *ii;
90
91 dt = dirtree_find(cx, blk);
92 if (dt) {
93 dirtree_delete(cx, dt);
94 treat_as_inode = 1;
95 }
96 ii = inodetree_find(cx, blk);
97 if (ii) {
98 inodetree_delete(cx, ii);
99 treat_as_inode = 1;
100 } else {
101 treat_as_inode = 1;
102 }
103 if (old_state == GFS2_BLKST_DINODE) {
104 if (treat_as_inode && rgd->rt_dinodes > 0)
105 rgd->rt_dinodes--;
106 rewrite_rgrp = 1;
107 }
108 link1_set(&nlink1map, blk, 0);
109 } else if (new_state == GFS2_BLKST_DINODE) {
110 rgd->rt_dinodes++;
111 rewrite_rgrp = 1;
112 }
113 if (rewrite_rgrp) {
114 lgfs2_rgrp_out(rgd, rgd->rt_bits[0].bi_data);
115 rgd->rt_bits[0].bi_modified = 1;
116 }
117 log_err( _("The bitmap was fixed.\n"));
118 return 0;
119 }
120
121 /*
122 * _fsck_bitmap_set - Mark a block in the bitmap, and adjust free space.
123 */
124 int _fsck_bitmap_set(struct fsck_cx *cx, struct lgfs2_inode *ip, uint64_t bblock,
125 const char *btype, int mark,
126 int error_on_dinode, const char *caller, int fline)
127 {
128 int error;
129 static int prev_ino_addr = 0;
130 static int prev_mark = 0;
131 static int prevcount = 0;
132 static const char *prev_caller = NULL;
133
134 if (print_level >= MSG_DEBUG) {
135 if ((ip->i_num.in_addr == prev_ino_addr) &&
136 (mark == prev_mark) && caller == prev_caller) {
137 log_info("(0x%"PRIx64") ", bblock);
138 prevcount++;
139 if (prevcount > 10) {
140 log_info("\n");
141 prevcount = 0;
142 }
143 /* I'm circumventing the log levels here on purpose to make the
144 output easier to debug. */
145 } else if (ip->i_num.in_addr == bblock) {
146 if (prevcount) {
147 log_info("\n");
148 prevcount = 0;
149 }
150 printf(_("(%s:%d) %s inode found at block (0x%"PRIx64"): marking as '%s'\n"),
151 caller, fline, btype, ip->i_num.in_addr, block_type_string(mark));
152 } else {
153 if (prevcount) {
154 log_info("\n");
155 prevcount = 0;
156 }
157 printf(_("(%s:%d) inode (0x%"PRIx64") references %s block"
158 " (0x%"PRIx64"): marking as '%s'\n"),
159 caller, fline, ip->i_num.in_addr, btype, bblock, block_type_string(mark));
160 }
161 prev_ino_addr = ip->i_num.in_addr;
162 prev_mark = mark;
163 prev_caller = caller;
164 }
165 error = check_n_fix_bitmap(cx, ip->i_rgd, bblock,
166 error_on_dinode, mark);
167 if (error < 0)
168 log_err(_("This block is not represented in the bitmap.\n"));
169 return error;
170 }
171
172 struct duptree *dupfind(struct fsck_cx *cx, uint64_t block)
173 {
174 struct osi_node *node = cx->dup_blocks.osi_node;
175
176 while (node) {
177 struct duptree *dt = (struct duptree *)node;
178
179 if (block < dt->block)
180 node = node->osi_left;
181 else if (block > dt->block)
182 node = node->osi_right;
183 else
184 return dt;
185 }
186 return NULL;
187 }
188
189 struct lgfs2_inode *fsck_system_inode(struct lgfs2_sbd *sdp, uint64_t block)
190 {
191 if (lf_dip && lf_dip->i_num.in_addr == block)
192 return lf_dip;
193 return lgfs2_is_system_inode(sdp, block);
194 }
195
/* fsck_load_inode - like lgfs2_inode_read(), except that an inode already
   known to fsck_system_inode() is returned as-is instead of being read. */
struct lgfs2_inode *fsck_load_inode(struct lgfs2_sbd *sdp, uint64_t block)
{
	struct lgfs2_inode *sysip = fsck_system_inode(sdp, block);

	if (sysip == NULL)
		return lgfs2_inode_read(sdp, block);
	return sysip;
}
207
208 /* fsck_inode_get - same as inode_get() in libgfs2 but system inodes
209 get special treatment. */
210 struct lgfs2_inode *fsck_inode_get(struct lgfs2_sbd *sdp, struct lgfs2_rgrp_tree *rgd,
211 struct lgfs2_buffer_head *bh)
212 {
213 struct lgfs2_inode *sysip;
214 struct lgfs2_inode *ip;
215
216 sysip = fsck_system_inode(sdp, bh->b_blocknr);
217 if (sysip)
218 return sysip;
219
220 ip = lgfs2_inode_get(sdp, bh);
221 if (ip) {
222 ip->i_rgd = rgd;
223 ip->i_bh = bh;
224 }
225 return ip;
226 }
227
228 /* fsck_inode_put - same as lgfs2_inode_put() in libgfs2 but system inodes
229 get special treatment. */
230 void fsck_inode_put(struct lgfs2_inode **ip_in)
231 {
232 struct lgfs2_inode *ip = *ip_in;
233 struct lgfs2_inode *sysip;
234
235 sysip = fsck_system_inode(ip->i_sbd, ip->i_num.in_addr);
236 if (!sysip)
237 lgfs2_inode_put(ip_in);
238 }
239
240 /**
241 * dirent_repair - attempt to repair a corrupt directory entry.
242 * @bh - The buffer header that contains the bad dirent
243 * @dh - The directory entry in native format
244 * @dent - The directory entry in on-disk format
245 * @type - Type of directory (DIR_LINEAR or DIR_EXHASH)
246 * @first - TRUE if this is the first dirent in the buffer
247 *
248 * This function tries to repair a corrupt directory entry. All we
249 * know at this point is that the length field is wrong.
250 */
251 static int dirent_repair(struct lgfs2_inode *ip, struct lgfs2_buffer_head *bh,
252 struct lgfs2_dirent *d, struct gfs2_dirent *dent,
253 int type, int first)
254 {
255 char *bh_end, *p;
256 int calc_de_name_len = 0;
257
258 /* If this is a sentinel, just fix the length and move on */
259 if (first && !d->dr_inum.in_formal_ino) { /* Is it a sentinel? */
260 if (type == DIR_LINEAR)
261 d->dr_rec_len = ip->i_sbd->sd_bsize -
262 sizeof(struct gfs2_dinode);
263 else
264 d->dr_rec_len = ip->i_sbd->sd_bsize -
265 sizeof(struct gfs2_leaf);
266 } else {
267 bh_end = bh->b_data + ip->i_sbd->sd_bsize;
268 /* first, figure out a probable name length */
269 p = (char *)dent + sizeof(struct gfs2_dirent);
270 while (*p && /* while there's a non-zero char and */
271 isprint(*p) && /* a printable character and */
272 p < bh_end) { /* not past end of buffer */
273 calc_de_name_len++;
274 p++;
275 }
276 if (!calc_de_name_len)
277 return 1;
278 /* There can often be noise at the end, so only */
279 /* Trust the shorter of the two in case we have too much */
280 /* Or rather, only trust ours if it's shorter. */
281 if (!d->dr_name_len || d->dr_name_len > GFS2_FNAMESIZE ||
282 calc_de_name_len < d->dr_name_len) /* if dent is hosed */
283 d->dr_name_len = calc_de_name_len; /* use ours */
284 d->dr_rec_len = GFS2_DIRENT_SIZE(d->dr_name_len);
285 }
286 lgfs2_dirent_out(d, dent);
287 lgfs2_bmodified(bh);
288 return 0;
289 }
290
291 /**
292 * dirblk_truncate - truncate a directory block
293 */
294 static void dirblk_truncate(struct lgfs2_inode *ip, struct gfs2_dirent *fixb,
295 struct lgfs2_buffer_head *bh)
296 {
297 char *bh_end;
298 struct lgfs2_dirent d;
299
300 bh_end = bh->b_data + ip->i_sbd->sd_bsize;
301 /* truncate the block to save the most dentries. To do this we
302 have to patch the previous dent. */
303 lgfs2_dirent_in(&d, fixb);
304 d.dr_rec_len = bh_end - (char *)fixb;
305 lgfs2_dirent_out(&d, fixb);
306 lgfs2_bmodified(bh);
307 }
308
309 /*
310 * check_entries - check directory entries for a given block
311 *
312 * @ip - dinode associated with this leaf block
313 * bh - buffer for the leaf block
314 * type - type of block this is (linear or exhash)
315 * @count - set to the count entries
316 * @lindex - the last inde
317 * @pass - structure pointing to pass-specific functions
318 *
319 * returns: 0 - good block or it was repaired to be good
320 * -1 - error occurred
321 */
322 static int check_entries(struct fsck_cx *cx, struct lgfs2_inode *ip, struct lgfs2_buffer_head *bh,
323 int type, uint32_t *count, int lindex,
324 struct metawalk_fxns *pass)
325 {
326 struct gfs2_dirent *dent, *prev;
327 struct lgfs2_dirent d;
328 int error = 0;
329 char *bh_end;
330 char *filename;
331 int first = 1;
332
333 bh_end = bh->b_data + ip->i_sbd->sd_bsize;
334
335 if (type == DIR_LINEAR) {
336 dent = (struct gfs2_dirent *)(bh->b_data + sizeof(struct gfs2_dinode));
337 } else {
338 dent = (struct gfs2_dirent *)(bh->b_data + sizeof(struct gfs2_leaf));
339 log_debug(_("Checking leaf %"PRIu64" (0x%"PRIx64")\n"),
340 bh->b_blocknr, bh->b_blocknr);
341 }
342
343 prev = NULL;
344 if (!pass->check_dentry)
345 return 0;
346
347 while (1) {
348 if (skip_this_pass || fsck_abort)
349 return FSCK_OK;
350 lgfs2_dirent_in(&d, dent);
351 filename = (char *)dent + sizeof(struct gfs2_dirent);
352
353 if (d.dr_rec_len < sizeof(struct gfs2_dirent) +
354 d.dr_name_len ||
355 (d.dr_inum.in_formal_ino && !d.dr_name_len && !first)) {
356 log_err(_("Directory block %"PRIu64" (0x%"PRIx64"), "
357 "entry %d of directory %"PRIu64" (0x%"PRIx64") "
358 "is corrupt.\n"),
359 bh->b_blocknr, bh->b_blocknr, (*count) + 1,
360 ip->i_num.in_addr, ip->i_num.in_addr);
361 if (query(cx, _("Attempt to repair it? (y/n) "))) {
362 if (dirent_repair(ip, bh, &d, dent, type,
363 first)) {
364 if (first) /* make a new sentinel */
365 dirblk_truncate(ip, dent, bh);
366 else
367 dirblk_truncate(ip, prev, bh);
368 log_err( _("Unable to repair corrupt "
369 "directory entry; the "
370 "entry was removed "
371 "instead.\n"));
372 return 0;
373 } else {
374 log_err( _("Corrupt directory entry "
375 "repaired.\n"));
376 /* keep looping through dentries */
377 }
378 } else {
379 log_err( _("Corrupt directory entry ignored, "
380 "stopped after checking %d entries.\n"),
381 *count);
382 return 0;
383 }
384 }
385 if (!d.dr_inum.in_formal_ino) {
386 if (first) {
387 log_debug( _("First dirent is a sentinel (place holder).\n"));
388 first = 0;
389 } else {
390 log_err(_("Directory entry with inode number of "
391 "zero in leaf %"PRIu64" (0x%"PRIx64") of "
392 "directory %"PRIu64" (0x%"PRIx64")!\n"),
393 bh->b_blocknr, bh->b_blocknr,
394 ip->i_num.in_addr, ip->i_num.in_addr);
395 if (query(cx, _("Attempt to remove it? (y/n) "))) {
396 dirblk_truncate(ip, prev, bh);
397 log_err(_("The corrupt directory "
398 "entry was removed.\n"));
399 } else {
400 log_err( _("Corrupt directory entry "
401 "ignored, stopped after "
402 "checking %d entries.\n"),
403 *count);
404 }
405 return 0;
406 }
407 } else {
408 if (!d.dr_inum.in_addr && first) { /* reverse sentinel */
409 log_debug( _("First dirent is a Sentinel (place holder).\n"));
410 /* Swap the two to silently make it a proper sentinel */
411 d.dr_inum.in_addr = d.dr_inum.in_formal_ino;
412 d.dr_inum.in_formal_ino = 0;
413 lgfs2_dirent_out(&d, dent);
414 lgfs2_bmodified(bh);
415 /* Mark dirent buffer as modified */
416 first = 0;
417 } else {
418 error = pass->check_dentry(cx, ip, dent, prev, bh,
419 filename, count,
420 &lindex,
421 pass->private);
422 if (error < 0) {
423 stack;
424 return error;
425 }
426 }
427 }
428
429 if ((char *)dent + d.dr_rec_len >= bh_end){
430 log_debug(_("Last entry processed for %"PRIu64"->%"PRIu64
431 "(0x%"PRIx64"->0x%"PRIx64"), di_blocks=%"PRIu64".\n"),
432 ip->i_num.in_addr, bh->b_blocknr, ip->i_num.in_addr,
433 bh->b_blocknr, ip->i_blocks);
434 break;
435 }
436
437 /* If we didn't clear the dentry, or if we did, but it
438 * was the first dentry, set prev */
439 if (!error || first)
440 prev = dent;
441 first = 0;
442 dent = (struct gfs2_dirent *)((char *)dent + d.dr_rec_len);
443 }
444 return 0;
445 }
446
447 /**
448 * check_leaf - check a leaf block for errors
449 * Reads in the leaf block
450 * Leaves the buffer around for further analysis (caller must lgfs2_brelse)
451 */
452 int check_leaf(struct fsck_cx *cx, struct lgfs2_inode *ip, int lindex, struct metawalk_fxns *pass,
453 uint64_t *leaf_no, struct lgfs2_leaf *leaf, int *ref_count)
454 {
455 int error = 0, fix;
456 struct lgfs2_buffer_head *lbh = NULL;
457 struct gfs2_leaf *lfp;
458 uint32_t count = 0;
459 struct lgfs2_sbd *sdp = ip->i_sbd;
460 const char *msg;
461 int di_depth = ip->i_depth;
462
463 /* Make sure the block number is in range. */
464 if (!valid_block_ip(ip, *leaf_no)) {
465 log_err( _("Leaf block #%"PRIu64" (0x%"PRIx64") is out of range for "
466 "directory #%"PRIu64" (0x%"PRIx64") at index %d (0x%x).\n"),
467 *leaf_no, *leaf_no, ip->i_num.in_addr, ip->i_num.in_addr,
468 lindex, lindex);
469 msg = _("that is out of range");
470 goto bad_leaf;
471 }
472
473 /* Try to read in the leaf block. */
474 lbh = lgfs2_bread(sdp, *leaf_no);
475 /* Make sure it's really a valid leaf block. */
476 if (lgfs2_check_meta(lbh->b_data, GFS2_METATYPE_LF)) {
477 msg = _("that is not really a leaf");
478 goto bad_leaf;
479 }
480 if (pass->check_leaf_depth)
481 error = pass->check_leaf_depth(cx, ip, *leaf_no, *ref_count, lbh);
482
483 if (error >= 0 && pass->check_leaf) {
484 error = pass->check_leaf(cx, ip, *leaf_no, pass->private);
485 if (error == -EEXIST) {
486 log_info(_("Previous reference to leaf %"PRIu64" (0x%"PRIx64") "
487 "has already checked it; skipping.\n"),
488 *leaf_no, *leaf_no);
489 lgfs2_brelse(lbh);
490 return error;
491 }
492 }
493 /* Early versions of GFS2 had an endianess bug in the kernel that set
494 lf_dirent_format to cpu_to_be16(GFS2_FORMAT_DE). This was fixed
495 to use cpu_to_be32(), but we should check for incorrect values and
496 replace them with the correct value. */
497
498 lgfs2_leaf_in(leaf, lbh->b_data);
499 if (leaf->lf_dirent_format == (GFS2_FORMAT_DE << 16)) {
500 log_debug( _("incorrect lf_dirent_format at leaf #%" PRIu64
501 "\n"), *leaf_no);
502 leaf->lf_dirent_format = GFS2_FORMAT_DE;
503 lgfs2_leaf_out(leaf, lbh->b_data);
504 lgfs2_bmodified(lbh);
505 log_debug( _("Fixing lf_dirent_format.\n"));
506 }
507
508 lfp = (struct gfs2_leaf *)lbh->b_data;
509 /* Make sure it's really a leaf. */
510 if (be32_to_cpu(lfp->lf_header.mh_type) != GFS2_METATYPE_LF) {
511 log_err(_("Inode %"PRIu64" (0x%"PRIx64") points to bad leaf %"PRIu64
512 " (0x%"PRIx64").\n"),
513 ip->i_num.in_addr, ip->i_num.in_addr, *leaf_no, *leaf_no);
514 msg = _("that is not a leaf");
515 goto bad_leaf;
516 }
517
518 if (pass->check_dentry && is_dir(ip)) {
519 error = check_entries(cx, ip, lbh, DIR_EXHASH, &count, lindex,
520 pass);
521
522 if (skip_this_pass || fsck_abort)
523 goto out;
524
525 if (error < 0) {
526 stack;
527 goto out; /* This seems wrong: needs investigation */
528 }
529
530 if (count == leaf->lf_entries)
531 goto out;
532
533 /* release and re-read the leaf in case check_entries
534 changed it. */
535 lgfs2_brelse(lbh);
536 lbh = lgfs2_bread(sdp, *leaf_no);
537 lgfs2_leaf_in(leaf, lbh->b_data);
538 if (count != leaf->lf_entries) {
539 log_err(_("Leaf %"PRIu64" (0x%"PRIx64") entry count in "
540 "directory %"PRIu64" (0x%"PRIx64") does not match "
541 "number of entries found - is %u, found %u\n"),
542 *leaf_no, *leaf_no, ip->i_num.in_addr, ip->i_num.in_addr,
543 leaf->lf_entries, count);
544 if (query(cx, _("Update leaf entry count? (y/n) "))) {
545 leaf->lf_entries = count;
546 lgfs2_leaf_out(leaf, lbh->b_data);
547 lgfs2_bmodified(lbh);
548 log_warn( _("Leaf entry count updated\n"));
549 } else
550 log_err( _("Leaf entry count left in "
551 "inconsistent state\n"));
552 }
553 }
554 out:
555 if (di_depth < ip->i_depth) {
556 log_debug(_("Depth of directory %"PRIu64" (0x%"PRIx64") changed from "
557 "%d to %d; adjusting ref_count from %d to %d\n"),
558 ip->i_num.in_addr, ip->i_num.in_addr, di_depth, ip->i_depth,
559 *ref_count, (*ref_count) << (ip->i_depth - di_depth));
560 (*ref_count) <<= (ip->i_depth - di_depth);
561 }
562 lgfs2_brelse(lbh);
563 if (error < 0)
564 return error;
565 return 0;
566
567 bad_leaf:
568 if (lbh)
569 lgfs2_brelse(lbh);
570 if (pass->repair_leaf) {
571 /* The leaf we read in is bad so we need to repair it. */
572 fix = pass->repair_leaf(cx, ip, leaf_no, lindex, *ref_count, msg);
573 if (fix < 0)
574 return fix;
575
576 }
577 if (di_depth < ip->i_depth) {
578 log_debug(_("Depth of directory %"PRIu64" (0x%"PRIx64") changed from "
579 "%d to %d. Adjusting ref_count from %d to %d\n"),
580 ip->i_num.in_addr, ip->i_num.in_addr, di_depth, ip->i_depth,
581 *ref_count, (*ref_count) << (ip->i_depth - di_depth));
582 (*ref_count) <<= (ip->i_depth - di_depth);
583 }
584 return 1;
585 }
586
/* qsort() comparison callback ordering uint64_t values ascending.
   Returns -1, 0 or 1. */
static int u64cmp(const void *p1, const void *p2)
{
	const uint64_t lhs = *(const uint64_t *)p1;
	const uint64_t rhs = *(const uint64_t *)p2;

	/* Each comparison yields 0 or 1, so the difference is -1, 0 or 1. */
	return (lhs > rhs) - (lhs < rhs);
}
599
600 static void dir_leaf_reada(struct lgfs2_inode *ip, __be64 *tbl, unsigned hsize)
601 {
602 uint64_t *t = alloca(hsize * sizeof(uint64_t));
603 uint64_t leaf_no;
604 struct lgfs2_sbd *sdp = ip->i_sbd;
605 unsigned n = 0;
606 unsigned i;
607
608 for (i = 0; i < hsize; i++) {
609 leaf_no = be64_to_cpu(tbl[i]);
610 if (valid_block_ip(ip, leaf_no))
611 t[n++] = leaf_no * sdp->sd_bsize;
612 }
613 qsort(t, n, sizeof(uint64_t), u64cmp);
614 for (i = 0; i < n; i++)
615 (void)posix_fadvise(sdp->device_fd, t[i], sdp->sd_bsize, POSIX_FADV_WILLNEED);
616 }
617
618 /* Checks exhash directory entries */
619 int check_leaf_blks(struct fsck_cx *cx, struct lgfs2_inode *ip, struct metawalk_fxns *pass)
620 {
621 int error = 0;
622 unsigned hsize = (1 << ip->i_depth);
623 uint64_t leaf_no, leaf_next;
624 uint64_t first_ok_leaf, orig_di_blocks;
625 struct lgfs2_buffer_head *lbh;
626 int lindex;
627 struct lgfs2_sbd *sdp = ip->i_sbd;
628 int ref_count, orig_ref_count, orig_di_depth, orig_di_height;
629 __be64 *tbl;
630 int chained_leaf, tbl_valid;
631
632 tbl = get_dir_hash(ip);
633 if (tbl == NULL) {
634 perror("get_dir_hash");
635 return -1;
636 }
637 tbl_valid = 1;
638 orig_di_depth = ip->i_depth;
639 orig_di_height = ip->i_height;
640 orig_di_blocks = ip->i_blocks;
641
642 /* Turn off system readahead */
643 (void)posix_fadvise(sdp->device_fd, 0, 0, POSIX_FADV_RANDOM);
644
645 /* Readahead */
646 dir_leaf_reada(ip, tbl, hsize);
647
648 if (pass->check_hash_tbl) {
649 error = pass->check_hash_tbl(cx, ip, tbl, hsize, pass->private);
650 if (error < 0) {
651 free(tbl);
652 (void)posix_fadvise(sdp->device_fd, 0, 0, POSIX_FADV_NORMAL);
653 return error;
654 }
655 /* If hash table changes were made, read it in again. */
656 if (error) {
657 free(tbl);
658 tbl = get_dir_hash(ip);
659 if (tbl == NULL) {
660 perror("get_dir_hash");
661 return -1;
662 }
663 }
664 }
665
666 /* Find the first valid leaf pointer in range and use it as our "old"
667 leaf. That way, bad blocks at the beginning will be overwritten
668 with the first valid leaf. */
669 first_ok_leaf = leaf_no = -1;
670 for (lindex = 0; lindex < hsize; lindex++) {
671 leaf_no = be64_to_cpu(tbl[lindex]);
672 if (valid_block_ip(ip, leaf_no)) {
673 lbh = lgfs2_bread(sdp, leaf_no);
674 /* Make sure it's really a valid leaf block. */
675 if (lgfs2_check_meta(lbh->b_data, GFS2_METATYPE_LF) == 0) {
676 lgfs2_brelse(lbh);
677 first_ok_leaf = leaf_no;
678 break;
679 }
680 lgfs2_brelse(lbh);
681 }
682 }
683 if (first_ok_leaf == -1) { /* no valid leaf found */
684 log_err(_("Directory #%"PRIu64" (0x%"PRIx64") has no valid leaf blocks\n"),
685 ip->i_num.in_addr, ip->i_num.in_addr);
686 free(tbl);
687 (void)posix_fadvise(sdp->device_fd, 0, 0, POSIX_FADV_NORMAL);
688 return 1;
689 }
690 lindex = 0;
691 leaf_next = -1;
692 while (lindex < hsize) {
693 int l;
694
695 if (fsck_abort)
696 break;
697
698 if (!tbl_valid) {
699 free(tbl);
700 log_debug(_("Re-reading 0x%"PRIx64" hash table.\n"), ip->i_num.in_addr);
701 tbl = get_dir_hash(ip);
702 if (tbl == NULL) {
703 perror("get_dir_hash");
704 return -1;
705 }
706 tbl_valid = 1;
707 orig_di_depth = ip->i_depth;
708 orig_di_height = ip->i_height;
709 orig_di_blocks = ip->i_blocks;
710 }
711 leaf_no = be64_to_cpu(tbl[lindex]);
712
713 /* count the number of block pointers to this leaf. We don't
714 need to count the current lindex, because we already know
715 it's a reference */
716 ref_count = 1;
717
718 for (l = lindex + 1; l < hsize; l++) {
719 leaf_next = be64_to_cpu(tbl[l]);
720 if (leaf_next != leaf_no)
721 break;
722 ref_count++;
723 }
724 orig_ref_count = ref_count;
725
726 chained_leaf = 0;
727 do {
728 struct lgfs2_leaf leaf;
729 if (fsck_abort) {
730 free(tbl);
731 (void)posix_fadvise(sdp->device_fd, 0, 0, POSIX_FADV_NORMAL);
732 return 0;
733 }
734 error = check_leaf(cx, ip, lindex, pass, &leaf_no, &leaf,
735 &ref_count);
736 if (ref_count != orig_ref_count) {
737 log_debug(_("Ref count of leaf 0x%"PRIx64
738 " changed from %d to %d.\n"),
739 leaf_no, orig_ref_count, ref_count);
740 tbl_valid = 0;
741 }
742 if (error < 0) {
743 free(tbl);
744 return error;
745 }
746 if (!leaf.lf_next || error)
747 break;
748 leaf_no = leaf.lf_next;
749 chained_leaf++;
750 log_debug(_("Leaf chain #%d (0x%"PRIx64") detected.\n"),
751 chained_leaf, leaf_no);
752 } while (1); /* while we have chained leaf blocks */
753 if (orig_di_depth != ip->i_depth) {
754 log_debug(_("Depth of 0x%"PRIx64" changed from %d to %d\n"),
755 ip->i_num.in_addr, orig_di_depth, ip->i_depth);
756 tbl_valid = 0;
757 lindex <<= (ip->i_depth - orig_di_depth);
758 hsize = (1 << ip->i_depth);
759 }
760 if (orig_di_height != ip->i_height) {
761 log_debug(_("Height of 0x%"PRIx64" changed from %d to %d\n"),
762 ip->i_num.in_addr, orig_di_height, ip->i_height);
763 tbl_valid = 0;
764 }
765 if (orig_di_blocks != ip->i_blocks) {
766 log_debug(_("Block count of 0x%"PRIx64" changed from %"PRIu64" to %"PRIu64"\n"),
767 ip->i_num.in_addr, orig_di_blocks, ip->i_blocks);
768 tbl_valid = 0;
769 }
770 lindex += ref_count;
771 } /* for every leaf block */
772 free(tbl);
773 (void)posix_fadvise(sdp->device_fd, 0, 0, POSIX_FADV_NORMAL);
774 return 0;
775 }
776
777 static int check_eattr_entries(struct fsck_cx *cx, struct lgfs2_inode *ip,
778 struct lgfs2_buffer_head *bh,
779 struct metawalk_fxns *pass)
780 {
781 struct gfs2_ea_header *ea_hdr, *ea_hdr_prev = NULL;
782 __be64 *ea_data_ptr = NULL;
783 int i;
784 int error = 0, err;
785 uint32_t offset = (uint32_t)sizeof(struct gfs2_meta_header);
786 uint32_t offset_limit = ip->i_sbd->sd_bsize - sizeof(struct gfs2_ea_header);
787
788 if (!pass->check_eattr_entry)
789 return 0;
790
791 ea_hdr = (struct gfs2_ea_header *)(bh->b_data +
792 sizeof(struct gfs2_meta_header));
793
794 while (1){
795 if (ea_hdr->ea_type == GFS2_EATYPE_UNUSED)
796 error = 0;
797 else
798 error = pass->check_eattr_entry(cx, ip, bh, ea_hdr,
799 ea_hdr_prev,
800 pass->private);
801 if (error < 0) {
802 stack;
803 return -1;
804 }
805 if (error == 0 && pass->check_eattr_extentry &&
806 ea_hdr->ea_num_ptrs) {
807 uint32_t tot_ealen = 0;
808 struct lgfs2_sbd *sdp = ip->i_sbd;
809
810 ea_data_ptr = ((__be64 *)((char *)ea_hdr +
811 sizeof(struct gfs2_ea_header) +
812 ((ea_hdr->ea_name_len + 7) & ~7)));
813
814 /* It is possible when a EA is shrunk
815 ** to have ea_num_ptrs be greater than
816 ** the number required for ** data.
817 ** In this case, the EA ** code leaves
818 ** the blocks ** there for **
819 ** reuse........... */
820
821 for(i = 0; i < ea_hdr->ea_num_ptrs; i++){
822 err = pass->check_eattr_extentry(cx, ip, i,
823 ea_data_ptr, bh, tot_ealen,
824 ea_hdr, ea_hdr_prev,
825 pass->private);
826 if (err)
827 error = err;
828 tot_ealen += sdp->sd_bsize -
829 sizeof(struct gfs2_meta_header);
830 ea_data_ptr++;
831 }
832 }
833 offset += be32_to_cpu(ea_hdr->ea_rec_len);
834 if (ea_hdr->ea_flags & GFS2_EAFLAG_LAST ||
835 offset > offset_limit || ea_hdr->ea_rec_len == 0) {
836 break;
837 }
838 ea_hdr_prev = ea_hdr;
839 ea_hdr = (struct gfs2_ea_header *)
840 ((char *)(ea_hdr) +
841 be32_to_cpu(ea_hdr->ea_rec_len));
842 }
843
844 return error;
845 }
846
847 /**
848 * check_leaf_eattr
849 * @ip: the inode the eattr comes from
850 * @block: block number of the leaf
851 *
852 * Returns: 0 on success, 1 if removal is needed, -1 on error
853 */
854 static int check_leaf_eattr(struct fsck_cx *cx, struct lgfs2_inode *ip, uint64_t block,
855 uint64_t parent, struct metawalk_fxns *pass)
856 {
857 struct lgfs2_buffer_head *bh = NULL;
858
859 if (pass->check_eattr_leaf) {
860 int error = 0;
861
862 log_debug(_("Checking EA leaf block #%"PRIu64" (0x%"PRIx64") for "
863 "inode #%"PRIu64" (0x%"PRIx64").\n"),
864 block, block, ip->i_num.in_addr, ip->i_num.in_addr);
865
866 error = pass->check_eattr_leaf(cx, ip, block, parent, &bh,
867 pass->private);
868 if (error < 0) {
869 stack;
870 return -1;
871 }
872 if (error > 0) {
873 if (bh)
874 lgfs2_brelse(bh);
875 return 1;
876 }
877 if (bh) {
878 error = check_eattr_entries(cx, ip, bh, pass);
879 lgfs2_brelse(bh);
880 }
881 return error;
882 }
883
884 return 0;
885 }
886
887 /**
888 * check_indirect_eattr
889 * @ip: the inode the eattr comes from
890 * @indirect_block
891 *
892 * Returns: 0 on success -1 on error
893 */
894 static int check_indirect_eattr(struct fsck_cx *cx, struct lgfs2_inode *ip, uint64_t indirect,
895 struct lgfs2_buffer_head *indirect_buf,
896 struct metawalk_fxns *pass)
897 {
898 int error = 0, err;
899 __be64 *ea_leaf_ptr, *end;
900 uint64_t block;
901 struct lgfs2_sbd *sdp = ip->i_sbd;
902 int first_ea_is_bad = 0;
903 uint64_t di_eattr_save = ip->i_eattr;
904 uint64_t offset = sizeof(struct gfs2_meta_header);
905 int leaf_pointers = 0, leaf_pointer_errors = 0;
906
907 ea_leaf_ptr = (__be64 *)(indirect_buf->b_data + offset);
908 end = ea_leaf_ptr + ((sdp->sd_bsize - offset) / 8);
909
910 while (*ea_leaf_ptr && (ea_leaf_ptr < end)){
911 block = be64_to_cpu(*ea_leaf_ptr);
912 leaf_pointers++;
913 err = check_leaf_eattr(cx, ip, block, indirect, pass);
914 if (err) {
915 error = err;
916 log_err(_("Error detected in leaf block %"PRIu64" (0x%"PRIx64") "
917 "referenced by indirect block %"PRIu64" (0x%"PRIx64").\n"),
918 block, block, indirect, indirect);
919 log_err(_("Subsequent leaf block pointers should be "
920 "cleared.\n"));
921 }
922 if (error) { /* leaf blocks following an error must also be
923 treated as error blocks and cleared. */
924 leaf_pointer_errors++;
925 log_err(_("Pointer to EA leaf block %"PRIu64" (0x%"PRIx64") in "
926 "indirect block %"PRIu64" (0x%"PRIx64") should be cleared.\n"),
927 block, block, indirect, indirect);
928 }
929 /* If the first eattr lead is bad, we can't have a hole, so we
930 have to treat this as an unrecoverable eattr error and
931 delete all eattr info. Calling finish_eattr_indir here
932 causes ip->i_di.di_eattr = 0 and that ensures that
933 subsequent calls to check_leaf_eattr result in the eattr
934 check_leaf_block nuking them all "due to previous errors" */
935 if (leaf_pointers == 1 && leaf_pointer_errors == 1) {
936 first_ea_is_bad = 1;
937 if (pass->finish_eattr_indir)
938 pass->finish_eattr_indir(cx, ip, leaf_pointers,
939 leaf_pointer_errors,
940 pass->private);
941 } else if (leaf_pointer_errors) {
942 /* This is a bit tricky. We can't have eattr holes.
943 So if we have 4 good eattrs, 1 bad eattr and 5 more
944 good ones: GGGGBGGGGG, we need to tell
945 check_leaf_eattr to delete all eattrs after the bad
946 one. So we want: GGGG when we finish. To do that,
947 we set di_eattr to 0 temporarily. */
948 ip->i_eattr = 0;
949 lgfs2_bmodified(ip->i_bh);
950 }
951 ea_leaf_ptr++;
952 }
953 /* If we temporarily nuked the ea block to prevent checking past
954 a corrupt ea leaf, we need to restore the saved di_eattr block. */
955 if (di_eattr_save != 0)
956 ip->i_eattr = di_eattr_save;
957 if (pass->finish_eattr_indir) {
958 if (!first_ea_is_bad) {
959 pass->finish_eattr_indir(cx, ip, leaf_pointers,
960 leaf_pointer_errors,
961 pass->private);
962 }
963 if (pass->delete_block && leaf_pointer_errors &&
964 leaf_pointer_errors == leaf_pointers) {
965 pass->delete_block(cx, ip, indirect, NULL, "leaf", NULL);
966 error = 1;
967 }
968 }
969
970 return error;
971 }
972
973 /**
974 * check_inode_eattr - check the EA's for a single inode
975 * @ip: the inode whose EA to check
976 *
977 * Returns: 0 on success, -1 on error
978 */
979 int check_inode_eattr(struct fsck_cx *cx, struct lgfs2_inode *ip, struct metawalk_fxns *pass)
980 {
981 int error = 0;
982 struct lgfs2_buffer_head *indirect_buf = NULL;
983
984 if (!ip->i_eattr)
985 return 0;
986
987 if (ip->i_flags & GFS2_DIF_EA_INDIRECT){
988 if (!pass->check_eattr_indir)
989 return 0;
990
991 log_debug(_("Checking EA indirect block #%"PRIu64" (0x%"PRIx64") for "
992 "inode #%"PRIu64" (0x%"PRIx64")..\n"),
993 ip->i_eattr, ip->i_eattr, ip->i_num.in_addr, ip->i_num.in_addr);
994 error = pass->check_eattr_indir(cx, ip, ip->i_eattr, ip->i_num.in_addr,
995 &indirect_buf, pass->private);
996 if (!error) {
997 error = check_indirect_eattr(cx, ip, ip->i_eattr,
998 indirect_buf, pass);
999 if (error)
1000 stack;
1001 }
1002 if (indirect_buf)
1003 lgfs2_brelse(indirect_buf);
1004 return error;
1005 }
1006 error = check_leaf_eattr(cx, ip, ip->i_eattr, ip->i_num.in_addr, pass);
1007 if (error)
1008 stack;
1009
1010 return error;
1011 }
1012
1013 /**
1014 * free_metalist - free all metadata on a multi-level metadata list
1015 */
1016 static void free_metalist(struct lgfs2_inode *ip, osi_list_t *mlp)
1017 {
1018 unsigned int height = ip->i_height;
1019 unsigned int i;
1020 struct lgfs2_buffer_head *nbh;
1021
1022 for (i = 0; i <= height; i++) {
1023 osi_list_t *list;
1024
1025 list = &mlp[i];
1026 while (!osi_list_empty(list)) {
1027 nbh = osi_list_entry(list->next,
1028 struct lgfs2_buffer_head, b_altlist);
1029 if (nbh == ip->i_bh)
1030 osi_list_del_init(&nbh->b_altlist);
1031 else
1032 lgfs2_brelse(nbh);
1033 }
1034 }
1035 }
1036
1037 static void file_ra(struct lgfs2_inode *ip, struct lgfs2_buffer_head *bh,
1038 int head_size, int maxptrs, int h)
1039 {
1040 struct lgfs2_sbd *sdp = ip->i_sbd;
1041 uint64_t sblock = 0, block;
1042 int extlen = 0;
1043 __be64 *p;
1044
1045 if (h + 2 == ip->i_height) {
1046 p = (__be64 *)(bh->b_data + head_size);
1047 if (*p && *(p + 1)) {
1048 sblock = be64_to_cpu(*p);
1049 p++;
1050 block = be64_to_cpu(*p);
1051 extlen = block - sblock;
1052 if (extlen > 1 && extlen <= maxptrs) {
1053 (void)posix_fadvise(sdp->device_fd,
1054 sblock * sdp->sd_bsize,
1055 (extlen + 1) * sdp->sd_bsize,
1056 POSIX_FADV_WILLNEED);
1057 return;
1058 }
1059 }
1060 extlen = 0;
1061 }
1062 for (p = (__be64 *)(bh->b_data + head_size);
1063 p < (__be64 *)(bh->b_data + sdp->sd_bsize); p++) {
1064 if (*p) {
1065 if (!sblock) {
1066 sblock = be64_to_cpu(*p);
1067 extlen = 1;
1068 continue;
1069 }
1070 block = be64_to_cpu(*p);
1071 if (block == sblock + extlen) {
1072 extlen++;
1073 continue;
1074 }
1075 }
1076 if (extlen && sblock) {
1077 if (extlen > 1)
1078 extlen--;
1079 (void)posix_fadvise(sdp->device_fd, sblock * sdp->sd_bsize,
1080 extlen * sdp->sd_bsize,
1081 POSIX_FADV_WILLNEED);
1082 extlen = 0;
1083 p--;
1084 }
1085 }
1086 if (extlen)
1087 (void)posix_fadvise(sdp->device_fd, sblock * sdp->sd_bsize,
1088 extlen * sdp->sd_bsize, POSIX_FADV_WILLNEED);
1089 }
1090
1091 static int do_check_metalist(struct fsck_cx *cx, struct iptr iptr, int height, struct lgfs2_buffer_head **bhp,
1092 struct metawalk_fxns *pass)
1093 {
1094 struct lgfs2_inode *ip = iptr.ipt_ip;
1095 uint64_t block = iptr_block(iptr);
1096 int was_duplicate = 0;
1097 int is_valid = 1;
1098 int error;
1099
1100 if (pass->check_metalist == NULL)
1101 return 0;
1102
1103 error = pass->check_metalist(cx, iptr, bhp, height, &is_valid,
1104 &was_duplicate, pass->private);
1105 if (error == META_ERROR) {
1106 stack;
1107 log_info("\n");
1108 log_info(_("Serious metadata error on block %"PRIu64" (0x%"PRIx64").\n"),
1109 block, block);
1110 return error;
1111 }
1112 if (error == META_SKIP_FURTHER) {
1113 log_info("\n");
1114 log_info(_("Unrecoverable metadata error on block %"PRIu64" (0x%"PRIx64")\n"),
1115 block, block);
1116 log_info(_("Further metadata will be skipped.\n"));
1117 return error;
1118 }
1119 if (!is_valid) {
1120 log_debug("Skipping rejected block %"PRIu64" (0x%"PRIx64")\n", block, block);
1121 if (pass->invalid_meta_is_fatal)
1122 return META_ERROR;
1123 return META_SKIP_ONE;
1124 }
1125 if (was_duplicate) {
1126 log_debug("Skipping duplicate %"PRIu64" (0x%"PRIx64")\n", block, block);
1127 return META_SKIP_ONE;
1128 }
1129 if (!valid_block_ip(ip, block)) {
1130 log_debug("Skipping invalid block %"PRIu64" (0x%"PRIx64")\n", block, block);
1131 if (pass->invalid_meta_is_fatal)
1132 return META_ERROR;
1133 return META_SKIP_ONE;
1134 }
1135 return error;
1136 }
1137
1138 /**
1139 * build_and_check_metalist - check a bunch of indirect blocks
1140 * This includes hash table blocks for directories
1141 * which are technically "data" in the bitmap.
1142 *
1143 * Returns: 0 - all is well, process the blocks this metadata references
1144 * 1 - something went wrong, but process the sub-blocks anyway
1145 * -1 - something went wrong, so don't process the sub-blocks
1146 * @ip:
1147 * @mlp:
1148 */
1149 static int build_and_check_metalist(struct fsck_cx *cx, struct lgfs2_inode *ip, osi_list_t *mlp,
1150 struct metawalk_fxns *pass)
1151 {
1152 uint32_t height = ip->i_height;
1153 struct lgfs2_buffer_head *metabh = ip->i_bh;
1154 osi_list_t *prev_list, *cur_list, *tmp;
1155 struct iptr iptr = { .ipt_ip = ip, NULL, 0};
1156 int h, head_size, iblk_type;
1157 __be64 *undoptr;
1158 int maxptrs;
1159 int error;
1160
1161 osi_list_add(&metabh->b_altlist, &mlp[0]);
1162
1163 /* Directories are special. Their 'data' is the hash table, which is
1164 basically an indirect block list. Their height is not important
1165 because it checks everything through the hash table using
1166 "depth" field calculations. However, we still have to check the
1167 indirect blocks, even if the height == 1. */
1168 if (is_dir(ip))
1169 height++;
1170
1171 /* if (<there are no indirect blocks to check>) */
1172 if (height < 2)
1173 return META_IS_GOOD;
1174 for (h = 1; h < height; h++) {
1175 if (h > 1) {
1176 if (is_dir(ip) &&
1177 h == ip->i_height + 1)
1178 iblk_type = GFS2_METATYPE_JD;
1179 else
1180 iblk_type = GFS2_METATYPE_IN;
1181 head_size = sizeof(struct gfs2_meta_header);
1182 maxptrs = ip->i_sbd->sd_inptrs;
1183 } else {
1184 iblk_type = GFS2_METATYPE_DI;
1185 head_size = sizeof(struct gfs2_dinode);
1186 maxptrs = ip->i_sbd->sd_diptrs;
1187 }
1188 prev_list = &mlp[h - 1];
1189 cur_list = &mlp[h];
1190
1191 for (tmp = prev_list->next; tmp != prev_list; tmp = tmp->next) {
1192 iptr.ipt_off = head_size;
1193 iptr.ipt_bh = osi_list_entry(tmp, struct lgfs2_buffer_head, b_altlist);
1194
1195 if (lgfs2_check_meta(iptr_buf(iptr), iblk_type)) {
1196 if (pass->invalid_meta_is_fatal)
1197 return META_ERROR;
1198
1199 continue;
1200 }
1201 if (pass->readahead)
1202 file_ra(ip, iptr.ipt_bh, head_size, maxptrs, h);
1203
1204 /* Now check the metadata itself */
1205 for (; iptr.ipt_off < ip->i_sbd->sd_bsize; iptr.ipt_off += sizeof(uint64_t)) {
1206 struct lgfs2_buffer_head *nbh = NULL;
1207
1208 if (skip_this_pass || fsck_abort)
1209 return META_IS_GOOD;
1210 if (!iptr_block(iptr))
1211 continue;
1212
1213 error = do_check_metalist(cx, iptr, h, &nbh, pass);
1214 if (error == META_ERROR || error == META_SKIP_FURTHER)
1215 goto error_undo;
1216 if (error == META_SKIP_ONE)
1217 continue;
1218 if (!nbh)
1219 nbh = lgfs2_bread(ip->i_sbd, iptr_block(iptr));
1220 osi_list_add_prev(&nbh->b_altlist, cur_list);
1221 } /* for all data on the indirect block */
1222 } /* for blocks at that height */
1223 } /* for height */
1224 return 0;
1225
1226 error_undo: /* undo what we've done so far for this block */
1227 if (pass->undo_check_meta == NULL)
1228 return error;
1229
1230 log_info(_("Undoing the work we did before the error on block %"PRIu64" (0x%"PRIx64").\n"),
1231 iptr.ipt_bh->b_blocknr, iptr.ipt_bh->b_blocknr);
1232 for (undoptr = (__be64 *)(iptr_buf(iptr) + head_size);
1233 undoptr < iptr_ptr(iptr) && undoptr < iptr_endptr(iptr);
1234 undoptr++) {
1235 uint64_t block = be64_to_cpu(*undoptr);
1236
1237 if (block == 0)
1238 continue;
1239
1240 pass->undo_check_meta(cx, ip, block, h, pass->private);
1241 }
1242 return error;
1243 }
1244
1245 static unsigned int hdr_size(struct lgfs2_buffer_head *bh, unsigned int height)
1246 {
1247 if (height > 1)
1248 return sizeof(struct gfs2_meta_header);
1249 return sizeof(struct gfs2_dinode);
1250 }
1251
/* Records where the data-pointer walk hit its most significant error. */
struct error_block {
	/* metadata block in which the error was found */
	uint64_t metablk;
	/* byte offset within that metadata block */
	int metaoff;
	/* the block that was being referenced */
	uint64_t errblk;
};
1257
1258 static void report_data_error(uint64_t metablock, int offset, uint64_t block,
1259 struct error_block *error_blk,
1260 int rc, int error)
1261 {
1262 log_info("\n");
1263 if (rc < 0) {
1264 /* A fatal error trumps a non-fatal one. */
1265 if ((error_blk->errblk == 0) ||
1266 (rc < error)) {
1267 log_debug(_("Fatal error on metadata "
1268 "block 0x%"PRIx64", "
1269 "offset 0x%x, referencing data "
1270 "block 0x%"PRIx64" "
1271 "preempts non-fatal error on "
1272 "block 0x%"PRIx64"\n"),
1273 metablock,
1274 offset,
1275 block,
1276 error_blk->errblk);
1277 error_blk->metablk = metablock;
1278 error_blk->metaoff = offset;
1279 error_blk->errblk = block;
1280 }
1281 log_info(_("Unrecoverable "));
1282 } else { /* nonfatal error */
1283 if (error_blk->errblk == 0) {
1284 error_blk->metablk = metablock;
1285 error_blk->metaoff = offset;
1286 error_blk->errblk = block;
1287 }
1288 }
1289 log_info(_("data block error %d on metadata "
1290 "block %"PRId64" (0x%"PRIx64"), "
1291 "offset %d (0x%x), referencing "
1292 "data block %"PRId64" (0x%"PRIx64").\n"),
1293 rc,
1294 metablock, metablock,
1295 offset, offset,
1296 block, block);
1297 }
1298
1299 /**
1300 * check_data - check all data pointers for a given buffer
1301 * This does not include "data" blocks that are really
1302 * hash table blocks for directories.
1303 *
1304 * @ip:
1305 *
1306 * returns: +ENOENT if there are too many bad pointers
1307 * -1 if a more serious error occurred.
1308 * 0 if no errors occurred
1309 * 1 if errors were found and corrected
1310 * 2 (ENOENT) is there were too many bad pointers
1311 */
1312 static int metawalk_check_data(struct fsck_cx *cx, struct lgfs2_inode *ip, struct metawalk_fxns *pass,
1313 struct lgfs2_buffer_head *bh, unsigned int height,
1314 uint64_t *blks_checked, struct error_block *error_blk)
1315 {
1316 int error = 0, rc = 0;
1317 uint64_t block;
1318 __be64 *ptr_start = (__be64 *)(bh->b_data + hdr_size(bh, height));
1319 __be64 *ptr_end = (__be64 *)(bh->b_data + ip->i_sbd->sd_bsize);
1320 __be64 *ptr;
1321 uint64_t metablock = bh->b_blocknr;
1322
1323 /* If there isn't much pointer corruption check the pointers */
1324 log_debug("Processing data blocks for inode 0x%"PRIx64", metadata block 0x%"PRIx64".\n",
1325 ip->i_num.in_addr, metablock);
1326 for (ptr = ptr_start ; ptr < ptr_end && !fsck_abort; ptr++) {
1327 if (!*ptr)
1328 continue;
1329
1330 if (skip_this_pass || fsck_abort)
1331 return error;
1332 block = be64_to_cpu(*ptr);
1333 /* It's important that we don't call valid_block() and
1334 bypass calling check_data on invalid blocks because that
1335 would defeat the rangecheck_block related functions in
1336 pass1. Therefore the individual check_data functions
1337 should do a range check. */
1338 rc = pass->check_data(cx, ip, metablock, block, pass->private,
1339 bh, ptr);
1340 if (rc && (!error || (rc < error))) {
1341 report_data_error(metablock, (char *)ptr - bh->b_data, block, error_blk, rc, error);
1342 error = rc;
1343 }
1344 if (rc < 0)
1345 return rc;
1346 (*blks_checked)++;
1347 }
1348 return error;
1349 }
1350
1351 static int report_undo_data_error(uint64_t metablock, int offset, uint64_t block,
1352 struct error_block *error_blk,
1353 int *found_error_blk, int error)
1354 {
1355 if (metablock == error_blk->metablk &&
1356 offset == error_blk->metaoff &&
1357 block == error_blk->errblk) {
1358 if (error < 0) { /* A fatal error that stopped it? */
1359 log_debug(_("Stopping the undo process: "
1360 "fatal error block 0x%"PRIx64" was "
1361 "found at metadata block 0x%"PRIx64","
1362 "offset 0x%x.\n"),
1363 error_blk->errblk,
1364 error_blk->metablk,
1365 error_blk->metaoff);
1366 return 1;
1367 }
1368 *found_error_blk = 1;
1369 log_debug(_("The non-fatal error block 0x%"PRIx64" was "
1370 "found at metadata block 0x%"PRIx64", offset "
1371 "0x%d, but undo processing will continue "
1372 "until the end of this metadata block.\n"),
1373 error_blk->errblk,
1374 error_blk->metablk,
1375 error_blk->metaoff);
1376 }
1377 return 0;
1378 }
1379
1380 static int undo_check_data(struct fsck_cx *cx, struct lgfs2_inode *ip, struct metawalk_fxns *pass,
1381 struct lgfs2_buffer_head *bh, unsigned int height,
1382 struct error_block *error_blk, int error)
1383 {
1384 __be64 *ptr_start = (__be64 *)(bh->b_data + hdr_size(bh, height));
1385 __be64 *ptr_end = (__be64 *)(bh->b_data + ip->i_sbd->sd_bsize);
1386 __be64 *ptr;
1387 uint64_t metablock = bh->b_blocknr;
1388 int rc = 0;
1389 uint64_t block;
1390 int found_error_blk = 0;
1391
1392 /* If there isn't much pointer corruption check the pointers */
1393 for (ptr = ptr_start ; ptr < ptr_end && !fsck_abort; ptr++) {
1394 if (!*ptr)
1395 continue;
1396
1397 if (skip_this_pass || fsck_abort)
1398 return 1;
1399 block = be64_to_cpu(*ptr);
1400 if (report_undo_data_error(metablock, (char *)ptr - bh->b_data,
1401 block, error_blk, &found_error_blk, error))
1402 return 1;
1403 rc = pass->undo_check_data(cx, ip, block, pass->private);
1404 if (rc < 0)
1405 return rc;
1406 }
1407 return found_error_blk;
1408 }
1409
1410 static unsigned int should_check(struct lgfs2_buffer_head *bh, unsigned int height)
1411 {
1412 int iblk_type = height > 1 ? GFS2_METATYPE_IN : GFS2_METATYPE_DI;
1413
1414 return lgfs2_check_meta(bh->b_data, iblk_type) == 0;
1415 }
1416
1417 /**
1418 * check_metatree
1419 * @ip: inode structure in memory
1420 * @pass: structure passed in from caller to determine the sub-functions
1421 *
1422 */
1423 int check_metatree(struct fsck_cx *cx, struct lgfs2_inode *ip, struct metawalk_fxns *pass)
1424 {
1425 unsigned int height = ip->i_height;
1426 osi_list_t *metalist = alloca((height + 1) * sizeof(*metalist));
1427 osi_list_t *list, *tmp;
1428 struct lgfs2_buffer_head *bh;
1429 unsigned int i;
1430 uint64_t blks_checked = 0;
1431 int error, rc;
1432 int metadata_clean = 0;
1433 struct error_block error_blk = {0, 0, 0};
1434 int hit_error_blk = 0;
1435
1436 if (!height && !is_dir(ip))
1437 return 0;
1438
1439 /* metalist has one extra element for directories (see build_and_check_metalist). */
1440 for (i = 0; i <= height; i++)
1441 osi_list_init(&metalist[i]);
1442
1443 /* create and check the metadata list for each height */
1444 error = build_and_check_metalist(cx, ip, metalist, pass);
1445 if (error) {
1446 stack;
1447 goto undo_metalist;
1448 }
1449
1450 metadata_clean = 1;
1451 /* For directories, we've already checked the "data" blocks which
1452 * comprise the directory hash table, so we perform the directory
1453 * checks and exit. */
1454 if (is_dir(ip)) {
1455 if (!(ip->i_flags & GFS2_DIF_EXHASH))
1456 goto out;
1457 /* check validity of leaf blocks and leaf chains */
1458 error = check_leaf_blks(cx, ip, pass);
1459 if (error)
1460 goto undo_metalist;
1461 goto out;
1462 }
1463
1464 /* check data blocks */
1465 list = &metalist[height - 1];
1466 if (ip->i_blocks > COMFORTABLE_BLKS)
1467 last_reported_fblock = -10000000;
1468
1469 for (tmp = list->next; !error && tmp != list; tmp = tmp->next) {
1470 if (fsck_abort) {
1471 free_metalist(ip, metalist);
1472 return 0;
1473 }
1474 bh = osi_list_entry(tmp, struct lgfs2_buffer_head, b_altlist);
1475 if (!should_check(bh, height))
1476 continue;
1477
1478 if (pass->check_data)
1479 error = metawalk_check_data(cx, ip, pass, bh, height,
1480 &blks_checked, &error_blk);
1481 if (pass->big_file_msg && ip->i_blocks > COMFORTABLE_BLKS)
1482 pass->big_file_msg(cx, ip, blks_checked);
1483 }
1484 if (pass->big_file_msg && ip->i_blocks > COMFORTABLE_BLKS) {
1485 log_notice( _("\rLarge file at %"PRIu64" (0x%"PRIx64") - 100 percent "
1486 "complete. "
1487 "\n"),
1488 ip->i_num.in_addr, ip->i_num.in_addr);
1489 fflush(stdout);
1490 }
1491 undo_metalist:
1492 if (!error)
1493 goto out;
1494 log_err(_("Error: inode %"PRIu64" (0x%"PRIx64") had unrecoverable errors at "
1495 "metadata block %"PRIu64" (0x%"PRIx64"), offset %d (0x%x), block "
1496 "%"PRIu64" (0x%"PRIx64").\n"),
1497 ip->i_num.in_addr, ip->i_num.in_addr, error_blk.metablk, error_blk.metablk,
1498 error_blk.metaoff, error_blk.metaoff, error_blk.errblk, error_blk.errblk);
1499 if (!query(cx, _("Remove the invalid inode? (y/n) "))) {
1500 free_metalist(ip, metalist);
1501 log_err(_("Invalid inode not deleted.\n"));
1502 return error;
1503 }
1504 for (i = 0; pass->undo_check_meta && i < height; i++) {
1505 while (!osi_list_empty(&metalist[i])) {
1506 list = &metalist[i];
1507 bh = osi_list_entry(list->next,
1508 struct lgfs2_buffer_head,
1509 b_altlist);
1510 log_err(_("Undoing metadata work for block %"PRIu64" (0x%"PRIx64")\n"),
1511 bh->b_blocknr, bh->b_blocknr);
1512 if (i)
1513 rc = pass->undo_check_meta(cx, ip, bh->b_blocknr,
1514 i, pass->private);
1515 else
1516 rc = 0;
1517 if (metadata_clean && rc == 0 && i == height - 1 &&
1518 !hit_error_blk) {
1519 if (should_check(bh, height)) {
1520 rc = undo_check_data(cx, ip, pass,
1521 bh,
1522 height,
1523 &error_blk,
1524 error);
1525 if (rc > 0) {
1526 hit_error_blk = 1;
1527 log_err("Reached the error "
1528 "block undoing work "
1529 "for inode %"PRIu64" "
1530 "(0x%"PRIx64").\n",
1531 ip->i_num.in_addr, ip->i_num.in_addr);
1532 rc = 0;
1533 }
1534 }
1535 }
1536 if (bh == ip->i_bh)
1537 osi_list_del(&bh->b_altlist);
1538 else
1539 lgfs2_brelse(bh);
1540 }
1541 }
1542 /* There may be leftover duplicate records, so we need to delete them.
1543 For example, if a metadata block was found to be a duplicate, we
1544 may not have added it to the metalist, which means it's not there
1545 to undo. */
1546 delete_all_dups(cx, ip);
1547 /* Set the dinode as "bad" so it gets deleted */
1548 fsck_bitmap_set(cx, ip, ip->i_num.in_addr, "corrupt", GFS2_BLKST_FREE);
1549 log_err(_("The corrupt inode was invalidated.\n"));
1550 out:
1551 free_metalist(ip, metalist);
1552 return error;
1553 }
1554
1555 /* Checks stuffed inode directories */
1556 int check_linear_dir(struct fsck_cx *cx, struct lgfs2_inode *ip, struct lgfs2_buffer_head *bh,
1557 struct metawalk_fxns *pass)
1558 {
1559 int error = 0;
1560 uint32_t count = 0;
1561
1562 error = check_entries(cx, ip, bh, DIR_LINEAR, &count, 0, pass);
1563 if (error < 0) {
1564 stack;
1565 return -1;
1566 }
1567
1568 return error;
1569 }
1570
1571 int check_dir(struct fsck_cx *cx, struct lgfs2_inode *ip, struct metawalk_fxns *pass)
1572 {
1573 int error = 0;
1574
1575 if (ip->i_flags & GFS2_DIF_EXHASH)
1576 error = check_leaf_blks(cx, ip, pass);
1577 else
1578 error = check_linear_dir(cx, ip, ip->i_bh, pass);
1579
1580 if (error < 0)
1581 stack;
1582
1583 return error;
1584 }
1585