1    	#include "clusterautoconfig.h"
2    	
3    	#include <inttypes.h>
4    	#include <stdio.h>
5    	#include <stdlib.h>
6    	#include <string.h>
7    	#include <sys/types.h>
8    	#include <sys/stat.h>
9    	#include <unistd.h>
10   	#include <libintl.h>
11   	#include <ctype.h>
12   	#include <fcntl.h>
13   	#define _(String) gettext(String)
14   	
15   	#include <logging.h>
16   	#include "libgfs2.h"
17   	#include "link.h"
18   	#include "osi_tree.h"
19   	#include "fsck.h"
20   	#include "util.h"
21   	#include "metawalk.h"
22   	#include "inode_hash.h"
23   	
24   	#define COMFORTABLE_BLKS 5242880 /* 20GB in 4K blocks */
25   	
26   	/* There are two bitmaps: (1) The "blockmap" that fsck uses to keep track of
27   	   what block type has been discovered, and (2) The rgrp bitmap.  Function
28   	   blockmap_set is used to set the former and gfs2_set_bitmap
29   	   is used to set the latter.  The two must be kept in sync, otherwise
30   	   you'll get bitmap mismatches.  This function checks the status of the
31   	   bitmap whenever the blockmap changes, and fixes it accordingly. */
32   	int check_n_fix_bitmap(struct fsck_cx *cx, struct lgfs2_rgrp_tree *rgd,
33   			       uint64_t blk, int error_on_dinode, int new_state)
34   	{
35   		struct lgfs2_sbd *sdp = cx->sdp;
36   		int old_state;
37   		int treat_as_inode = 0;
38   		int rewrite_rgrp = 0;
39   		static struct lgfs2_rgrp_tree *prevrgd = NULL;
40   	
41   		if (prevrgd && rgrp_contains_block(prevrgd, blk)) {
42   			rgd = prevrgd;
43   		} else if (rgd == NULL || !rgrp_contains_block(rgd, blk)) {
44   			rgd = lgfs2_blk2rgrpd(sdp, blk);
45   			prevrgd = rgd;
46   		}
47   		old_state = lgfs2_get_bitmap(sdp, blk, rgd);
48   		if (old_state < 0) {
49   			log_err(_("Block %"PRIu64" (0x%"PRIx64") is not represented in the "
50   				   "system bitmap; part of an rgrp or superblock.\n"),
51   			        blk, blk);
52   			return -1;
53   		}
54   		if (old_state == new_state)
55   			return 0;
56   	
57   		if (error_on_dinode && old_state == GFS2_BLKST_DINODE &&
58   		    new_state != GFS2_BLKST_FREE) {
59   			log_debug(_("Reference as '%s' to block %"PRIu64" (0x%"PRIx64") which "
60   				    "was marked as dinode. Needs further investigation.\n"),
61   			          lgfs2_blkst_str(new_state), blk, blk);
62   			return 1;
63   		}
64   		/* Keep these messages as short as possible, or the output gets to be
65   		   huge and unmanageable. */
66   		log_err(_("Block %"PRIu64" (0x%"PRIx64") was '%s', should be %s.\n"),
67   		        blk, blk, lgfs2_blkst_str(old_state), lgfs2_blkst_str(new_state));
68   		if (!query(cx, _("Fix the bitmap? (y/n)"))) {
69   			log_err( _("The bitmap inconsistency was ignored.\n"));
70   			return 0;
71   		}
72   		/* If the new bitmap state is free (and therefore the old state was
73   		   not) we have to add to the free space in the rgrp. If the old
74   		   bitmap state was free (and therefore it no longer is) we have to
75   		   subtract to the free space.  If the type changed from dinode to 
76   		   data or data to dinode, no change in free space. */
77   		lgfs2_set_bitmap(rgd, blk, new_state);
78   		if (new_state == GFS2_BLKST_FREE) {
79   			rgd->rt_free++;
80   			rewrite_rgrp = 1;
81   		} else if (old_state == GFS2_BLKST_FREE) {
82   			rgd->rt_free--;
83   			rewrite_rgrp = 1;
84   		}
85   		/* If we're freeing a dinode, get rid of the data structs for it. */
86   		if (old_state == GFS2_BLKST_DINODE ||
87   		    old_state == GFS2_BLKST_UNLINKED) {
88   			struct dir_info *dt;
89   			struct inode_info *ii;
90   	
91   			dt = dirtree_find(cx, blk);
92   			if (dt) {
93   				dirtree_delete(cx, dt);
94   				treat_as_inode = 1;
95   			}
96   			ii = inodetree_find(cx, blk);
97   			if (ii) {
98   				inodetree_delete(cx, ii);
99   				treat_as_inode = 1;
100  			} else {
101  				treat_as_inode = 1;
102  			}
103  			if (old_state == GFS2_BLKST_DINODE) {
104  				if (treat_as_inode && rgd->rt_dinodes > 0)
105  					rgd->rt_dinodes--;
106  				rewrite_rgrp = 1;
107  			}
108  			link1_set(&nlink1map, blk, 0);
109  		} else if (new_state == GFS2_BLKST_DINODE) {
110  			rgd->rt_dinodes++;
111  			rewrite_rgrp = 1;
112  		}
113  		if (rewrite_rgrp) {
114  			lgfs2_rgrp_out(rgd, rgd->rt_bits[0].bi_data);
115  			rgd->rt_bits[0].bi_modified = 1;
116  		}
117  		log_err( _("The bitmap was fixed.\n"));
118  		return 0;
119  	}
120  	
121  	/*
122  	 * _fsck_bitmap_set - Mark a block in the bitmap, and adjust free space.
123  	 */
124  	int _fsck_bitmap_set(struct fsck_cx *cx, struct lgfs2_inode *ip, uint64_t bblock,
125  			     const char *btype, int mark,
126  			     int error_on_dinode, const char *caller, int fline)
127  	{
128  		int error;
129  		static int prev_ino_addr = 0;
130  		static int prev_mark = 0;
131  		static int prevcount = 0;
132  		static const char *prev_caller = NULL;
133  	
134  		if (print_level >= MSG_DEBUG) {
135  			if ((ip->i_num.in_addr == prev_ino_addr) &&
136  			    (mark == prev_mark) && caller == prev_caller) {
137  				log_info("(0x%"PRIx64") ", bblock);
138  				prevcount++;
139  				if (prevcount > 10) {
140  					log_info("\n");
141  					prevcount = 0;
142  				}
143  			/* I'm circumventing the log levels here on purpose to make the
144  			   output easier to debug. */
145  			} else if (ip->i_num.in_addr == bblock) {
146  				if (prevcount) {
147  					log_info("\n");
148  					prevcount = 0;
149  				}
150  				printf(_("(%s:%d) %s inode found at block (0x%"PRIx64"): marking as '%s'\n"),
151  				       caller, fline, btype, ip->i_num.in_addr, block_type_string(mark));
152  			} else {
153  				if (prevcount) {
154  					log_info("\n");
155  					prevcount = 0;
156  				}
157  				printf(_("(%s:%d) inode (0x%"PRIx64") references %s block"
158  				         " (0x%"PRIx64"): marking as '%s'\n"),
159  				      caller, fline, ip->i_num.in_addr, btype, bblock, block_type_string(mark));
160  			}
161  			prev_ino_addr = ip->i_num.in_addr;
162  			prev_mark = mark;
163  			prev_caller = caller;
164  		}
165  		error = check_n_fix_bitmap(cx, ip->i_rgd, bblock,
166  					   error_on_dinode, mark);
167  		if (error < 0)
168  			log_err(_("This block is not represented in the bitmap.\n"));
169  		return error;
170  	}
171  	
172  	struct duptree *dupfind(struct fsck_cx *cx, uint64_t block)
173  	{
174  		struct osi_node *node = cx->dup_blocks.osi_node;
175  	
(67) Event example_checked: Example 5: "cx->dup_blocks.osi_node" has its value checked in "node".
Also see events: [null_field][example_checked][example_checked][example_checked][example_checked][dereference]
176  		while (node) {
177  			struct duptree *dt = (struct duptree *)node;
178  	
179  			if (block < dt->block)
180  				node = node->osi_left;
181  			else if (block > dt->block)
182  				node = node->osi_right;
183  			else
184  				return dt;
185  		}
186  		return NULL;
187  	}
188  	
189  	struct lgfs2_inode *fsck_system_inode(struct lgfs2_sbd *sdp, uint64_t block)
190  	{
191  		if (lf_dip && lf_dip->i_num.in_addr == block)
192  			return lf_dip;
193  		return lgfs2_is_system_inode(sdp, block);
194  	}
195  	
/* fsck_load_inode - read the inode at @block, except that an inode known to
   fsck_system_inode() is returned as-is instead of being read again. */
struct lgfs2_inode *fsck_load_inode(struct lgfs2_sbd *sdp, uint64_t block)
{
	struct lgfs2_inode *sysip = fsck_system_inode(sdp, block);

	if (sysip == NULL)
		return lgfs2_inode_read(sdp, block);
	return sysip;
}
207  	
208  	/* fsck_inode_get - same as inode_get() in libgfs2 but system inodes
209  	   get special treatment. */
210  	struct lgfs2_inode *fsck_inode_get(struct lgfs2_sbd *sdp, struct lgfs2_rgrp_tree *rgd,
211  					  struct lgfs2_buffer_head *bh)
212  	{
213  		struct lgfs2_inode *sysip;
214  		struct lgfs2_inode *ip;
215  	
216  		sysip = fsck_system_inode(sdp, bh->b_blocknr);
217  		if (sysip)
218  			return sysip;
219  	
220  		ip = lgfs2_inode_get(sdp, bh);
221  		if (ip) {
222  			ip->i_rgd = rgd;
223  			ip->i_bh = bh;
224  		}
225  		return ip;
226  	}
227  	
228  	/* fsck_inode_put - same as lgfs2_inode_put() in libgfs2 but system inodes
229  	   get special treatment. */
230  	void fsck_inode_put(struct lgfs2_inode **ip_in)
231  	{
232  		struct lgfs2_inode *ip = *ip_in;
233  		struct lgfs2_inode *sysip;
234  	
235  		sysip = fsck_system_inode(ip->i_sbd, ip->i_num.in_addr);
236  		if (!sysip)
237  			lgfs2_inode_put(ip_in);
238  	}
239  	
240  	/**
241  	 * dirent_repair - attempt to repair a corrupt directory entry.
242  	 * @bh - The buffer header that contains the bad dirent
243  	 * @dh - The directory entry in native format
244  	 * @dent - The directory entry in on-disk format
245  	 * @type - Type of directory (DIR_LINEAR or DIR_EXHASH)
246  	 * @first - TRUE if this is the first dirent in the buffer
247  	 *
248  	 * This function tries to repair a corrupt directory entry.  All we
249  	 * know at this point is that the length field is wrong.
250  	 */
251  	static int dirent_repair(struct lgfs2_inode *ip, struct lgfs2_buffer_head *bh,
252  			  struct lgfs2_dirent *d, struct gfs2_dirent *dent,
253  			  int type, int first)
254  	{
255  		char *bh_end, *p;
256  		int calc_de_name_len = 0;
257  	
258  		/* If this is a sentinel, just fix the length and move on */
259  		if (first && !d->dr_inum.in_formal_ino) { /* Is it a sentinel? */
260  			if (type == DIR_LINEAR)
261  				d->dr_rec_len = ip->i_sbd->sd_bsize -
262  					sizeof(struct gfs2_dinode);
263  			else
264  				d->dr_rec_len = ip->i_sbd->sd_bsize -
265  					sizeof(struct gfs2_leaf);
266  		} else {
267  			bh_end = bh->b_data + ip->i_sbd->sd_bsize;
268  			/* first, figure out a probable name length */
269  			p = (char *)dent + sizeof(struct gfs2_dirent);
270  			while (*p &&         /* while there's a non-zero char and */
271  			       isprint(*p) && /* a printable character and */
272  			       p < bh_end) { /* not past end of buffer */
273  				calc_de_name_len++;
274  				p++;
275  			}
276  			if (!calc_de_name_len)
277  				return 1;
278  			/* There can often be noise at the end, so only          */
279  			/* Trust the shorter of the two in case we have too much */
280  			/* Or rather, only trust ours if it's shorter.           */
281  			if (!d->dr_name_len || d->dr_name_len > GFS2_FNAMESIZE ||
282  			    calc_de_name_len < d->dr_name_len) /* if dent is hosed */
283  				d->dr_name_len = calc_de_name_len; /* use ours */
284  			d->dr_rec_len = GFS2_DIRENT_SIZE(d->dr_name_len);
285  		}
286  		lgfs2_dirent_out(d, dent);
287  		lgfs2_bmodified(bh);
288  		return 0;
289  	}
290  	
291  	/**
292  	 * dirblk_truncate - truncate a directory block
293  	 */
294  	static void dirblk_truncate(struct lgfs2_inode *ip, struct gfs2_dirent *fixb,
295  				    struct lgfs2_buffer_head *bh)
296  	{
297  		char *bh_end;
298  		struct lgfs2_dirent d;
299  	
300  		bh_end = bh->b_data + ip->i_sbd->sd_bsize;
301  		/* truncate the block to save the most dentries.  To do this we
302  		   have to patch the previous dent. */
303  		lgfs2_dirent_in(&d, fixb);
304  		d.dr_rec_len = bh_end - (char *)fixb;
305  		lgfs2_dirent_out(&d, fixb);
306  		lgfs2_bmodified(bh);
307  	}
308  	
309  	/*
310  	 * check_entries - check directory entries for a given block
311  	 *
312  	 * @ip - dinode associated with this leaf block
313  	 * bh - buffer for the leaf block
314  	 * type - type of block this is (linear or exhash)
315  	 * @count - set to the count entries
316  	 * @lindex - the last inde
317  	 * @pass - structure pointing to pass-specific functions
318  	 *
319  	 * returns: 0 - good block or it was repaired to be good
320  	 *         -1 - error occurred
321  	 */
322  	static int check_entries(struct fsck_cx *cx, struct lgfs2_inode *ip, struct lgfs2_buffer_head *bh,
323  				 int type, uint32_t *count, int lindex,
324  				 struct metawalk_fxns *pass)
325  	{
326  		struct gfs2_dirent *dent, *prev;
327  		struct lgfs2_dirent d;
328  		int error = 0;
329  		char *bh_end;
330  		char *filename;
331  		int first = 1;
332  	
333  		bh_end = bh->b_data + ip->i_sbd->sd_bsize;
334  	
335  		if (type == DIR_LINEAR) {
336  			dent = (struct gfs2_dirent *)(bh->b_data + sizeof(struct gfs2_dinode));
337  		} else {
338  			dent = (struct gfs2_dirent *)(bh->b_data + sizeof(struct gfs2_leaf));
339  			log_debug(_("Checking leaf %"PRIu64" (0x%"PRIx64")\n"),
340  			          bh->b_blocknr, bh->b_blocknr);
341  		}
342  	
343  		prev = NULL;
344  		if (!pass->check_dentry)
345  			return 0;
346  	
347  		while (1) {
348  			if (skip_this_pass || fsck_abort)
349  				return FSCK_OK;
350  			lgfs2_dirent_in(&d, dent);
351  			filename = (char *)dent + sizeof(struct gfs2_dirent);
352  	
353  			if (d.dr_rec_len < sizeof(struct gfs2_dirent) +
354  			    d.dr_name_len ||
355  			    (d.dr_inum.in_formal_ino && !d.dr_name_len && !first)) {
356  				log_err(_("Directory block %"PRIu64" (0x%"PRIx64"), "
357  				          "entry %d of directory %"PRIu64" (0x%"PRIx64") "
358  				          "is corrupt.\n"),
359  					bh->b_blocknr, bh->b_blocknr, (*count) + 1,
360  					ip->i_num.in_addr, ip->i_num.in_addr);
361  				if (query(cx, _("Attempt to repair it? (y/n) "))) {
362  					if (dirent_repair(ip, bh, &d, dent, type,
363  							  first)) {
364  						if (first) /* make a new sentinel */
365  							dirblk_truncate(ip, dent, bh);
366  						else
367  							dirblk_truncate(ip, prev, bh);
368  						log_err( _("Unable to repair corrupt "
369  							   "directory entry; the "
370  							   "entry was removed "
371  							   "instead.\n"));
372  						return 0;
373  					} else {
374  						log_err( _("Corrupt directory entry "
375  							   "repaired.\n"));
376  						/* keep looping through dentries */
377  					}
378  				} else {
379  					log_err( _("Corrupt directory entry ignored, "
380  						"stopped after checking %d entries.\n"),
381  						*count);
382  					return 0;
383  				}
384  			}
385  			if (!d.dr_inum.in_formal_ino) {
386  				if (first) {
387  					log_debug( _("First dirent is a sentinel (place holder).\n"));
388  					first = 0;
389  				} else {
390  					log_err(_("Directory entry with inode number of "
391  						"zero in leaf %"PRIu64" (0x%"PRIx64") of "
392  						"directory %"PRIu64" (0x%"PRIx64")!\n"),
393  						bh->b_blocknr, bh->b_blocknr,
394  						ip->i_num.in_addr, ip->i_num.in_addr);
395  					if (query(cx, _("Attempt to remove it? (y/n) "))) {
396  						dirblk_truncate(ip, prev, bh);
397  						log_err(_("The corrupt directory "
398  							  "entry was removed.\n"));
399  					} else {
400  						log_err( _("Corrupt directory entry "
401  							   "ignored, stopped after "
402  							   "checking %d entries.\n"),
403  							 *count);
404  					}
405  					return 0;
406  				}
407  			} else {
408  				if (!d.dr_inum.in_addr && first) { /* reverse sentinel */
409  					log_debug( _("First dirent is a Sentinel (place holder).\n"));
410  					/* Swap the two to silently make it a proper sentinel */
411  					d.dr_inum.in_addr = d.dr_inum.in_formal_ino;
412  					d.dr_inum.in_formal_ino = 0;
413  					lgfs2_dirent_out(&d, dent);
414  					lgfs2_bmodified(bh);
415  					/* Mark dirent buffer as modified */
416  					first = 0;
417  				} else {
418  					error = pass->check_dentry(cx, ip, dent, prev, bh,
419  								   filename, count,
420  								   &lindex,
421  								   pass->private);
422  					if (error < 0) {
423  						stack;
424  						return error;
425  					}
426  				}
427  			}
428  	
429  			if ((char *)dent + d.dr_rec_len >= bh_end){
430  				log_debug(_("Last entry processed for %"PRIu64"->%"PRIu64
431  				            "(0x%"PRIx64"->0x%"PRIx64"), di_blocks=%"PRIu64".\n"),
432  				            ip->i_num.in_addr, bh->b_blocknr, ip->i_num.in_addr,
433  				            bh->b_blocknr, ip->i_blocks);
434  				break;
435  			}
436  	
437  			/* If we didn't clear the dentry, or if we did, but it
438  			 * was the first dentry, set prev  */
439  			if (!error || first)
440  				prev = dent;
441  			first = 0;
442  			dent = (struct gfs2_dirent *)((char *)dent + d.dr_rec_len);
443  		}
444  		return 0;
445  	}
446  	
447  	/**
448  	 * check_leaf - check a leaf block for errors
449  	 * Reads in the leaf block
450  	 * Leaves the buffer around for further analysis (caller must lgfs2_brelse)
451  	 */
452  	int check_leaf(struct fsck_cx *cx, struct lgfs2_inode *ip, int lindex, struct metawalk_fxns *pass,
453  		       uint64_t *leaf_no, struct lgfs2_leaf *leaf, int *ref_count)
454  	{
455  		int error = 0, fix;
456  		struct lgfs2_buffer_head *lbh = NULL;
457  		struct gfs2_leaf *lfp;
458  		uint32_t count = 0;
459  		struct lgfs2_sbd *sdp = ip->i_sbd;
460  		const char *msg;
461  		int di_depth = ip->i_depth;
462  	
463  		/* Make sure the block number is in range. */
464  		if (!valid_block_ip(ip, *leaf_no)) {
465  			log_err( _("Leaf block #%"PRIu64" (0x%"PRIx64") is out of range for "
466  				   "directory #%"PRIu64" (0x%"PRIx64") at index %d (0x%x).\n"),
467  				 *leaf_no, *leaf_no, ip->i_num.in_addr, ip->i_num.in_addr,
468  				 lindex, lindex);
469  			msg = _("that is out of range");
470  			goto bad_leaf;
471  		}
472  	
473  		/* Try to read in the leaf block. */
474  		lbh = lgfs2_bread(sdp, *leaf_no);
475  		/* Make sure it's really a valid leaf block. */
476  		if (lgfs2_check_meta(lbh->b_data, GFS2_METATYPE_LF)) {
477  			msg = _("that is not really a leaf");
478  			goto bad_leaf;
479  		}
480  		if (pass->check_leaf_depth)
481  			error = pass->check_leaf_depth(cx, ip, *leaf_no, *ref_count, lbh);
482  	
483  		if (error >= 0 && pass->check_leaf) {
484  			error = pass->check_leaf(cx, ip, *leaf_no, pass->private);
485  			if (error == -EEXIST) {
486  				log_info(_("Previous reference to leaf %"PRIu64" (0x%"PRIx64") "
487  					   "has already checked it; skipping.\n"),
488  				         *leaf_no, *leaf_no);
489  				lgfs2_brelse(lbh);
490  				return error;
491  			}
492  		}
493  		/* Early versions of GFS2 had an endianess bug in the kernel that set
494  		   lf_dirent_format to cpu_to_be16(GFS2_FORMAT_DE).  This was fixed
495  		   to use cpu_to_be32(), but we should check for incorrect values and
496  		   replace them with the correct value. */
497  	
498  		lgfs2_leaf_in(leaf, lbh->b_data);
499  		if (leaf->lf_dirent_format == (GFS2_FORMAT_DE << 16)) {
500  			log_debug( _("incorrect lf_dirent_format at leaf #%" PRIu64
501  				     "\n"), *leaf_no);
502  			leaf->lf_dirent_format = GFS2_FORMAT_DE;
503  			lgfs2_leaf_out(leaf, lbh->b_data);
504  			lgfs2_bmodified(lbh);
505  			log_debug( _("Fixing lf_dirent_format.\n"));
506  		}
507  	
508  		lfp = (struct gfs2_leaf *)lbh->b_data;
509  		/* Make sure it's really a leaf. */
510  		if (be32_to_cpu(lfp->lf_header.mh_type) != GFS2_METATYPE_LF) {
511  			log_err(_("Inode %"PRIu64" (0x%"PRIx64") points to bad leaf %"PRIu64
512  				  " (0x%"PRIx64").\n"),
513  			        ip->i_num.in_addr, ip->i_num.in_addr, *leaf_no, *leaf_no);
514  			msg = _("that is not a leaf");
515  			goto bad_leaf;
516  		}
517  	
518  		if (pass->check_dentry && is_dir(ip)) {
519  			error = check_entries(cx, ip, lbh, DIR_EXHASH, &count, lindex,
520  					      pass);
521  	
522  			if (skip_this_pass || fsck_abort)
523  				goto out;
524  	
525  			if (error < 0) {
526  				stack;
527  				goto out; /* This seems wrong: needs investigation */
528  			}
529  	
530  			if (count == leaf->lf_entries)
531  				goto out;
532  	
533  			/* release and re-read the leaf in case check_entries
534  			   changed it. */
535  			lgfs2_brelse(lbh);
536  			lbh = lgfs2_bread(sdp, *leaf_no);
537  			lgfs2_leaf_in(leaf, lbh->b_data);
538  			if (count != leaf->lf_entries) {
539  				log_err(_("Leaf %"PRIu64" (0x%"PRIx64") entry count in "
540  					   "directory %"PRIu64" (0x%"PRIx64") does not match "
541  					   "number of entries found - is %u, found %u\n"),
542  				        *leaf_no, *leaf_no, ip->i_num.in_addr, ip->i_num.in_addr,
543  				        leaf->lf_entries, count);
544  				if (query(cx, _("Update leaf entry count? (y/n) "))) {
545  					leaf->lf_entries = count;
546  					lgfs2_leaf_out(leaf, lbh->b_data);
547  					lgfs2_bmodified(lbh);
548  					log_warn( _("Leaf entry count updated\n"));
549  				} else
550  					log_err( _("Leaf entry count left in "
551  						   "inconsistent state\n"));
552  			}
553  		}
554  	out:
555  		if (di_depth < ip->i_depth) {
556  			log_debug(_("Depth of directory %"PRIu64" (0x%"PRIx64") changed from "
557  				    "%d to %d; adjusting ref_count from %d to %d\n"),
558  			          ip->i_num.in_addr, ip->i_num.in_addr, di_depth, ip->i_depth,
559  				  *ref_count, (*ref_count) << (ip->i_depth - di_depth));
560  			(*ref_count) <<= (ip->i_depth - di_depth);
561  		}
562  		lgfs2_brelse(lbh);
563  		if (error < 0)
564  			return error;
565  		return 0;
566  	
567  	bad_leaf:
568  		if (lbh)
569  			lgfs2_brelse(lbh);
570  		if (pass->repair_leaf) {
571  			/* The leaf we read in is bad so we need to repair it. */
572  			fix = pass->repair_leaf(cx, ip, leaf_no, lindex, *ref_count, msg);
573  			if (fix < 0)
574  				return fix;
575  	
576  		}
577  		if (di_depth < ip->i_depth) {
578  			log_debug(_("Depth of directory %"PRIu64" (0x%"PRIx64") changed from "
579  				    "%d to %d. Adjusting ref_count from %d to %d\n"),
580  			          ip->i_num.in_addr, ip->i_num.in_addr, di_depth, ip->i_depth,
581  				  *ref_count, (*ref_count) << (ip->i_depth - di_depth));
582  			(*ref_count) <<= (ip->i_depth - di_depth);
583  		}
584  		return 1;
585  	}
586  	
/* u64cmp - qsort() comparator for uint64_t values, ascending order.
   Returns 1, -1 or 0 when the first value is greater than, less than or
   equal to the second. */
static int u64cmp(const void *p1, const void *p2)
{
	const uint64_t lhs = *(const uint64_t *)p1;
	const uint64_t rhs = *(const uint64_t *)p2;

	/* Each comparison yields 0 or 1, so the difference is -1, 0 or 1. */
	return (lhs > rhs) - (lhs < rhs);
}
599  	
600  	static void dir_leaf_reada(struct lgfs2_inode *ip, __be64 *tbl, unsigned hsize)
601  	{
602  		uint64_t *t = alloca(hsize * sizeof(uint64_t));
603  		uint64_t leaf_no;
604  		struct lgfs2_sbd *sdp = ip->i_sbd;
605  		unsigned n = 0;
606  		unsigned i;
607  	
608  		for (i = 0; i < hsize; i++) {
609  			leaf_no = be64_to_cpu(tbl[i]);
610  			if (valid_block_ip(ip, leaf_no))
611  				t[n++] = leaf_no * sdp->sd_bsize;
612  		}
613  		qsort(t, n, sizeof(uint64_t), u64cmp);
614  		for (i = 0; i < n; i++)
615  			(void)posix_fadvise(sdp->device_fd, t[i], sdp->sd_bsize, POSIX_FADV_WILLNEED);
616  	}
617  	
618  	/* Checks exhash directory entries */
619  	int check_leaf_blks(struct fsck_cx *cx, struct lgfs2_inode *ip, struct metawalk_fxns *pass)
620  	{
621  		int error = 0;
622  		unsigned hsize = (1 << ip->i_depth);
623  		uint64_t leaf_no, leaf_next;
624  		uint64_t first_ok_leaf, orig_di_blocks;
625  		struct lgfs2_buffer_head *lbh;
626  		int lindex;
627  		struct lgfs2_sbd *sdp = ip->i_sbd;
628  		int ref_count, orig_ref_count, orig_di_depth, orig_di_height;
629  		__be64 *tbl;
630  		int chained_leaf, tbl_valid;
631  	
632  		tbl = get_dir_hash(ip);
633  		if (tbl == NULL) {
634  			perror("get_dir_hash");
635  			return -1;
636  		}
637  		tbl_valid = 1;
638  		orig_di_depth = ip->i_depth;
639  		orig_di_height = ip->i_height;
640  		orig_di_blocks = ip->i_blocks;
641  	
642  		/* Turn off system readahead */
643  		(void)posix_fadvise(sdp->device_fd, 0, 0, POSIX_FADV_RANDOM);
644  	
645  		/* Readahead */
646  		dir_leaf_reada(ip, tbl, hsize);
647  	
648  		if (pass->check_hash_tbl) {
649  			error = pass->check_hash_tbl(cx, ip, tbl, hsize, pass->private);
650  			if (error < 0) {
651  				free(tbl);
652  				(void)posix_fadvise(sdp->device_fd, 0, 0, POSIX_FADV_NORMAL);
653  				return error;
654  			}
655  			/* If hash table changes were made, read it in again. */
656  			if (error) {
657  				free(tbl);
658  				tbl = get_dir_hash(ip);
659  				if (tbl == NULL) {
660  					perror("get_dir_hash");
661  					return -1;
662  				}
663  			}
664  		}
665  	
666  		/* Find the first valid leaf pointer in range and use it as our "old"
667  		   leaf. That way, bad blocks at the beginning will be overwritten
668  		   with the first valid leaf. */
669  		first_ok_leaf = leaf_no = -1;
670  		for (lindex = 0; lindex < hsize; lindex++) {
671  			leaf_no = be64_to_cpu(tbl[lindex]);
672  			if (valid_block_ip(ip, leaf_no)) {
673  				lbh = lgfs2_bread(sdp, leaf_no);
674  				/* Make sure it's really a valid leaf block. */
675  				if (lgfs2_check_meta(lbh->b_data, GFS2_METATYPE_LF) == 0) {
676  					lgfs2_brelse(lbh);
677  					first_ok_leaf = leaf_no;
678  					break;
679  				}
680  				lgfs2_brelse(lbh);
681  			}
682  		}
683  		if (first_ok_leaf == -1) { /* no valid leaf found */
684  			log_err(_("Directory #%"PRIu64" (0x%"PRIx64") has no valid leaf blocks\n"),
685  			        ip->i_num.in_addr, ip->i_num.in_addr);
686  			free(tbl);
687  			(void)posix_fadvise(sdp->device_fd, 0, 0, POSIX_FADV_NORMAL);
688  			return 1;
689  		}
690  		lindex = 0;
691  		leaf_next = -1;
692  		while (lindex < hsize) {
693  			int l;
694  	
695  			if (fsck_abort)
696  				break;
697  	
698  			if (!tbl_valid) {
699  				free(tbl);
700  				log_debug(_("Re-reading 0x%"PRIx64" hash table.\n"), ip->i_num.in_addr);
701  				tbl = get_dir_hash(ip);
702  				if (tbl == NULL) {
703  					perror("get_dir_hash");
704  					return -1;
705  				}
706  				tbl_valid = 1;
707  				orig_di_depth = ip->i_depth;
708  				orig_di_height = ip->i_height;
709  				orig_di_blocks = ip->i_blocks;
710  			}
711  			leaf_no = be64_to_cpu(tbl[lindex]);
712  	
713  			/* count the number of block pointers to this leaf. We don't
714  			   need to count the current lindex, because we already know
715  			   it's a reference */
716  			ref_count = 1;
717  	
718  			for (l = lindex + 1; l < hsize; l++) {
719  				leaf_next = be64_to_cpu(tbl[l]);
720  				if (leaf_next != leaf_no)
721  					break;
722  				ref_count++;
723  			}
724  			orig_ref_count = ref_count;
725  	
726  			chained_leaf = 0;
727  			do {
728  				struct lgfs2_leaf leaf;
729  				if (fsck_abort) {
730  					free(tbl);
731  					(void)posix_fadvise(sdp->device_fd, 0, 0, POSIX_FADV_NORMAL);
732  					return 0;
733  				}
734  				error = check_leaf(cx, ip, lindex, pass, &leaf_no, &leaf,
735  						   &ref_count);
736  				if (ref_count != orig_ref_count) {
737  					log_debug(_("Ref count of leaf 0x%"PRIx64
738  					            " changed from %d to %d.\n"),
739  					          leaf_no, orig_ref_count, ref_count);
740  					tbl_valid = 0;
741  				}
742  				if (error < 0) {
743  					free(tbl);
744  					return error;
745  				}
746  				if (!leaf.lf_next || error)
747  					break;
748  				leaf_no = leaf.lf_next;
749  				chained_leaf++;
750  				log_debug(_("Leaf chain #%d (0x%"PRIx64") detected.\n"),
751  				          chained_leaf, leaf_no);
752  			} while (1); /* while we have chained leaf blocks */
753  			if (orig_di_depth != ip->i_depth) {
754  				log_debug(_("Depth of 0x%"PRIx64" changed from %d to %d\n"),
755  				          ip->i_num.in_addr, orig_di_depth, ip->i_depth);
756  				tbl_valid = 0;
757  				lindex <<= (ip->i_depth - orig_di_depth);
758  				hsize = (1 << ip->i_depth);
759  			}
760  			if (orig_di_height != ip->i_height) {
761  				log_debug(_("Height of 0x%"PRIx64" changed from %d to %d\n"),
762  				          ip->i_num.in_addr, orig_di_height, ip->i_height);
763  				tbl_valid = 0;
764  			}
765  			if (orig_di_blocks != ip->i_blocks) {
766  				log_debug(_("Block count of 0x%"PRIx64" changed from %"PRIu64" to %"PRIu64"\n"),
767  				          ip->i_num.in_addr, orig_di_blocks, ip->i_blocks);
768  				tbl_valid = 0;
769  			}
770  			lindex += ref_count;
771  		} /* for every leaf block */
772  		free(tbl);
773  		(void)posix_fadvise(sdp->device_fd, 0, 0, POSIX_FADV_NORMAL);
774  		return 0;
775  	}
776  	
777  	static int check_eattr_entries(struct fsck_cx *cx, struct lgfs2_inode *ip,
778  				       struct lgfs2_buffer_head *bh,
779  				       struct metawalk_fxns *pass)
780  	{
781  		struct gfs2_ea_header *ea_hdr, *ea_hdr_prev = NULL;
782  		__be64 *ea_data_ptr = NULL;
783  		int i;
784  		int error = 0, err;
785  		uint32_t offset = (uint32_t)sizeof(struct gfs2_meta_header);
786  		uint32_t offset_limit = ip->i_sbd->sd_bsize - sizeof(struct gfs2_ea_header);
787  	
788  		if (!pass->check_eattr_entry)
789  			return 0;
790  	
791  		ea_hdr = (struct gfs2_ea_header *)(bh->b_data +
792  						  sizeof(struct gfs2_meta_header));
793  	
794  		while (1){
795  			if (ea_hdr->ea_type == GFS2_EATYPE_UNUSED)
796  				error = 0;
797  			else
798  				error = pass->check_eattr_entry(cx, ip, bh, ea_hdr,
799  								ea_hdr_prev,
800  								pass->private);
801  			if (error < 0) {
802  				stack;
803  				return -1;
804  			}
805  			if (error == 0 && pass->check_eattr_extentry &&
806  			   ea_hdr->ea_num_ptrs) {
807  				uint32_t tot_ealen = 0;
808  				struct lgfs2_sbd *sdp = ip->i_sbd;
809  	
810  				ea_data_ptr = ((__be64 *)((char *)ea_hdr +
811  							    sizeof(struct gfs2_ea_header) +
812  							    ((ea_hdr->ea_name_len + 7) & ~7)));
813  	
814  				/* It is possible when a EA is shrunk
815  				** to have ea_num_ptrs be greater than
816  				** the number required for ** data.
817  				** In this case, the EA ** code leaves
818  				** the blocks ** there for **
819  				** reuse...........  */
820  	
821  				for(i = 0; i < ea_hdr->ea_num_ptrs; i++){
822  					err = pass->check_eattr_extentry(cx, ip, i,
823  							ea_data_ptr, bh, tot_ealen,
824  							ea_hdr, ea_hdr_prev,
825  							pass->private);
826  					if (err)
827  						error = err;
828  					tot_ealen += sdp->sd_bsize -
829  						sizeof(struct gfs2_meta_header);
830  					ea_data_ptr++;
831  				}
832  			}
833  			offset += be32_to_cpu(ea_hdr->ea_rec_len);
834  			if (ea_hdr->ea_flags & GFS2_EAFLAG_LAST ||
835  			    offset > offset_limit || ea_hdr->ea_rec_len == 0) {
836  				break;
837  			}
838  			ea_hdr_prev = ea_hdr;
839  			ea_hdr = (struct gfs2_ea_header *)
840  				((char *)(ea_hdr) +
841  				 be32_to_cpu(ea_hdr->ea_rec_len));
842  		}
843  	
844  		return error;
845  	}
846  	
847  	/**
848  	 * check_leaf_eattr
849  	 * @ip: the inode the eattr comes from
850  	 * @block: block number of the leaf
851  	 *
852  	 * Returns: 0 on success, 1 if removal is needed, -1 on error
853  	 */
854  	static int check_leaf_eattr(struct fsck_cx *cx, struct lgfs2_inode *ip, uint64_t block,
855  				    uint64_t parent, struct metawalk_fxns *pass)
856  	{
857  		struct lgfs2_buffer_head *bh = NULL;
858  	
859  		if (pass->check_eattr_leaf) {
860  			int error = 0;
861  	
862  			log_debug(_("Checking EA leaf block #%"PRIu64" (0x%"PRIx64") for "
863  				     "inode #%"PRIu64" (0x%"PRIx64").\n"),
864  			          block, block, ip->i_num.in_addr, ip->i_num.in_addr);
865  	
866  			error = pass->check_eattr_leaf(cx, ip, block, parent, &bh,
867  						       pass->private);
868  			if (error < 0) {
869  				stack;
870  				return -1;
871  			}
872  			if (error > 0) {
873  				if (bh)
874  					lgfs2_brelse(bh);
875  				return 1;
876  			}
877  			if (bh) {
878  				error = check_eattr_entries(cx, ip, bh, pass);
879  				lgfs2_brelse(bh);
880  			}
881  			return error;
882  		}
883  	
884  		return 0;
885  	}
886  	
887  	/**
888  	 * check_indirect_eattr
889  	 * @ip: the inode the eattr comes from
890  	 * @indirect_block
891  	 *
892  	 * Returns: 0 on success -1 on error
893  	 */
894  	static int check_indirect_eattr(struct fsck_cx *cx, struct lgfs2_inode *ip, uint64_t indirect,
895  					struct lgfs2_buffer_head *indirect_buf,
896  					struct metawalk_fxns *pass)
897  	{
898  		int error = 0, err;
899  		__be64 *ea_leaf_ptr, *end;
900  		uint64_t block;
901  		struct lgfs2_sbd *sdp = ip->i_sbd;
902  		int first_ea_is_bad = 0;
903  		uint64_t di_eattr_save = ip->i_eattr;
904  		uint64_t offset = sizeof(struct gfs2_meta_header);
905  		int leaf_pointers = 0, leaf_pointer_errors = 0;
906  	
907  		ea_leaf_ptr = (__be64 *)(indirect_buf->b_data + offset);
908  		end = ea_leaf_ptr + ((sdp->sd_bsize - offset) / 8);
909  	
910  		while (*ea_leaf_ptr && (ea_leaf_ptr < end)){
911  			block = be64_to_cpu(*ea_leaf_ptr);
912  			leaf_pointers++;
913  			err = check_leaf_eattr(cx, ip, block, indirect, pass);
914  			if (err) {
915  				error = err;
916  				log_err(_("Error detected in leaf block %"PRIu64" (0x%"PRIx64") "
917  					  "referenced by indirect block %"PRIu64" (0x%"PRIx64").\n"),
918  				        block, block, indirect, indirect);
919  				log_err(_("Subsequent leaf block pointers should be "
920  					  "cleared.\n"));
921  			}
922  			if (error) { /* leaf blocks following an error must also be
923  					treated as error blocks and cleared. */
924  				leaf_pointer_errors++;
925  				log_err(_("Pointer to EA leaf block %"PRIu64" (0x%"PRIx64") in "
926  					  "indirect block %"PRIu64" (0x%"PRIx64") should be cleared.\n"),
927  				        block, block, indirect, indirect);
928  			}
929  			/* If the first eattr lead is bad, we can't have a hole, so we
930  			   have to treat this as an unrecoverable eattr error and
931  			   delete all eattr info. Calling finish_eattr_indir here
932  			   causes ip->i_di.di_eattr = 0 and that ensures that
933  			   subsequent calls to check_leaf_eattr result in the eattr
934  			   check_leaf_block nuking them all "due to previous errors" */
935  			if (leaf_pointers == 1 && leaf_pointer_errors == 1) {
936  				first_ea_is_bad = 1;
937  				if (pass->finish_eattr_indir)
938  					pass->finish_eattr_indir(cx, ip, leaf_pointers,
939  								 leaf_pointer_errors,
940  								 pass->private);
941  			} else if (leaf_pointer_errors) {
942  				/* This is a bit tricky.  We can't have eattr holes.
943  				   So if we have 4 good eattrs, 1 bad eattr and 5 more
944  				   good ones: GGGGBGGGGG, we need to tell
945  				   check_leaf_eattr to delete all eattrs after the bad
946  				   one. So we want: GGGG when we finish. To do that,
947  				   we set di_eattr to 0 temporarily. */
948  				ip->i_eattr = 0;
949  				lgfs2_bmodified(ip->i_bh);
950  			}
951  			ea_leaf_ptr++;
952  		}
953  		/* If we temporarily nuked the ea block to prevent checking past
954  		   a corrupt ea leaf, we need to restore the saved di_eattr block. */
955  		if (di_eattr_save != 0)
956  			ip->i_eattr = di_eattr_save;
957  		if (pass->finish_eattr_indir) {
958  			if (!first_ea_is_bad) {
959  				pass->finish_eattr_indir(cx, ip, leaf_pointers,
960  							 leaf_pointer_errors,
961  							 pass->private);
962  			}
963  			if (pass->delete_block && leaf_pointer_errors &&
964  			    leaf_pointer_errors == leaf_pointers) {
965  				pass->delete_block(cx, ip, indirect, NULL, "leaf", NULL);
966  				error = 1;
967  			}
968  		}
969  	
970  		return error;
971  	}
972  	
973  	/**
974  	 * check_inode_eattr - check the EA's for a single inode
975  	 * @ip: the inode whose EA to check
976  	 *
977  	 * Returns: 0 on success, -1 on error
978  	 */
979  	int check_inode_eattr(struct fsck_cx *cx, struct lgfs2_inode *ip, struct metawalk_fxns *pass)
980  	{
981  		int error = 0;
982  		struct lgfs2_buffer_head *indirect_buf = NULL;
983  	
984  		if (!ip->i_eattr)
985  			return 0;
986  	
987  		if (ip->i_flags & GFS2_DIF_EA_INDIRECT){
988  			if (!pass->check_eattr_indir)
989  				return 0;
990  	
991  			log_debug(_("Checking EA indirect block #%"PRIu64" (0x%"PRIx64") for "
992  				     "inode #%"PRIu64" (0x%"PRIx64")..\n"),
993  			          ip->i_eattr, ip->i_eattr, ip->i_num.in_addr, ip->i_num.in_addr);
994  			error = pass->check_eattr_indir(cx, ip, ip->i_eattr, ip->i_num.in_addr,
995  							&indirect_buf, pass->private);
996  			if (!error) {
997  				error = check_indirect_eattr(cx, ip, ip->i_eattr,
998  							     indirect_buf, pass);
999  				if (error)
1000 					stack;
1001 			}
1002 			if (indirect_buf)
1003 				lgfs2_brelse(indirect_buf);
1004 			return error;
1005 		}
1006 		error = check_leaf_eattr(cx, ip, ip->i_eattr, ip->i_num.in_addr, pass);
1007 		if (error)
1008 			stack;
1009 	
1010 		return error;
1011 	}
1012 	
1013 	/**
1014 	 * free_metalist - free all metadata on a multi-level metadata list
1015 	 */
1016 	static void free_metalist(struct lgfs2_inode *ip, osi_list_t *mlp)
1017 	{
1018 		unsigned int height = ip->i_height;
1019 		unsigned int i;
1020 		struct lgfs2_buffer_head *nbh;
1021 	
1022 		for (i = 0; i <= height; i++) {
1023 			osi_list_t *list;
1024 	
1025 			list = &mlp[i];
1026 			while (!osi_list_empty(list)) {
1027 				nbh = osi_list_entry(list->next,
1028 						     struct lgfs2_buffer_head, b_altlist);
1029 				if (nbh == ip->i_bh)
1030 					osi_list_del_init(&nbh->b_altlist);
1031 				else
1032 					lgfs2_brelse(nbh);
1033 			}
1034 		}
1035 	}
1036 	
1037 	static void file_ra(struct lgfs2_inode *ip, struct lgfs2_buffer_head *bh,
1038 			    int head_size, int maxptrs, int h)
1039 	{
1040 		struct lgfs2_sbd *sdp = ip->i_sbd;
1041 		uint64_t sblock = 0, block;
1042 		int extlen = 0;
1043 		__be64 *p;
1044 	
1045 		if (h + 2 == ip->i_height) {
1046 			p = (__be64 *)(bh->b_data + head_size);
1047 			if (*p && *(p + 1)) {
1048 				sblock = be64_to_cpu(*p);
1049 				p++;
1050 				block = be64_to_cpu(*p);
1051 				extlen = block - sblock;
1052 				if (extlen > 1 && extlen <= maxptrs) {
1053 					(void)posix_fadvise(sdp->device_fd,
1054 						      sblock * sdp->sd_bsize,
1055 						      (extlen + 1) * sdp->sd_bsize,
1056 						      POSIX_FADV_WILLNEED);
1057 					return;
1058 				}
1059 			}
1060 			extlen = 0;
1061 		}
1062 		for (p = (__be64 *)(bh->b_data + head_size);
1063 		     p < (__be64 *)(bh->b_data + sdp->sd_bsize); p++) {
1064 			if (*p) {
1065 				if (!sblock) {
1066 					sblock = be64_to_cpu(*p);
1067 					extlen = 1;
1068 					continue;
1069 				}
1070 				block = be64_to_cpu(*p);
1071 				if (block == sblock + extlen) {
1072 					extlen++;
1073 					continue;
1074 				}
1075 			}
1076 			if (extlen && sblock) {
1077 				if (extlen > 1)
1078 					extlen--;
1079 				(void)posix_fadvise(sdp->device_fd, sblock * sdp->sd_bsize,
1080 					      extlen * sdp->sd_bsize,
1081 					      POSIX_FADV_WILLNEED);
1082 				extlen = 0;
1083 				p--;
1084 			}
1085 		}
1086 		if (extlen)
1087 			(void)posix_fadvise(sdp->device_fd, sblock * sdp->sd_bsize,
1088 				      extlen * sdp->sd_bsize, POSIX_FADV_WILLNEED);
1089 	}
1090 	
1091 	static int do_check_metalist(struct fsck_cx *cx, struct iptr iptr, int height, struct lgfs2_buffer_head **bhp,
1092 	                             struct metawalk_fxns *pass)
1093 	{
1094 		struct lgfs2_inode *ip = iptr.ipt_ip;
1095 		uint64_t block = iptr_block(iptr);
1096 		int was_duplicate = 0;
1097 		int is_valid = 1;
1098 		int error;
1099 	
1100 		if (pass->check_metalist == NULL)
1101 			return 0;
1102 	
1103 		error = pass->check_metalist(cx, iptr, bhp, height, &is_valid,
1104 					     &was_duplicate, pass->private);
1105 		if (error == META_ERROR) {
1106 			stack;
1107 			log_info("\n");
1108 			log_info(_("Serious metadata error on block %"PRIu64" (0x%"PRIx64").\n"),
1109 			         block, block);
1110 			return error;
1111 		}
1112 		if (error == META_SKIP_FURTHER) {
1113 			log_info("\n");
1114 			log_info(_("Unrecoverable metadata error on block %"PRIu64" (0x%"PRIx64")\n"),
1115 			         block, block);
1116 			log_info(_("Further metadata will be skipped.\n"));
1117 			return error;
1118 		}
1119 		if (!is_valid) {
1120 			log_debug("Skipping rejected block %"PRIu64" (0x%"PRIx64")\n", block, block);
1121 			if (pass->invalid_meta_is_fatal)
1122 				return META_ERROR;
1123 			return META_SKIP_ONE;
1124 		}
1125 		if (was_duplicate) {
1126 			log_debug("Skipping duplicate %"PRIu64" (0x%"PRIx64")\n", block, block);
1127 			return META_SKIP_ONE;
1128 		}
1129 		if (!valid_block_ip(ip, block)) {
1130 			log_debug("Skipping invalid block %"PRIu64" (0x%"PRIx64")\n", block, block);
1131 			if (pass->invalid_meta_is_fatal)
1132 				return META_ERROR;
1133 			return META_SKIP_ONE;
1134 		}
1135 		return error;
1136 	}
1137 	
1138 	/**
1139 	 * build_and_check_metalist - check a bunch of indirect blocks
1140 	 *                            This includes hash table blocks for directories
1141 	 *                            which are technically "data" in the bitmap.
1142 	 *
1143 	 * Returns: 0 - all is well, process the blocks this metadata references
1144 	 *          1 - something went wrong, but process the sub-blocks anyway
1145 	 *         -1 - something went wrong, so don't process the sub-blocks
1146 	 * @ip:
1147 	 * @mlp:
1148 	 */
1149 	static int build_and_check_metalist(struct fsck_cx *cx, struct lgfs2_inode *ip, osi_list_t *mlp,
1150 					    struct metawalk_fxns *pass)
1151 	{
1152 		uint32_t height = ip->i_height;
1153 		struct lgfs2_buffer_head *metabh = ip->i_bh;
1154 		osi_list_t *prev_list, *cur_list, *tmp;
1155 		struct iptr iptr = { .ipt_ip = ip, NULL, 0};
1156 		int h, head_size, iblk_type;
1157 		__be64 *undoptr;
1158 		int maxptrs;
1159 		int error;
1160 	
1161 		osi_list_add(&metabh->b_altlist, &mlp[0]);
1162 	
1163 		/* Directories are special.  Their 'data' is the hash table, which is
1164 		   basically an indirect block list. Their height is not important
1165 		   because it checks everything through the hash table using
1166 		   "depth" field calculations. However, we still have to check the
1167 		   indirect blocks, even if the height == 1.  */
1168 		if (is_dir(ip))
1169 			height++;
1170 	
1171 		/* if (<there are no indirect blocks to check>) */
1172 		if (height < 2)
1173 			return META_IS_GOOD;
1174 		for (h = 1; h < height; h++) {
1175 			if (h > 1) {
1176 				if (is_dir(ip) &&
1177 				    h == ip->i_height + 1)
1178 					iblk_type = GFS2_METATYPE_JD;
1179 				else
1180 					iblk_type = GFS2_METATYPE_IN;
1181 				head_size = sizeof(struct gfs2_meta_header);
1182 				maxptrs = ip->i_sbd->sd_inptrs;
1183 			} else {
1184 				iblk_type = GFS2_METATYPE_DI;
1185 				head_size = sizeof(struct gfs2_dinode);
1186 				maxptrs = ip->i_sbd->sd_diptrs;
1187 			}
1188 			prev_list = &mlp[h - 1];
1189 			cur_list = &mlp[h];
1190 	
1191 			for (tmp = prev_list->next; tmp != prev_list; tmp = tmp->next) {
1192 				iptr.ipt_off = head_size;
1193 				iptr.ipt_bh = osi_list_entry(tmp, struct lgfs2_buffer_head, b_altlist);
1194 	
1195 				if (lgfs2_check_meta(iptr_buf(iptr), iblk_type)) {
1196 					if (pass->invalid_meta_is_fatal)
1197 						return META_ERROR;
1198 	
1199 					continue;
1200 				}
1201 				if (pass->readahead)
1202 					file_ra(ip, iptr.ipt_bh, head_size, maxptrs, h);
1203 	
1204 				/* Now check the metadata itself */
1205 				for (; iptr.ipt_off < ip->i_sbd->sd_bsize; iptr.ipt_off += sizeof(uint64_t)) {
1206 					struct lgfs2_buffer_head *nbh = NULL;
1207 	
1208 					if (skip_this_pass || fsck_abort)
1209 						return META_IS_GOOD;
1210 					if (!iptr_block(iptr))
1211 						continue;
1212 	
1213 					error = do_check_metalist(cx, iptr, h, &nbh, pass);
1214 					if (error == META_ERROR || error == META_SKIP_FURTHER)
1215 						goto error_undo;
1216 					if (error == META_SKIP_ONE)
1217 						continue;
1218 					if (!nbh)
1219 						nbh = lgfs2_bread(ip->i_sbd, iptr_block(iptr));
1220 					osi_list_add_prev(&nbh->b_altlist, cur_list);
1221 				} /* for all data on the indirect block */
1222 			} /* for blocks at that height */
1223 		} /* for height */
1224 		return 0;
1225 	
1226 	error_undo: /* undo what we've done so far for this block */
1227 		if (pass->undo_check_meta == NULL)
1228 			return error;
1229 	
1230 		log_info(_("Undoing the work we did before the error on block %"PRIu64" (0x%"PRIx64").\n"),
1231 		         iptr.ipt_bh->b_blocknr, iptr.ipt_bh->b_blocknr);
1232 		for (undoptr = (__be64 *)(iptr_buf(iptr) + head_size);
1233 		     undoptr < iptr_ptr(iptr) && undoptr < iptr_endptr(iptr);
1234 		     undoptr++) {
1235 			uint64_t block = be64_to_cpu(*undoptr);
1236 	
1237 			if (block == 0)
1238 				continue;
1239 	
1240 			pass->undo_check_meta(cx, ip, block, h, pass->private);
1241 		}
1242 		return error;
1243 	}
1244 	
1245 	static unsigned int hdr_size(struct lgfs2_buffer_head *bh, unsigned int height)
1246 	{
1247 		if (height > 1)
1248 			return sizeof(struct gfs2_meta_header);
1249 		return sizeof(struct gfs2_dinode);
1250 	}
1251 	
/*
 * Location of a data block error: the metadata block that referenced the
 * block, the offset of the reference inside it, and the block itself.
 */
struct error_block {
	/* metadata block where error was found */
	uint64_t metablk;
	/* offset in that metadata block where error found */
	int metaoff;
	/* error block */
	uint64_t errblk;
};
1257 	
1258 	static void report_data_error(uint64_t metablock, int offset, uint64_t block,
1259 				      struct error_block *error_blk,
1260 				      int rc, int error)
1261 	{
1262 		log_info("\n");
1263 		if (rc < 0) {
1264 			/* A fatal error trumps a non-fatal one. */
1265 			if ((error_blk->errblk == 0) ||
1266 			    (rc < error)) {
1267 				log_debug(_("Fatal error on metadata "
1268 					    "block 0x%"PRIx64", "
1269 					    "offset 0x%x, referencing data "
1270 					    "block 0x%"PRIx64" "
1271 					    "preempts non-fatal error on "
1272 					    "block 0x%"PRIx64"\n"),
1273 					  metablock,
1274 					  offset,
1275 					  block,
1276 					  error_blk->errblk);
1277 				error_blk->metablk = metablock;
1278 				error_blk->metaoff = offset;
1279 				error_blk->errblk = block;
1280 			}
1281 			log_info(_("Unrecoverable "));
1282 		} else { /* nonfatal error */
1283 			if (error_blk->errblk == 0) {
1284 				error_blk->metablk = metablock;
1285 				error_blk->metaoff = offset;
1286 				error_blk->errblk = block;
1287 			}
1288 		}
1289 		log_info(_("data block error %d on metadata "
1290 			   "block %"PRId64" (0x%"PRIx64"), "
1291 			   "offset %d (0x%x), referencing "
1292 			   "data block %"PRId64" (0x%"PRIx64").\n"),
1293 			 rc,
1294 			 metablock, metablock,
1295 			 offset, offset,
1296 			 block, block);
1297 	}
1298 	
1299 	/**
1300 	 * check_data - check all data pointers for a given buffer
1301 	 *              This does not include "data" blocks that are really
1302 	 *              hash table blocks for directories.
1303 	 *
1304 	 * @ip:
1305 	 *
1306 	 * returns: +ENOENT if there are too many bad pointers
1307 	 *          -1 if a more serious error occurred.
1308 	 *          0 if no errors occurred
1309 	 *          1 if errors were found and corrected
1310 	 *          2 (ENOENT) is there were too many bad pointers
1311 	 */
1312 	static int metawalk_check_data(struct fsck_cx *cx, struct lgfs2_inode *ip, struct metawalk_fxns *pass,
1313 			      struct lgfs2_buffer_head *bh, unsigned int height,
1314 			      uint64_t *blks_checked, struct error_block *error_blk)
1315 	{
1316 		int error = 0, rc = 0;
1317 		uint64_t block;
1318 		__be64 *ptr_start = (__be64 *)(bh->b_data + hdr_size(bh, height));
1319 		__be64 *ptr_end = (__be64 *)(bh->b_data + ip->i_sbd->sd_bsize);
1320 		__be64 *ptr;
1321 		uint64_t metablock = bh->b_blocknr;
1322 	
1323 		/* If there isn't much pointer corruption check the pointers */
1324 		log_debug("Processing data blocks for inode 0x%"PRIx64", metadata block 0x%"PRIx64".\n",
1325 		          ip->i_num.in_addr, metablock);
1326 		for (ptr = ptr_start ; ptr < ptr_end && !fsck_abort; ptr++) {
1327 			if (!*ptr)
1328 				continue;
1329 	
1330 			if (skip_this_pass || fsck_abort)
1331 				return error;
1332 			block =  be64_to_cpu(*ptr);
1333 			/* It's important that we don't call valid_block() and
1334 			   bypass calling check_data on invalid blocks because that
1335 			   would defeat the rangecheck_block related functions in
1336 			   pass1. Therefore the individual check_data functions
1337 			   should do a range check. */
1338 			rc = pass->check_data(cx, ip, metablock, block, pass->private,
1339 					      bh, ptr);
1340 			if (rc && (!error || (rc < error))) {
1341 				report_data_error(metablock, (char *)ptr - bh->b_data, block, error_blk, rc, error);
1342 				error = rc;
1343 			}
1344 			if (rc < 0)
1345 				return rc;
1346 			(*blks_checked)++;
1347 		}
1348 		return error;
1349 	}
1350 	
1351 	static int report_undo_data_error(uint64_t metablock, int offset, uint64_t block,
1352 					  struct error_block *error_blk,
1353 					  int *found_error_blk, int error)
1354 	{
1355 		if (metablock == error_blk->metablk &&
1356 		    offset == error_blk->metaoff &&
1357 		    block == error_blk->errblk) {
1358 			if (error < 0) { /* A fatal error that stopped it? */
1359 				log_debug(_("Stopping the undo process: "
1360 					    "fatal error block 0x%"PRIx64" was "
1361 					    "found at metadata block 0x%"PRIx64","
1362 					    "offset 0x%x.\n"),
1363 					  error_blk->errblk,
1364 					  error_blk->metablk,
1365 					  error_blk->metaoff);
1366 				return 1;
1367 			}
1368 			*found_error_blk = 1;
1369 			log_debug(_("The non-fatal error block 0x%"PRIx64" was "
1370 				    "found at metadata block 0x%"PRIx64", offset "
1371 				    "0x%d, but undo processing will continue "
1372 				    "until the end of this metadata block.\n"),
1373 				  error_blk->errblk,
1374 				  error_blk->metablk,
1375 				  error_blk->metaoff);
1376 		}
1377 		return 0;
1378 	}
1379 	
1380 	static int undo_check_data(struct fsck_cx *cx, struct lgfs2_inode *ip, struct metawalk_fxns *pass,
1381 				   struct lgfs2_buffer_head *bh, unsigned int height,
1382 				   struct error_block *error_blk, int error)
1383 	{
1384 		__be64 *ptr_start = (__be64 *)(bh->b_data + hdr_size(bh, height));
1385 		__be64 *ptr_end = (__be64 *)(bh->b_data + ip->i_sbd->sd_bsize);
1386 		__be64 *ptr;
1387 		uint64_t metablock = bh->b_blocknr;
1388 		int rc = 0;
1389 		uint64_t block;
1390 		int found_error_blk = 0;
1391 	
1392 		/* If there isn't much pointer corruption check the pointers */
1393 		for (ptr = ptr_start ; ptr < ptr_end && !fsck_abort; ptr++) {
1394 			if (!*ptr)
1395 				continue;
1396 	
1397 			if (skip_this_pass || fsck_abort)
1398 				return 1;
1399 			block =  be64_to_cpu(*ptr);
1400 			if (report_undo_data_error(metablock, (char *)ptr - bh->b_data,
1401 						   block, error_blk, &found_error_blk, error))
1402 				return 1;
1403 			rc = pass->undo_check_data(cx, ip, block, pass->private);
1404 			if (rc < 0)
1405 				return rc;
1406 		}
1407 		return found_error_blk;
1408 	}
1409 	
1410 	static unsigned int should_check(struct lgfs2_buffer_head *bh, unsigned int height)
1411 	{
1412 		int iblk_type = height > 1 ? GFS2_METATYPE_IN : GFS2_METATYPE_DI;
1413 	
1414 		return lgfs2_check_meta(bh->b_data, iblk_type) == 0;
1415 	}
1416 	
1417 	/**
1418 	 * check_metatree
1419 	 * @ip: inode structure in memory
1420 	 * @pass: structure passed in from caller to determine the sub-functions
1421 	 *
1422 	 */
1423 	int check_metatree(struct fsck_cx *cx, struct lgfs2_inode *ip, struct metawalk_fxns *pass)
1424 	{
1425 		unsigned int height = ip->i_height;
1426 		osi_list_t *metalist = alloca((height + 1) * sizeof(*metalist));
1427 		osi_list_t *list, *tmp;
1428 		struct lgfs2_buffer_head *bh;
1429 		unsigned int i;
1430 		uint64_t blks_checked = 0;
1431 		int error, rc;
1432 		int metadata_clean = 0;
1433 		struct error_block error_blk = {0, 0, 0};
1434 		int hit_error_blk = 0;
1435 	
1436 		if (!height && !is_dir(ip))
1437 			return 0;
1438 	
1439 		/* metalist has one extra element for directories (see build_and_check_metalist). */
1440 		for (i = 0; i <= height; i++)
1441 			osi_list_init(&metalist[i]);
1442 	
1443 		/* create and check the metadata list for each height */
1444 		error = build_and_check_metalist(cx, ip, metalist, pass);
1445 		if (error) {
1446 			stack;
1447 			goto undo_metalist;
1448 		}
1449 	
1450 		metadata_clean = 1;
1451 		/* For directories, we've already checked the "data" blocks which
1452 		 * comprise the directory hash table, so we perform the directory
1453 		 * checks and exit. */
1454 	        if (is_dir(ip)) {
1455 			if (!(ip->i_flags & GFS2_DIF_EXHASH))
1456 				goto out;
1457 			/* check validity of leaf blocks and leaf chains */
1458 			error = check_leaf_blks(cx, ip, pass);
1459 			if (error)
1460 				goto undo_metalist;
1461 			goto out;
1462 		}
1463 	
1464 		/* check data blocks */
1465 		list = &metalist[height - 1];
1466 		if (ip->i_blocks > COMFORTABLE_BLKS)
1467 			last_reported_fblock = -10000000;
1468 	
1469 		for (tmp = list->next; !error && tmp != list; tmp = tmp->next) {
1470 			if (fsck_abort) {
1471 				free_metalist(ip, metalist);
1472 				return 0;
1473 			}
1474 			bh = osi_list_entry(tmp, struct lgfs2_buffer_head, b_altlist);
1475 			if (!should_check(bh, height))
1476 				continue;
1477 	
1478 			if (pass->check_data)
1479 				error = metawalk_check_data(cx, ip, pass, bh, height,
1480 						   &blks_checked, &error_blk);
1481 			if (pass->big_file_msg && ip->i_blocks > COMFORTABLE_BLKS)
1482 				pass->big_file_msg(cx, ip, blks_checked);
1483 		}
1484 		if (pass->big_file_msg && ip->i_blocks > COMFORTABLE_BLKS) {
1485 			log_notice( _("\rLarge file at %"PRIu64" (0x%"PRIx64") - 100 percent "
1486 				      "complete.                                   "
1487 				      "\n"),
1488 				    ip->i_num.in_addr, ip->i_num.in_addr);
1489 			fflush(stdout);
1490 		}
1491 	undo_metalist:
1492 		if (!error)
1493 			goto out;
1494 		log_err(_("Error: inode %"PRIu64" (0x%"PRIx64") had unrecoverable errors at "
1495 		          "metadata block %"PRIu64" (0x%"PRIx64"), offset %d (0x%x), block "
1496 		          "%"PRIu64" (0x%"PRIx64").\n"),
1497 		        ip->i_num.in_addr, ip->i_num.in_addr, error_blk.metablk, error_blk.metablk,
1498 			error_blk.metaoff, error_blk.metaoff, error_blk.errblk, error_blk.errblk);
1499 		if (!query(cx, _("Remove the invalid inode? (y/n) "))) {
1500 			free_metalist(ip, metalist);
1501 			log_err(_("Invalid inode not deleted.\n"));
1502 			return error;
1503 		}
1504 		for (i = 0; pass->undo_check_meta && i < height; i++) {
1505 			while (!osi_list_empty(&metalist[i])) {
1506 				list = &metalist[i];
1507 				bh = osi_list_entry(list->next,
1508 						    struct lgfs2_buffer_head,
1509 						    b_altlist);
1510 				log_err(_("Undoing metadata work for block %"PRIu64" (0x%"PRIx64")\n"),
1511 				        bh->b_blocknr, bh->b_blocknr);
1512 				if (i)
1513 					rc = pass->undo_check_meta(cx, ip, bh->b_blocknr,
1514 								   i, pass->private);
1515 				else
1516 					rc = 0;
1517 				if (metadata_clean && rc == 0 && i == height - 1 &&
1518 				    !hit_error_blk) {
1519 					if (should_check(bh, height)) {
1520 						rc = undo_check_data(cx, ip, pass,
1521 								     bh,
1522 								     height,
1523 								     &error_blk,
1524 								     error);
1525 						if (rc > 0) {
1526 							hit_error_blk = 1;
1527 							log_err("Reached the error "
1528 								"block undoing work "
1529 								"for inode %"PRIu64" "
1530 								"(0x%"PRIx64").\n",
1531 								ip->i_num.in_addr, ip->i_num.in_addr);
1532 							rc = 0;
1533 						}
1534 					}
1535 				}
1536 				if (bh == ip->i_bh)
1537 					osi_list_del(&bh->b_altlist);
1538 				else
1539 					lgfs2_brelse(bh);
1540 			}
1541 		}
1542 		/* There may be leftover duplicate records, so we need to delete them.
1543 		   For example, if a metadata block was found to be a duplicate, we
1544 		   may not have added it to the metalist, which means it's not there
1545 		   to undo. */
1546 		delete_all_dups(cx, ip);
1547 		/* Set the dinode as "bad" so it gets deleted */
1548 		fsck_bitmap_set(cx, ip, ip->i_num.in_addr, "corrupt", GFS2_BLKST_FREE);
1549 		log_err(_("The corrupt inode was invalidated.\n"));
1550 	out:
1551 		free_metalist(ip, metalist);
1552 		return error;
1553 	}
1554 	
1555 	/* Checks stuffed inode directories */
1556 	int check_linear_dir(struct fsck_cx *cx, struct lgfs2_inode *ip, struct lgfs2_buffer_head *bh,
1557 			     struct metawalk_fxns *pass)
1558 	{
1559 		int error = 0;
1560 		uint32_t count = 0;
1561 	
1562 		error = check_entries(cx, ip, bh, DIR_LINEAR, &count, 0, pass);
1563 		if (error < 0) {
1564 			stack;
1565 			return -1;
1566 		}
1567 	
1568 		return error;
1569 	}
1570 	
1571 	int check_dir(struct fsck_cx *cx, struct lgfs2_inode *ip, struct metawalk_fxns *pass)
1572 	{
1573 		int error = 0;
1574 	
1575 		if (ip->i_flags & GFS2_DIF_EXHASH)
1576 			error = check_leaf_blks(cx, ip, pass);
1577 		else
1578 			error = check_linear_dir(cx, ip, ip->i_bh, pass);
1579 	
1580 		if (error < 0)
1581 			stack;
1582 	
1583 		return error;
1584 	}
1585