diff --git a/0012-xfs-sb-verifier-doesn-t-handle-uncached-sb-buffer.patch b/0012-xfs-sb-verifier-doesn-t-handle-uncached-sb-buffer.patch new file mode 100644 index 0000000000000000000000000000000000000000..3cc5ff3c5d73ff608d2bf2f11c97fca889da6c48 --- /dev/null +++ b/0012-xfs-sb-verifier-doesn-t-handle-uncached-sb-buffer.patch @@ -0,0 +1,42 @@ +From b3749469112306a80925420b48a6e756b2beeed9 Mon Sep 17 00:00:00 2001 +From: Dave Chinner +Date: Mon, 31 Jan 2022 15:25:48 -0500 +Subject: [PATCH] xfs: sb verifier doesn't handle uncached sb buffer + +Source kernel commit: 8cf07f3dd56195316be97758cb8b4e1d7183ea84 + +The verifier checks explicitly for bp->b_bn == XFS_SB_DADDR to match +the primary superblock buffer, but the primary superblock is an +uncached buffer and so bp->b_bn is always -1ULL. Hence this never +matches and the CRC error reporting is wholly dependent on the +mount superblock already being populated so CRC feature checks pass +and allow CRC errors to be reported. + +Fix this so that the primary superblock CRC error reporting is not +dependent on already having read the superblock into memory. + +Signed-off-by: Dave Chinner +Reviewed-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Eric Sandeen +--- + libxfs/xfs_sb.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/libxfs/xfs_sb.c b/libxfs/xfs_sb.c +index b2e214e..f29a59a 100644 +--- a/libxfs/xfs_sb.c ++++ b/libxfs/xfs_sb.c +@@ -634,7 +634,7 @@ xfs_sb_read_verify( + + if (!xfs_buf_verify_cksum(bp, XFS_SB_CRC_OFF)) { + /* Only fail bad secondaries on a known V5 filesystem */ +- if (bp->b_bn == XFS_SB_DADDR || ++ if (bp->b_maps[0].bm_bn == XFS_SB_DADDR || + xfs_sb_version_hascrc(&mp->m_sb)) { + error = -EFSBADCRC; + goto out_error; +-- +1.8.3.1 + diff --git a/0013-libxfs-always-initialize-internal-buffer-map.patch b/0013-libxfs-always-initialize-internal-buffer-map.patch new file mode 100644 index 0000000000000000000000000000000000000000..70d84cd863d379626509557f3c968969a7ec8193 --- /dev/null +++ b/0013-libxfs-always-initialize-internal-buffer-map.patch @@ -0,0 +1,49 @@ +From f043c63e38c9582deac85053a6c8a737482983b1 Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Mon, 31 Jan 2022 17:46:05 -0500 +Subject: [PATCH] libxfs: always initialize internal buffer map + +The __initbuf function is responsible for initializing the fields of an +xfs_buf. Buffers are always required to have a mapping, though in the +typical case there's only one mapping, so we can use the internal one. + +The single-mapping b_maps init code at the end of the function doesn't +quite get this right though -- if a single-mapping buffer in the cache +was allowed to expire and now is being repurposed, it'll come out with +b_maps == &__b_map, in which case we incorrectly skip initializing the +map. This has gone unnoticed until now because (AFAICT) the code paths +that use b_maps are the same ones that are called with multi-mapping +buffers, which are initialized correctly. + +Anyway, the improperly initialized single-mappings will cause problems +in upcoming patches where we turn b_bn into the cache key and require +the use of b_maps[0].bm_bn for the buffer LBA. Fix this. 
+ +Signed-off-by: Darrick J. Wong +Reviewed-by: Eric Sandeen +Signed-off-by: Eric Sandeen +--- + libxfs/rdwr.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/libxfs/rdwr.c b/libxfs/rdwr.c +index 5086bdb..a55e3a7 100644 +--- a/libxfs/rdwr.c ++++ b/libxfs/rdwr.c +@@ -251,9 +251,11 @@ __initbuf(struct xfs_buf *bp, struct xfs_buftarg *btp, xfs_daddr_t bno, + bp->b_ops = NULL; + INIT_LIST_HEAD(&bp->b_li_list); + +- if (!bp->b_maps) { +- bp->b_nmaps = 1; ++ if (!bp->b_maps) + bp->b_maps = &bp->__b_map; ++ ++ if (bp->b_maps == &bp->__b_map) { ++ bp->b_nmaps = 1; + bp->b_maps[0].bm_bn = bp->b_bn; + bp->b_maps[0].bm_len = bp->b_length; + } +-- +1.8.3.1 + diff --git a/0014-libxfs-shut-down-filesystem-if-we-xfs_trans_cancel-w.patch b/0014-libxfs-shut-down-filesystem-if-we-xfs_trans_cancel-w.patch new file mode 100644 index 0000000000000000000000000000000000000000..0cd94c7a2655359ae9dcd0d2d2a94bfe9358ef5c --- /dev/null +++ b/0014-libxfs-shut-down-filesystem-if-we-xfs_trans_cancel-w.patch @@ -0,0 +1,70 @@ +From f98b7a261130726c33accd295ec0d2a22f270cde Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Fri, 25 Feb 2022 17:32:48 -0500 +Subject: [PATCH] libxfs: shut down filesystem if we xfs_trans_cancel with + deferred work items + +While debugging some very strange rmap corruption reports in connection +with the online directory repair code, I root-caused the error to the +following incorrect sequence: + +<start repair transaction> +<expand directory, causing a deferred rmap to be queued> +<roll transaction> +<cancel transaction> + +Obviously, we should have committed the transaction instead of +cancelling it. Thinking more broadly, however, xfs_trans_cancel should +have warned us that we were throwing away work items that we already +committed to performing. This is not correct, and we need to shut down +the filesystem. + +Change xfs_trans_cancel to complain in the loudest manner if we're +cancelling any transaction with deferred work items attached. + +Signed-off-by: Darrick J. 
Wong +Reviewed-by: Eric Sandeen +Signed-off-by: Eric Sandeen +--- + libxfs/trans.c | 19 ++++++++++++++++++- + 1 file changed, 18 insertions(+), 1 deletion(-) + +diff --git a/libxfs/trans.c b/libxfs/trans.c +index fd2e6f9..8c16cb8 100644 +--- a/libxfs/trans.c ++++ b/libxfs/trans.c +@@ -318,13 +318,30 @@ void + libxfs_trans_cancel( + struct xfs_trans *tp) + { ++ bool dirty; ++ + trace_xfs_trans_cancel(tp, _RET_IP_); + + if (tp == NULL) + return; ++ dirty = (tp->t_flags & XFS_TRANS_DIRTY); + +- if (tp->t_flags & XFS_TRANS_PERM_LOG_RES) ++ /* ++ * It's never valid to cancel a transaction with deferred ops attached, ++ * because the transaction is effectively dirty. Complain about this ++ * loudly before freeing the in-memory defer items. ++ */ ++ if (!list_empty(&tp->t_dfops)) { ++ ASSERT(list_empty(&tp->t_dfops)); ++ ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); ++ dirty = true; + xfs_defer_cancel(tp); ++ } ++ ++ if (dirty) { ++ fprintf(stderr, _("Cancelling dirty transaction!\n")); ++ abort(); ++ } + + xfs_trans_free_items(tp); + xfs_trans_free(tp); +-- +1.8.3.1 + diff --git a/0015-xfs_db-fix-nbits-parameter-in-fa_ino-48-functions.patch b/0015-xfs_db-fix-nbits-parameter-in-fa_ino-48-functions.patch new file mode 100644 index 0000000000000000000000000000000000000000..80110abb2fc14d18582a804733204b329163d040 --- /dev/null +++ b/0015-xfs_db-fix-nbits-parameter-in-fa_ino-48-functions.patch @@ -0,0 +1,47 @@ +From e9ff33f6e604ece202373be3ac176064083d913e Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Fri, 25 Feb 2022 17:42:16 -0500 +Subject: [PATCH] xfs_db: fix nbits parameter in fa_ino[48] functions + +Use the proper macro to convert ino4 and ino8 field byte sizes to a bit +count in the functions that navigate shortform directories. This just +happens to work correctly for ino4 entries, but omits the upper 4 bytes +of an ino8 entry. Note that the entries display correctly; it's just +the command "addr u3.sfdir3.list[X].inumber.i8" that won't. 
+ +Found by running smatch. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Eric Sandeen +Signed-off-by: Eric Sandeen +--- + db/faddr.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/db/faddr.c b/db/faddr.c +index 81d69c9..0127c5d 100644 +--- a/db/faddr.c ++++ b/db/faddr.c +@@ -353,7 +353,8 @@ fa_ino4( + xfs_ino_t ino; + + ASSERT(next == TYP_INODE); +- ino = (xfs_ino_t)getbitval(obj, bit, bitsz(XFS_INO32_SIZE), BVUNSIGNED); ++ ino = (xfs_ino_t)getbitval(obj, bit, bitize(XFS_INO32_SIZE), ++ BVUNSIGNED); + if (ino == NULLFSINO) { + dbprintf(_("null inode number, cannot set new addr\n")); + return; +@@ -370,7 +371,8 @@ fa_ino8( + xfs_ino_t ino; + + ASSERT(next == TYP_INODE); +- ino = (xfs_ino_t)getbitval(obj, bit, bitsz(XFS_INO64_SIZE), BVUNSIGNED); ++ ino = (xfs_ino_t)getbitval(obj, bit, bitize(XFS_INO64_SIZE), ++ BVUNSIGNED); + if (ino == NULLFSINO) { + dbprintf(_("null inode number, cannot set new addr\n")); + return; +-- +1.8.3.1 + diff --git a/0016-xfs_repair-update-secondary-superblocks-after-changi.patch b/0016-xfs_repair-update-secondary-superblocks-after-changi.patch new file mode 100644 index 0000000000000000000000000000000000000000..15ed64ff4955eb5b90f611991fd20e0c4928301b --- /dev/null +++ b/0016-xfs_repair-update-secondary-superblocks-after-changi.patch @@ -0,0 +1,100 @@ +From 918e82a4879dccaf3673871be925a87efc2fbabc Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Fri, 25 Feb 2022 17:42:16 -0500 +Subject: [PATCH] xfs_repair: update secondary superblocks after changing + features + +When we add features to an existing filesystem, make sure we update the +secondary superblocks to reflect the new geometry so that if we lose the +primary super in the future, repair will recover correctly. + +Signed-off-by: Darrick J. 
Wong +Reviewed-by: Eric Sandeen +Signed-off-by: Eric Sandeen +--- + libxfs/libxfs_api_defs.h | 2 ++ + repair/globals.c | 1 + + repair/globals.h | 1 + + repair/phase2.c | 2 ++ + repair/xfs_repair.c | 15 +++++++++++++++ + 5 files changed, 21 insertions(+) + +diff --git a/libxfs/libxfs_api_defs.h b/libxfs/libxfs_api_defs.h +index b76e638..63628ae 100644 +--- a/libxfs/libxfs_api_defs.h ++++ b/libxfs/libxfs_api_defs.h +@@ -195,6 +195,8 @@ + #define xfs_trans_roll libxfs_trans_roll + #define xfs_trim_extent libxfs_trim_extent + ++#define xfs_update_secondary_sbs libxfs_update_secondary_sbs ++ + #define xfs_validate_stripe_geometry libxfs_validate_stripe_geometry + #define xfs_verify_agbno libxfs_verify_agbno + #define xfs_verify_agino libxfs_verify_agino +diff --git a/repair/globals.c b/repair/globals.c +index 506a4e7..f8d4f1e 100644 +--- a/repair/globals.c ++++ b/repair/globals.c +@@ -48,6 +48,7 @@ char *rt_name; /* Name of realtime device */ + int rt_spec; /* Realtime dev specified as option */ + int convert_lazy_count; /* Convert lazy-count mode on/off */ + int lazy_count; /* What to set if to if converting */ ++bool features_changed; /* did we change superblock feature bits? */ + bool add_inobtcount; /* add inode btree counts to AGI */ + bool add_bigtime; /* add support for timestamps up to 2486 */ + +diff --git a/repair/globals.h b/repair/globals.h +index 929b82b..0f98bd2 100644 +--- a/repair/globals.h ++++ b/repair/globals.h +@@ -89,6 +89,7 @@ extern char *rt_name; /* Name of realtime device */ + extern int rt_spec; /* Realtime dev specified as option */ + extern int convert_lazy_count; /* Convert lazy-count mode on/off */ + extern int lazy_count; /* What to set if to if converting */ ++extern bool features_changed; /* did we change superblock feature bits? 
*/ + extern bool add_inobtcount; /* add inode btree counts to AGI */ + extern bool add_bigtime; /* add support for timestamps up to 2486 */ + +diff --git a/repair/phase2.c b/repair/phase2.c +index 32ffe18..ab53ee0 100644 +--- a/repair/phase2.c ++++ b/repair/phase2.c +@@ -216,6 +216,8 @@ upgrade_filesystem( + } + if (bp) + libxfs_buf_relse(bp); ++ ++ features_changed = true; + } + + /* +diff --git a/repair/xfs_repair.c b/repair/xfs_repair.c +index 38406ee..e44aa40 100644 +--- a/repair/xfs_repair.c ++++ b/repair/xfs_repair.c +@@ -1298,6 +1298,21 @@ _("Note - stripe unit (%d) and width (%d) were copied from a backup superblock.\ + libxfs_buf_relse(sbp); + + /* ++ * If we upgraded V5 filesystem features, we need to update the ++ * secondary superblocks to include the new feature bits. Don't set ++ * NEEDSREPAIR on the secondaries. ++ */ ++ if (features_changed) { ++ mp->m_sb.sb_features_incompat &= ++ ~XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR; ++ error = -libxfs_update_secondary_sbs(mp); ++ if (error) ++ do_error(_("upgrading features of secondary supers")); ++ mp->m_sb.sb_features_incompat |= ++ XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR; ++ } ++ ++ /* + * Done. Flush all cached buffers and inodes first to ensure all + * verifiers are run (where we discover the max metadata LSN), reformat + * the log if necessary and unmount. +-- +1.8.3.1 + diff --git a/0017-xfs_repair-fix-AG-header-btree-level-comparisons.patch b/0017-xfs_repair-fix-AG-header-btree-level-comparisons.patch new file mode 100644 index 0000000000000000000000000000000000000000..abc29f4a55f1e553a4e1c8b284fc76ed6bf0f236 --- /dev/null +++ b/0017-xfs_repair-fix-AG-header-btree-level-comparisons.patch @@ -0,0 +1,41 @@ +From 2e9720d51a1e9efa6535b540f3c9ff88e95aabe9 Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Wed, 27 Apr 2022 23:11:09 -0400 +Subject: [PATCH] xfs_repair: fix AG header btree level comparisons + +It's not an error if repair encounters a btree with the maximal +height, so don't print warnings. 
Also, we don't allow zero-height +btrees. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Eric Sandeen +Signed-off-by: Eric Sandeen +--- + repair/scan.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/repair/scan.c b/repair/scan.c +index 909c449..e2d281a 100644 +--- a/repair/scan.c ++++ b/repair/scan.c +@@ -2297,7 +2297,7 @@ validate_agf( + priv.nr_blocks = 0; + + levels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]); +- if (levels >= XFS_BTREE_MAXLEVELS) { ++ if (levels == 0 || levels > XFS_BTREE_MAXLEVELS) { + do_warn(_("bad levels %u for rmapbt root, agno %d\n"), + levels, agno); + rmap_avoid_check(); +@@ -2323,7 +2323,7 @@ validate_agf( + unsigned int levels; + + levels = be32_to_cpu(agf->agf_refcount_level); +- if (levels >= XFS_BTREE_MAXLEVELS) { ++ if (levels == 0 || levels > XFS_BTREE_MAXLEVELS) { + do_warn(_("bad levels %u for refcountbt root, agno %d\n"), + levels, agno); + refcount_avoid_check(); +-- +1.8.3.1 + diff --git a/0018-xfs-fix-maxlevels-comparisons-in-the-btree-staging-c.patch b/0018-xfs-fix-maxlevels-comparisons-in-the-btree-staging-c.patch new file mode 100644 index 0000000000000000000000000000000000000000..a3a5b1adba6a5e4d9288a2e571d4f757295801d2 --- /dev/null +++ b/0018-xfs-fix-maxlevels-comparisons-in-the-btree-staging-c.patch @@ -0,0 +1,47 @@ +From 0571c857fe326141e35162f5a05e6b89789840bf Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Thu, 28 Apr 2022 15:39:02 -0400 +Subject: [PATCH] xfs: fix maxlevels comparisons in the btree staging code + +Source kernel commit: 78e8ec83a404d63dcc86b251f42e4ee8aff27465 + +The btree geometry computation function has an off-by-one error in that +it does not allow maximally tall btrees (nlevels == XFS_BTREE_MAXLEVELS). +This can result in repairs failing unnecessarily on very fragmented +filesystems. Subsequent patches to remove MAXLEVELS usage in favor of +the per-btree type computations will make this a much more likely +occurrence. + +Signed-off-by: Darrick J. 
Wong +Reviewed-by: Chandan Babu R +Reviewed-by: Christoph Hellwig +Signed-off-by: Eric Sandeen +--- + libxfs/xfs_btree_staging.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/libxfs/xfs_btree_staging.c b/libxfs/xfs_btree_staging.c +index 146d247..daf9979 100644 +--- a/libxfs/xfs_btree_staging.c ++++ b/libxfs/xfs_btree_staging.c +@@ -662,7 +662,7 @@ xfs_btree_bload_compute_geometry( + xfs_btree_bload_ensure_slack(cur, &bbl->node_slack, 1); + + bbl->nr_records = nr_this_level = nr_records; +- for (cur->bc_nlevels = 1; cur->bc_nlevels < XFS_BTREE_MAXLEVELS;) { ++ for (cur->bc_nlevels = 1; cur->bc_nlevels <= XFS_BTREE_MAXLEVELS;) { + uint64_t level_blocks; + uint64_t dontcare64; + unsigned int level = cur->bc_nlevels - 1; +@@ -724,7 +724,7 @@ xfs_btree_bload_compute_geometry( + nr_this_level = level_blocks; + } + +- if (cur->bc_nlevels == XFS_BTREE_MAXLEVELS) ++ if (cur->bc_nlevels > XFS_BTREE_MAXLEVELS) + return -EOVERFLOW; + + bbl->btree_height = cur->bc_nlevels; +-- +1.8.3.1 + diff --git a/0019-xfs-fold-perag-loop-iteration-logic-into-helper-func.patch b/0019-xfs-fold-perag-loop-iteration-logic-into-helper-func.patch new file mode 100644 index 0000000000000000000000000000000000000000..4f4ccc47e94d5baa2b59303d60276fe9d6b2aca8 --- /dev/null +++ b/0019-xfs-fold-perag-loop-iteration-logic-into-helper-func.patch @@ -0,0 +1,52 @@ +From b79218242b786a2c02bcac9f53fdae45e2e61e90 Mon Sep 17 00:00:00 2001 +From: Brian Foster +Date: Thu, 28 Apr 2022 15:39:02 -0400 +Subject: [PATCH] xfs: fold perag loop iteration logic into helper function + +Source kernel commit: bf2307b195135ed9c95eebb38920d8bd41843092 + +Fold the loop iteration logic into a helper in preparation for +further fixups. No functional change in this patch. + +Signed-off-by: Brian Foster +Reviewed-by: Dave Chinner +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Eric Sandeen +--- + libxfs/xfs_ag.h | 16 +++++++++++++--- + 1 file changed, 13 insertions(+), 3 deletions(-) + +diff --git a/libxfs/xfs_ag.h b/libxfs/xfs_ag.h +index 2522f76..95570df 100644 +--- a/libxfs/xfs_ag.h ++++ b/libxfs/xfs_ag.h +@@ -126,12 +126,22 @@ void xfs_perag_put(struct xfs_perag *pag); + * for_each_perag_from() because they terminate at sb_agcount where there are + * no perag structures in tree beyond end_agno. + */ ++static inline struct xfs_perag * ++xfs_perag_next( ++ struct xfs_perag *pag, ++ xfs_agnumber_t *next_agno) ++{ ++ struct xfs_mount *mp = pag->pag_mount; ++ ++ *next_agno = pag->pag_agno + 1; ++ xfs_perag_put(pag); ++ return xfs_perag_get(mp, *next_agno); ++} ++ + #define for_each_perag_range(mp, next_agno, end_agno, pag) \ + for ((pag) = xfs_perag_get((mp), (next_agno)); \ + (pag) != NULL && (next_agno) <= (end_agno); \ +- (next_agno) = (pag)->pag_agno + 1, \ +- xfs_perag_put(pag), \ +- (pag) = xfs_perag_get((mp), (next_agno))) ++ (pag) = xfs_perag_next((pag), &(next_agno))) + + #define for_each_perag_from(mp, next_agno, pag) \ + for_each_perag_range((mp), (next_agno), (mp)->m_sb.sb_agcount, (pag)) +-- +1.8.3.1 + diff --git a/0020-xfs-rename-the-next_agno-perag-iteration-variable.patch b/0020-xfs-rename-the-next_agno-perag-iteration-variable.patch new file mode 100644 index 0000000000000000000000000000000000000000..571ca59972e7cf76414e62a17ba787d09590cf84 --- /dev/null +++ b/0020-xfs-rename-the-next_agno-perag-iteration-variable.patch @@ -0,0 +1,58 @@ +From 02ff0b2b4c117f33f79500815a9322fe987a4bf5 Mon Sep 17 00:00:00 2001 +From: Brian Foster +Date: Thu, 28 Apr 2022 15:39:02 -0400 +Subject: [PATCH] xfs: rename the next_agno perag iteration variable + +Source kernel commit: f1788b5e5ee25bedf00bb4d25f82b93820d61189 + +Rename the next_agno variable to be consistent across the several +iteration macros and shorten line length. + +Signed-off-by: Brian Foster +Reviewed-by: Dave Chinner +Reviewed-by: Darrick J. 
Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Eric Sandeen +--- + libxfs/xfs_ag.h | 18 +++++++++--------- + 1 file changed, 9 insertions(+), 9 deletions(-) + +diff --git a/libxfs/xfs_ag.h b/libxfs/xfs_ag.h +index 95570df..9cd0669 100644 +--- a/libxfs/xfs_ag.h ++++ b/libxfs/xfs_ag.h +@@ -129,22 +129,22 @@ void xfs_perag_put(struct xfs_perag *pag); + static inline struct xfs_perag * + xfs_perag_next( + struct xfs_perag *pag, +- xfs_agnumber_t *next_agno) ++ xfs_agnumber_t *agno) + { + struct xfs_mount *mp = pag->pag_mount; + +- *next_agno = pag->pag_agno + 1; ++ *agno = pag->pag_agno + 1; + xfs_perag_put(pag); +- return xfs_perag_get(mp, *next_agno); ++ return xfs_perag_get(mp, *agno); + } + +-#define for_each_perag_range(mp, next_agno, end_agno, pag) \ +- for ((pag) = xfs_perag_get((mp), (next_agno)); \ +- (pag) != NULL && (next_agno) <= (end_agno); \ +- (pag) = xfs_perag_next((pag), &(next_agno))) ++#define for_each_perag_range(mp, agno, end_agno, pag) \ ++ for ((pag) = xfs_perag_get((mp), (agno)); \ ++ (pag) != NULL && (agno) <= (end_agno); \ ++ (pag) = xfs_perag_next((pag), &(agno))) + +-#define for_each_perag_from(mp, next_agno, pag) \ +- for_each_perag_range((mp), (next_agno), (mp)->m_sb.sb_agcount, (pag)) ++#define for_each_perag_from(mp, agno, pag) \ ++ for_each_perag_range((mp), (agno), (mp)->m_sb.sb_agcount, (pag)) + + + #define for_each_perag(mp, agno, pag) \ +-- +1.8.3.1 + diff --git a/0021-xfs-terminate-perag-iteration-reliably-on-agcount.patch b/0021-xfs-terminate-perag-iteration-reliably-on-agcount.patch new file mode 100644 index 0000000000000000000000000000000000000000..127bdc8175683fab6cbce14dd06de904a2e239ff --- /dev/null +++ b/0021-xfs-terminate-perag-iteration-reliably-on-agcount.patch @@ -0,0 +1,47 @@ +From 6c18fde82cd02e550fb0c095bd6c6908dcc77747 Mon Sep 17 00:00:00 2001 +From: Brian Foster +Date: Thu, 28 Apr 2022 15:39:03 -0400 +Subject: [PATCH] xfs: terminate perag iteration reliably on agcount + +Source kernel commit: 
8ed004eb9d07a5d6114db3e97a166707c186262d + +The for_each_perag_from() iteration macro relies on sb_agcount to +process every perag currently within EOFS from a given starting +point. It's perfectly valid to have perag structures beyond +sb_agcount, however, such as if a growfs is in progress. If a perag +loop happens to race with growfs in this manner, it will actually +attempt to process the post-EOFS perag where ->pag_agno == +sb_agcount. This is reproduced by xfs/104 and manifests as the +following assert failure in superblock write verifier context: + +XFS: Assertion failed: agno < mp->m_sb.sb_agcount, file: fs/xfs/libxfs/xfs_types.c, line: 22 + +Update the corresponding macro to only process perags that are +within the current sb_agcount. + +Fixes: 58d43a7e3263 ("xfs: pass perags around in fsmap data dev functions") +Signed-off-by: Brian Foster +Reviewed-by: Dave Chinner +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Eric Sandeen +--- + libxfs/xfs_ag.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/libxfs/xfs_ag.h b/libxfs/xfs_ag.h +index 9cd0669..fae2a38 100644 +--- a/libxfs/xfs_ag.h ++++ b/libxfs/xfs_ag.h +@@ -144,7 +144,7 @@ xfs_perag_next( + (pag) = xfs_perag_next((pag), &(agno))) + + #define for_each_perag_from(mp, agno, pag) \ +- for_each_perag_range((mp), (agno), (mp)->m_sb.sb_agcount, (pag)) ++ for_each_perag_range((mp), (agno), (mp)->m_sb.sb_agcount - 1, (pag)) + + + #define for_each_perag(mp, agno, pag) \ +-- +1.8.3.1 + diff --git a/0022-xfs-fix-perag-reference-leak-on-iteration-race-with-.patch b/0022-xfs-fix-perag-reference-leak-on-iteration-race-with-.patch new file mode 100644 index 0000000000000000000000000000000000000000..f2bf4d2c29e49b7d350bd2e49417d06272362dd0 --- /dev/null +++ b/0022-xfs-fix-perag-reference-leak-on-iteration-race-with-.patch @@ -0,0 +1,88 @@ +From 9619d9e715b2eba7c39683bcbc721d3954275eb4 Mon Sep 17 00:00:00 2001 +From: Brian Foster +Date: Thu, 28 Apr 2022 15:39:03 -0400 
+Subject: [PATCH] xfs: fix perag reference leak on iteration race with growfs + +Source kernel commit: 892a666fafa19ab04b5e948f6c92f98f1dafb489 + +The for_each_perag*() set of macros are hacky in that some (i.e. +those based on sb_agcount) rely on the assumption that perag +iteration terminates naturally with a NULL perag at the specified +end_agno. Others allow for the final AG to have a valid perag and +require the calling function to clean up any potential leftover +xfs_perag reference on termination of the loop. + +Aside from providing a subtly inconsistent interface, the former +variant is racy with growfs because growfs can create discoverable +post-eofs perags before the final superblock update that completes +the grow operation and increases sb_agcount. This leads to the +following assert failure (reproduced by xfs/104) in the perag free +path during unmount: + +XFS: Assertion failed: atomic_read(&pag->pag_ref) == 0, file: fs/xfs/libxfs/xfs_ag.c, line: 195 + +This occurs because one of the many for_each_perag() loops in the +code that is expected to terminate with a NULL pag (and thus has no +post-loop xfs_perag_put() check) raced with a growfs and found a +non-NULL post-EOFS perag, but terminated naturally based on the +end_agno check without releasing the post-EOFS perag. + +Rework the iteration logic to lift the agno check from the main for +loop conditional to the iteration helper function. The for loop now +purely terminates on a NULL pag and xfs_perag_next() avoids taking a +reference to any perag beyond end_agno in the first place. + +Fixes: f250eedcf762 ("xfs: make for_each_perag... a first class citizen") +Signed-off-by: Brian Foster +Reviewed-by: Dave Chinner +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Eric Sandeen +--- + libxfs/xfs_ag.h | 16 ++++++---------- + 1 file changed, 6 insertions(+), 10 deletions(-) + +diff --git a/libxfs/xfs_ag.h b/libxfs/xfs_ag.h +index fae2a38..e411d51 100644 +--- a/libxfs/xfs_ag.h ++++ b/libxfs/xfs_ag.h +@@ -118,30 +118,26 @@ void xfs_perag_put(struct xfs_perag *pag); + + /* + * Perag iteration APIs +- * +- * XXX: for_each_perag_range() usage really needs an iterator to clean up when +- * we terminate at end_agno because we may have taken a reference to the perag +- * beyond end_agno. Right now callers have to be careful to catch and clean that +- * up themselves. This is not necessary for the callers of for_each_perag() and +- * for_each_perag_from() because they terminate at sb_agcount where there are +- * no perag structures in tree beyond end_agno. + */ + static inline struct xfs_perag * + xfs_perag_next( + struct xfs_perag *pag, +- xfs_agnumber_t *agno) ++ xfs_agnumber_t *agno, ++ xfs_agnumber_t end_agno) + { + struct xfs_mount *mp = pag->pag_mount; + + *agno = pag->pag_agno + 1; + xfs_perag_put(pag); ++ if (*agno > end_agno) ++ return NULL; + return xfs_perag_get(mp, *agno); + } + + #define for_each_perag_range(mp, agno, end_agno, pag) \ + for ((pag) = xfs_perag_get((mp), (agno)); \ +- (pag) != NULL && (agno) <= (end_agno); \ +- (pag) = xfs_perag_next((pag), &(agno))) ++ (pag) != NULL; \ ++ (pag) = xfs_perag_next((pag), &(agno), (end_agno))) + + #define for_each_perag_from(mp, agno, pag) \ + for_each_perag_range((mp), (agno), (mp)->m_sb.sb_agcount - 1, (pag)) +-- +1.8.3.1 + diff --git a/0023-mkfs-fix-missing-validation-of-l-size-against-maximu.patch b/0023-mkfs-fix-missing-validation-of-l-size-against-maximu.patch new file mode 100644 index 0000000000000000000000000000000000000000..26d9214c02c12258fd1178e866a188051043769e --- /dev/null +++ b/0023-mkfs-fix-missing-validation-of-l-size-against-maximu.patch @@ -0,0 +1,91 @@ +From b6a7b627b1211f87e3bac3dc0111d056e70aa773 Mon Sep 17 00:00:00 2001 +From: 
"Darrick J. Wong" +Date: Tue, 17 May 2022 22:48:12 -0400 +Subject: [PATCH] mkfs: fix missing validation of -l size against maximum + internal log size + +If a sysadmin specifies a log size explicitly, we don't actually check +that against the maximum internal log size that we compute for the +default log size computation. We're going to add more validation soon, +so refactor the max internal log blocks into a common variable and +add a check. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Signed-off-by: Eric Sandeen +--- + mkfs/xfs_mkfs.c | 36 ++++++++++++++++++++++-------------- + 1 file changed, 22 insertions(+), 14 deletions(-) + +diff --git a/mkfs/xfs_mkfs.c b/mkfs/xfs_mkfs.c +index b7e335f..f7006af 100644 +--- a/mkfs/xfs_mkfs.c ++++ b/mkfs/xfs_mkfs.c +@@ -3268,6 +3268,7 @@ calculate_log_size( + { + struct xfs_sb *sbp = &mp->m_sb; + int min_logblocks; ++ int max_logblocks; /* absolute max for this AG */ + struct xfs_mount mount; + + /* we need a temporary mount to calculate the minimum log size. */ +@@ -3307,6 +3308,18 @@ _("external log device size %lld blocks too small, must be at least %lld blocks\ + return; + } + ++ /* ++ * Make sure the log fits wholly within an AG ++ * ++ * XXX: If agf->freeblks ends up as 0 because the log uses all ++ * the free space, it causes the kernel all sorts of problems ++ * with per-ag reservations. Right now just back it off one ++ * block, but there's a whole can of worms here that needs to be ++ * opened to decide what is the valid maximum size of a log in ++ * an AG. 
++ */ ++ max_logblocks = libxfs_alloc_ag_max_usable(mp) - 1; ++ + /* internal log - if no size specified, calculate automatically */ + if (!cfg->logblocks) { + if (cfg->dblocks < GIGABYTES(1, cfg->blocklog)) { +@@ -3332,21 +3345,9 @@ _("external log device size %lld blocks too small, must be at least %lld blocks\ + cfg->logblocks = cfg->logblocks >> cfg->blocklog; + } + +- /* Ensure the chosen size meets minimum log size requirements */ ++ /* Ensure the chosen size fits within log size requirements */ + cfg->logblocks = max(min_logblocks, cfg->logblocks); +- +- /* +- * Make sure the log fits wholly within an AG +- * +- * XXX: If agf->freeblks ends up as 0 because the log uses all +- * the free space, it causes the kernel all sorts of problems +- * with per-ag reservations. Right now just back it off one +- * block, but there's a whole can of worms here that needs to be +- * opened to decide what is the valid maximum size of a log in +- * an AG. +- */ +- cfg->logblocks = min(cfg->logblocks, +- libxfs_alloc_ag_max_usable(mp) - 1); ++ cfg->logblocks = min(cfg->logblocks, max_logblocks); + + /* and now clamp the size to the maximum supported size */ + cfg->logblocks = min(cfg->logblocks, XFS_MAX_LOG_BLOCKS); +@@ -3354,6 +3355,13 @@ _("external log device size %lld blocks too small, must be at least %lld blocks\ + cfg->logblocks = XFS_MAX_LOG_BYTES >> cfg->blocklog; + + validate_log_size(cfg->logblocks, cfg->blocklog, min_logblocks); ++ } else if (cfg->logblocks > max_logblocks) { ++ /* check specified log size */ ++ fprintf(stderr, ++_("internal log size %lld too large, must be less than %d\n"), ++ (long long)cfg->logblocks, ++ max_logblocks); ++ usage(); + } + + if (cfg->logblocks > sbp->sb_agblocks - libxfs_prealloc_blocks(mp)) { +-- +1.8.3.1 + diff --git a/0024-mkfs-reduce-internal-log-size-when-log-stripe-units-.patch b/0024-mkfs-reduce-internal-log-size-when-log-stripe-units-.patch new file mode 100644 index 
0000000000000000000000000000000000000000..f8c9af060148476b7c75b3571c4c26c4e130b039 --- /dev/null +++ b/0024-mkfs-reduce-internal-log-size-when-log-stripe-units-.patch @@ -0,0 +1,108 @@ +From 8d1bff2be3360572fbee9ed83e0d1c86af1614c5 Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Tue, 17 May 2022 22:48:12 -0400 +Subject: [PATCH] mkfs: reduce internal log size when log stripe units are in + play + +Currently, one can feed mkfs a combination of options like this: + +$ truncate -s 6366g /tmp/a ; mkfs.xfs -f /tmp/a -d agcount=3200 -d su=256k,sw=4 +meta-data=/tmp/a isize=512 agcount=3200, agsize=521536 blks + = sectsz=512 attr=2, projid32bit=1 + = crc=1 finobt=1, sparse=1, rmapbt=0 + = reflink=1 bigtime=0 inobtcount=0 +data = bsize=4096 blocks=1668808704, imaxpct=5 + = sunit=64 swidth=256 blks +naming =version 2 bsize=4096 ascii-ci=0, ftype=1 +log =internal log bsize=4096 blocks=521536, version=2 + = sectsz=512 sunit=64 blks, lazy-count=1 +realtime =none extsz=4096 blocks=0, rtextents=0 +Metadata corruption detected at 0x55e88052c6b6, xfs_agf block 0x1/0x200 +libxfs_bwrite: write verifier failed on xfs_agf bno 0x1/0x1 +mkfs.xfs: writing AG headers failed, err=117 + +The format fails because the internal log size sizing algorithm +specifies a log size of 521492 blocks to avoid taking all the space in +the AG, but align_log_size sees the stripe unit and rounds that up to +the next stripe unit, which is 521536 blocks. + +Fix this problem by rounding the log size down if rounding up would +result in a log that consumes more space in the AG than we allow. + +Signed-off-by: Darrick J. 
Wong +Reviewed-by: Christoph Hellwig +Signed-off-by: Eric Sandeen +--- + mkfs/xfs_mkfs.c | 19 +++++++++++-------- + 1 file changed, 11 insertions(+), 8 deletions(-) + +diff --git a/mkfs/xfs_mkfs.c b/mkfs/xfs_mkfs.c +index e11b39d..eb4d7fa 100644 +--- a/mkfs/xfs_mkfs.c ++++ b/mkfs/xfs_mkfs.c +@@ -3180,9 +3180,10 @@ sb_set_features( + static void + align_log_size( + struct mkfs_params *cfg, +- int sunit) ++ int sunit, ++ int max_logblocks) + { +- uint64_t tmp_logblocks; ++ uint64_t tmp_logblocks; + + /* nothing to do if it's already aligned. */ + if ((cfg->logblocks % sunit) == 0) +@@ -3199,7 +3200,8 @@ _("log size %lld is not a multiple of the log stripe unit %d\n"), + + /* If the log is too large, round down instead of round up */ + if ((tmp_logblocks > XFS_MAX_LOG_BLOCKS) || +- ((tmp_logblocks << cfg->blocklog) > XFS_MAX_LOG_BYTES)) { ++ ((tmp_logblocks << cfg->blocklog) > XFS_MAX_LOG_BYTES) || ++ tmp_logblocks > max_logblocks) { + tmp_logblocks = (cfg->logblocks / sunit) * sunit; + } + cfg->logblocks = tmp_logblocks; +@@ -3213,7 +3215,8 @@ static void + align_internal_log( + struct mkfs_params *cfg, + struct xfs_mount *mp, +- int sunit) ++ int sunit, ++ int max_logblocks) + { + uint64_t logend; + +@@ -3231,7 +3234,7 @@ _("Due to stripe alignment, the internal log start (%lld) cannot be aligned\n" + } + + /* round up/down the log size now */ +- align_log_size(cfg, sunit); ++ align_log_size(cfg, sunit, max_logblocks); + + /* check the aligned log still starts and ends in the same AG. 
*/ + logend = cfg->logstart + cfg->logblocks - 1; +@@ -3309,7 +3312,7 @@ _("external log device size %lld blocks too small, must be at least %lld blocks\ + cfg->logstart = 0; + cfg->logagno = 0; + if (cfg->lsunit) +- align_log_size(cfg, cfg->lsunit); ++ align_log_size(cfg, cfg->lsunit, XFS_MAX_LOG_BLOCKS); + + validate_log_size(cfg->logblocks, cfg->blocklog, min_logblocks); + return; +@@ -3386,9 +3389,9 @@ _("log ag number %lld too large, must be less than %lld\n"), + * Align the logstart at stripe unit boundary. + */ + if (cfg->lsunit) { +- align_internal_log(cfg, mp, cfg->lsunit); ++ align_internal_log(cfg, mp, cfg->lsunit, max_logblocks); + } else if (cfg->dsunit) { +- align_internal_log(cfg, mp, cfg->dsunit); ++ align_internal_log(cfg, mp, cfg->dsunit, max_logblocks); + } + validate_log_size(cfg->logblocks, cfg->blocklog, min_logblocks); + } +-- +1.8.3.1 + diff --git a/0025-mkfs-don-t-let-internal-logs-bump-the-root-dir-inode.patch b/0025-mkfs-don-t-let-internal-logs-bump-the-root-dir-inode.patch new file mode 100644 index 0000000000000000000000000000000000000000..c40c93ef3ea9ac3e766c208a8b82147313347577 --- /dev/null +++ b/0025-mkfs-don-t-let-internal-logs-bump-the-root-dir-inode.patch @@ -0,0 +1,105 @@ +From 1b580a773a65eb9b2fe7f777dd6900c0d6e9a7b3 Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Tue, 17 May 2022 22:48:13 -0400 +Subject: [PATCH] mkfs: don't let internal logs bump the root dir inode chunk + to AG 1 + +Currently, we don't let an internal log consume every last block in an +AG. 
According to the comment, we're doing this to avoid tripping AGF +verifiers if freeblks==0, but on a modern filesystem this isn't +sufficient to avoid problems because we need to have enough space in the +AG to allocate an aligned root inode chunk, if it should be the case +that the log also ends up in AG 0: + +$ truncate -s 6366g /tmp/a ; mkfs.xfs -f /tmp/a -d agcount=3200 -l agnum=0 +meta-data=/tmp/a isize=512 agcount=3200, agsize=521503 blks + = sectsz=512 attr=2, projid32bit=1 + = crc=1 finobt=1, sparse=1, rmapbt=0 + = reflink=1 bigtime=0 inobtcount=0 +data = bsize=4096 blocks=1668808704, imaxpct=5 + = sunit=0 swidth=0 blks +naming =version 2 bsize=4096 ascii-ci=0, ftype=1 +log =internal log bsize=4096 blocks=521492, version=2 + = sectsz=512 sunit=0 blks, lazy-count=1 +realtime =none extsz=4096 blocks=0, rtextents=0 +mkfs.xfs: root inode created in AG 1, not AG 0 + +Therefore, modify the maximum internal log size calculation to constrain +the maximum internal log size so that the aligned inode chunk allocation +will always succeed. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Signed-off-by: Eric Sandeen +--- + mkfs/xfs_mkfs.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 47 insertions(+) + +diff --git a/mkfs/xfs_mkfs.c b/mkfs/xfs_mkfs.c +index eb4d7fa..0b1fb74 100644 +--- a/mkfs/xfs_mkfs.c ++++ b/mkfs/xfs_mkfs.c +@@ -3271,6 +3271,49 @@ validate_log_size(uint64_t logblocks, int blocklog, int min_logblocks) + } + + static void ++adjust_ag0_internal_logblocks( ++ struct mkfs_params *cfg, ++ struct xfs_mount *mp, ++ int min_logblocks, ++ int *max_logblocks) ++{ ++ int backoff = 0; ++ int ichunk_blocks; ++ ++ /* ++ * mkfs will trip over the write verifiers if the log is allocated in ++ * AG 0 and consumes enough space that we cannot allocate a non-sparse ++ * inode chunk for the root directory. 
The inode allocator requires ++ * that the AG have enough free space for the chunk itself plus enough ++ * to fix up the freelist with aligned blocks if we need to fill the ++ * allocation from the AGFL. ++ */ ++ ichunk_blocks = XFS_INODES_PER_CHUNK * cfg->inodesize >> cfg->blocklog; ++ backoff = ichunk_blocks * 4; ++ ++ /* ++ * We try to align inode allocations to the data device stripe unit, ++ * so ensure there's enough space to perform an aligned allocation. ++ * The inode geometry structure isn't set up yet, so compute this by ++ * hand. ++ */ ++ backoff = max(backoff, cfg->dsunit * 2); ++ ++ *max_logblocks -= backoff; ++ ++ /* If the specified log size is too big, complain. */ ++ if (cli_opt_set(&lopts, L_SIZE) && cfg->logblocks > *max_logblocks) { ++ fprintf(stderr, ++_("internal log size %lld too large, must be less than %d\n"), ++ (long long)cfg->logblocks, ++ *max_logblocks); ++ usage(); ++ } ++ ++ cfg->logblocks = min(cfg->logblocks, *max_logblocks); ++} ++ ++static void + calculate_log_size( + struct mkfs_params *cfg, + struct cli_params *cli, +@@ -3382,6 +3425,10 @@ _("log ag number %lld too large, must be less than %lld\n"), + } else + cfg->logagno = (xfs_agnumber_t)(sbp->sb_agcount / 2); + ++ if (cfg->logagno == 0) ++ adjust_ag0_internal_logblocks(cfg, mp, min_logblocks, ++ &max_logblocks); ++ + cfg->logstart = XFS_AGB_TO_FSB(mp, cfg->logagno, + libxfs_prealloc_blocks(mp)); + +-- +1.8.3.1 + diff --git a/0026-mkfs-improve-log-extent-validation.patch b/0026-mkfs-improve-log-extent-validation.patch new file mode 100644 index 0000000000000000000000000000000000000000..246f5aa4cd2724a9bc5594893382693fd4952372 --- /dev/null +++ b/0026-mkfs-improve-log-extent-validation.patch @@ -0,0 +1,73 @@ +From 93a199f21dd12fdef4cbcb6821e58e2c301727e2 Mon Sep 17 00:00:00 2001 +From: "Darrick J. 
Wong" +Date: Tue, 17 May 2022 22:48:13 -0400 +Subject: [PATCH] mkfs: improve log extent validation + +Use the standard libxfs fsblock verifiers to check the start and end of +the internal log. The current code does not catch the case of a +(segmented) fsblock that is beyond agf_blocks but not so large to change +the agno part of the segmented fsblock. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Signed-off-by: Eric Sandeen +--- + libxfs/libxfs_api_defs.h | 1 + + mkfs/xfs_mkfs.c | 10 ++++------ + 2 files changed, 5 insertions(+), 6 deletions(-) + +diff --git a/libxfs/libxfs_api_defs.h b/libxfs/libxfs_api_defs.h +index 8abbd23..370ad8b 100644 +--- a/libxfs/libxfs_api_defs.h ++++ b/libxfs/libxfs_api_defs.h +@@ -208,6 +208,7 @@ + #define xfs_verify_agino libxfs_verify_agino + #define xfs_verify_cksum libxfs_verify_cksum + #define xfs_verify_dir_ino libxfs_verify_dir_ino ++#define xfs_verify_fsbext libxfs_verify_fsbext + #define xfs_verify_fsbno libxfs_verify_fsbno + #define xfs_verify_ino libxfs_verify_ino + #define xfs_verify_rtbno libxfs_verify_rtbno +diff --git a/mkfs/xfs_mkfs.c b/mkfs/xfs_mkfs.c +index 0b1fb74..b932aca 100644 +--- a/mkfs/xfs_mkfs.c ++++ b/mkfs/xfs_mkfs.c +@@ -3218,15 +3218,13 @@ align_internal_log( + int sunit, + int max_logblocks) + { +- uint64_t logend; +- + /* round up log start if necessary */ + if ((cfg->logstart % sunit) != 0) + cfg->logstart = ((cfg->logstart + (sunit - 1)) / sunit) * sunit; + + /* If our log start overlaps the next AG's metadata, fail. 
*/ +- if (XFS_FSB_TO_AGBNO(mp, cfg->logstart) <= XFS_AGFL_BLOCK(mp)) { +- fprintf(stderr, ++ if (!libxfs_verify_fsbno(mp, cfg->logstart)) { ++ fprintf(stderr, + _("Due to stripe alignment, the internal log start (%lld) cannot be aligned\n" + "within an allocation group.\n"), + (long long) cfg->logstart); +@@ -3237,8 +3235,7 @@ _("Due to stripe alignment, the internal log start (%lld) cannot be aligned\n" + align_log_size(cfg, sunit, max_logblocks); + + /* check the aligned log still starts and ends in the same AG. */ +- logend = cfg->logstart + cfg->logblocks - 1; +- if (XFS_FSB_TO_AGNO(mp, cfg->logstart) != XFS_FSB_TO_AGNO(mp, logend)) { ++ if (!libxfs_verify_fsbext(mp, cfg->logstart, cfg->logblocks)) { + fprintf(stderr, + _("Due to stripe alignment, the internal log size (%lld) is too large.\n" + "Must fit within an allocation group.\n"), +@@ -3465,6 +3462,7 @@ start_superblock_setup( + sbp->sb_agblocks = (xfs_agblock_t)cfg->agsize; + sbp->sb_agblklog = (uint8_t)log2_roundup(cfg->agsize); + sbp->sb_agcount = (xfs_agnumber_t)cfg->agcount; ++ sbp->sb_dblocks = (xfs_rfsblock_t)cfg->dblocks; + + sbp->sb_inodesize = (uint16_t)cfg->inodesize; + sbp->sb_inodelog = (uint8_t)cfg->inodelog; +-- +1.8.3.1 + diff --git a/0027-xfs_repair-detect-v5-featureset-mismatches-in-second.patch b/0027-xfs_repair-detect-v5-featureset-mismatches-in-second.patch new file mode 100644 index 0000000000000000000000000000000000000000..f60420f41405bf1d405d5d235028e1033a4643fb --- /dev/null +++ b/0027-xfs_repair-detect-v5-featureset-mismatches-in-second.patch @@ -0,0 +1,130 @@ +From 2b7301269e82e86d9601392d289e38f3f66b1467 Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Tue, 17 May 2022 22:48:13 -0400 +Subject: [PATCH] xfs_repair: detect v5 featureset mismatches in secondary + supers + +Make sure we detect and correct mismatches between the V5 features +described in the primary and the secondary superblocks. + +Signed-off-by: Darrick J. 
Wong +Reviewed-by: Eric Sandeen +[sandeen: add comment about XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR] +Signed-off-by: Eric Sandeen +--- + repair/agheader.c | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 92 insertions(+) + +diff --git a/repair/agheader.c b/repair/agheader.c +index 2af2410..90adc1f 100644 +--- a/repair/agheader.c ++++ b/repair/agheader.c +@@ -221,6 +221,96 @@ compare_sb(xfs_mount_t *mp, xfs_sb_t *sb) + } + + /* ++ * If the fs feature bits on a secondary superblock don't match the ++ * primary, we need to update them. ++ */ ++static inline int ++check_v5_feature_mismatch( ++ struct xfs_mount *mp, ++ xfs_agnumber_t agno, ++ struct xfs_sb *sb) ++{ ++ bool dirty = false; ++ ++ if (!xfs_sb_version_hascrc(&mp->m_sb) || agno == 0) ++ return 0; ++ ++ if (mp->m_sb.sb_features_compat != sb->sb_features_compat) { ++ if (no_modify) { ++ do_warn( ++ _("would fix compat feature mismatch in AG %u super, 0x%x != 0x%x\n"), ++ agno, mp->m_sb.sb_features_compat, ++ sb->sb_features_compat); ++ } else { ++ do_warn( ++ _("will fix compat feature mismatch in AG %u super, 0x%x != 0x%x\n"), ++ agno, mp->m_sb.sb_features_compat, ++ sb->sb_features_compat); ++ dirty = true; ++ } ++ } ++ ++ /* ++ * Ignore XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR becauses the repair upgrade ++ * path sets it only on the primary while upgrading. 
++ */ ++ if ((mp->m_sb.sb_features_incompat ^ sb->sb_features_incompat) & ++ ~XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR) { ++ if (no_modify) { ++ do_warn( ++ _("would fix incompat feature mismatch in AG %u super, 0x%x != 0x%x\n"), ++ agno, mp->m_sb.sb_features_incompat, ++ sb->sb_features_incompat); ++ } else { ++ do_warn( ++ _("will fix incompat feature mismatch in AG %u super, 0x%x != 0x%x\n"), ++ agno, mp->m_sb.sb_features_incompat, ++ sb->sb_features_incompat); ++ dirty = true; ++ } ++ } ++ ++ if (mp->m_sb.sb_features_ro_compat != sb->sb_features_ro_compat) { ++ if (no_modify) { ++ do_warn( ++ _("would fix ro compat feature mismatch in AG %u super, 0x%x != 0x%x\n"), ++ agno, mp->m_sb.sb_features_ro_compat, ++ sb->sb_features_ro_compat); ++ } else { ++ do_warn( ++ _("will fix ro compat feature mismatch in AG %u super, 0x%x != 0x%x\n"), ++ agno, mp->m_sb.sb_features_ro_compat, ++ sb->sb_features_ro_compat); ++ dirty = true; ++ } ++ } ++ ++ if (mp->m_sb.sb_features_log_incompat != sb->sb_features_log_incompat) { ++ if (no_modify) { ++ do_warn( ++ _("would fix log incompat feature mismatch in AG %u super, 0x%x != 0x%x\n"), ++ agno, mp->m_sb.sb_features_log_incompat, ++ sb->sb_features_log_incompat); ++ } else { ++ do_warn( ++ _("will fix log incompat feature mismatch in AG %u super, 0x%x != 0x%x\n"), ++ agno, mp->m_sb.sb_features_log_incompat, ++ sb->sb_features_log_incompat); ++ dirty = true; ++ } ++ } ++ ++ if (!dirty) ++ return 0; ++ ++ sb->sb_features_compat = mp->m_sb.sb_features_compat; ++ sb->sb_features_ro_compat = mp->m_sb.sb_features_ro_compat; ++ sb->sb_features_incompat = mp->m_sb.sb_features_incompat; ++ sb->sb_features_log_incompat = mp->m_sb.sb_features_log_incompat; ++ return XR_AG_SB_SEC; ++} ++ ++/* + * Possible fields that may have been set at mkfs time, + * sb_inoalignmt, sb_unit, sb_width and sb_dirblklog. + * The quota inode fields in the secondaries should be zero. 
+@@ -452,6 +542,8 @@ secondary_sb_whack( + rval |= XR_AG_SB_SEC; + } + ++ rval |= check_v5_feature_mismatch(mp, i, sb); ++ + if (xfs_sb_version_needsrepair(sb)) { + if (i == 0) { + if (!no_modify) +-- +1.8.3.1 + diff --git a/0028-xfs_repair-check-the-ftype-of-dot-and-dotdot-directo.patch b/0028-xfs_repair-check-the-ftype-of-dot-and-dotdot-directo.patch new file mode 100644 index 0000000000000000000000000000000000000000..d5a5f4236a9a1c2e25740ee7a75e227baf493f34 --- /dev/null +++ b/0028-xfs_repair-check-the-ftype-of-dot-and-dotdot-directo.patch @@ -0,0 +1,134 @@ +From 5008cbb4b0eaef22e5a0e13a5a2c17457671e34a Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Tue, 17 May 2022 22:48:13 -0400 +Subject: [PATCH] xfs_repair: check the ftype of dot and dotdot directory + entries + +The long-format directory block checking code skips the filetype check +for the '.' and '..' entries, even though they're part of the ondisk +format. This leads to repair failing to catch subtle corruption at the +start of a directory. + +Found by fuzzing bu[0].filetype = zeroes in xfs/386. + +Signed-off-by: Darrick J. 
Wong +Reviewed-by: Eric Sandeen +Signed-off-by: Eric Sandeen +--- + repair/phase6.c | 79 +++++++++++++++++++++++++++++++++++++++------------------ + 1 file changed, 54 insertions(+), 25 deletions(-) + +diff --git a/repair/phase6.c b/repair/phase6.c +index 696a642..06232fb 100644 +--- a/repair/phase6.c ++++ b/repair/phase6.c +@@ -1412,6 +1412,48 @@ dir2_kill_block( + _("directory shrink failed (%d)\n"), error); + } + ++static inline void ++check_longform_ftype( ++ struct xfs_mount *mp, ++ struct xfs_inode *ip, ++ xfs_dir2_data_entry_t *dep, ++ ino_tree_node_t *irec, ++ int ino_offset, ++ struct dir_hash_tab *hashtab, ++ xfs_dir2_dataptr_t addr, ++ struct xfs_da_args *da, ++ struct xfs_buf *bp) ++{ ++ xfs_ino_t inum = be64_to_cpu(dep->inumber); ++ uint8_t dir_ftype; ++ uint8_t ino_ftype; ++ ++ if (!xfs_sb_version_hasftype(&mp->m_sb)) ++ return; ++ ++ dir_ftype = libxfs_dir2_data_get_ftype(mp, dep); ++ ino_ftype = get_inode_ftype(irec, ino_offset); ++ ++ if (dir_ftype == ino_ftype) ++ return; ++ ++ if (no_modify) { ++ do_warn( ++_("would fix ftype mismatch (%d/%d) in directory/child inode %" PRIu64 "/%" PRIu64 "\n"), ++ dir_ftype, ino_ftype, ++ ip->i_ino, inum); ++ return; ++ } ++ ++ do_warn( ++_("fixing ftype mismatch (%d/%d) in directory/child inode %" PRIu64 "/%" PRIu64 "\n"), ++ dir_ftype, ino_ftype, ++ ip->i_ino, inum); ++ libxfs_dir2_data_put_ftype(mp, dep, ino_ftype); ++ libxfs_dir2_data_log_entry(da, bp, dep); ++ dir_hash_update_ftype(hashtab, addr, ino_ftype); ++} ++ + /* + * process a data block, also checks for .. entry + * and corrects it to match what we think .. 
should be +@@ -1749,6 +1791,11 @@ longform_dir2_entry_check_data( + libxfs_dir2_data_log_entry(&da, bp, dep); + } + } ++ ++ if (!nbad) ++ check_longform_ftype(mp, ip, dep, irec, ++ ino_offset, hashtab, addr, &da, ++ bp); + continue; + } + ASSERT(no_modify || libxfs_verify_dir_ino(mp, inum)); +@@ -1777,6 +1824,11 @@ longform_dir2_entry_check_data( + libxfs_dir2_data_log_entry(&da, bp, dep); + } + } ++ ++ if (!nbad) ++ check_longform_ftype(mp, ip, dep, irec, ++ ino_offset, hashtab, addr, &da, ++ bp); + *need_dot = 0; + continue; + } +@@ -1787,31 +1839,8 @@ longform_dir2_entry_check_data( + continue; + + /* validate ftype field if supported */ +- if (xfs_sb_version_hasftype(&mp->m_sb)) { +- uint8_t dir_ftype; +- uint8_t ino_ftype; +- +- dir_ftype = libxfs_dir2_data_get_ftype(mp, dep); +- ino_ftype = get_inode_ftype(irec, ino_offset); +- +- if (dir_ftype != ino_ftype) { +- if (no_modify) { +- do_warn( +- _("would fix ftype mismatch (%d/%d) in directory/child inode %" PRIu64 "/%" PRIu64 "\n"), +- dir_ftype, ino_ftype, +- ip->i_ino, inum); +- } else { +- do_warn( +- _("fixing ftype mismatch (%d/%d) in directory/child inode %" PRIu64 "/%" PRIu64 "\n"), +- dir_ftype, ino_ftype, +- ip->i_ino, inum); +- libxfs_dir2_data_put_ftype(mp, dep, ino_ftype); +- libxfs_dir2_data_log_entry(&da, bp, dep); +- dir_hash_update_ftype(hashtab, addr, +- ino_ftype); +- } +- } +- } ++ check_longform_ftype(mp, ip, dep, irec, ino_offset, hashtab, ++ addr, &da, bp); + + /* + * check easy case first, regular inode, just bump +-- +1.8.3.1 + diff --git a/0029-mkfs-Fix-memory-leak.patch b/0029-mkfs-Fix-memory-leak.patch new file mode 100644 index 0000000000000000000000000000000000000000..5c65db5baf8aadac0175a027dcd997810863f4f7 --- /dev/null +++ b/0029-mkfs-Fix-memory-leak.patch @@ -0,0 +1,32 @@ +From 8b4002e0cd0072dd69d478ed662f7cf546bae33b Mon Sep 17 00:00:00 2001 +From: Pavel Reichl +Date: Fri, 27 May 2022 16:36:21 -0400 +Subject: [PATCH] mkfs: Fix memory leak + +'value' is allocated by strdup() 
in getstr(). It +needs to be freed as we do not keep any permanent +reference to it. + +Signed-off-by: Pavel Reichl +Reviewed-by: Darrick J. Wong +Reviewed-by: Chaitanya Kulkarni +Signed-off-by: Eric Sandeen +--- + mkfs/xfs_mkfs.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/mkfs/xfs_mkfs.c b/mkfs/xfs_mkfs.c +index 01d2e8c..a37d684 100644 +--- a/mkfs/xfs_mkfs.c ++++ b/mkfs/xfs_mkfs.c +@@ -1714,6 +1714,7 @@ naming_opts_parser( + } else { + cli->sb_feat.dir_version = getnum(value, opts, subopt); + } ++ free((char *)value); + break; + case N_FTYPE: + cli->sb_feat.dirftype = getnum(value, opts, subopt); +-- +1.8.3.1 + diff --git a/0030-xfs-zero-inode-fork-buffer-at-allocation.patch b/0030-xfs-zero-inode-fork-buffer-at-allocation.patch new file mode 100644 index 0000000000000000000000000000000000000000..7bbe52fd989e9a04e7fad07974c9485c28dc9e21 --- /dev/null +++ b/0030-xfs-zero-inode-fork-buffer-at-allocation.patch @@ -0,0 +1,58 @@ +From 5a282e43fd719e37b866f797c9aacac199d08a19 Mon Sep 17 00:00:00 2001 +From: Dave Chinner +Date: Wed, 22 Jun 2022 14:28:52 -0500 +Subject: [PATCH] xfs: zero inode fork buffer at allocation + +Source kernel commit: cb512c921639613ce03f87e62c5e93ed9fe8c84d + +When we first allocate or resize an inline inode fork, we round up +the allocation to 4 byte alignment to make journal alignment +constraints. We don't clear the unused bytes, so we can copy up to +three uninitialised bytes into the journal. Zero those bytes so we +only ever copy zeros into the journal. + +Signed-off-by: Dave Chinner +Reviewed-by: Darrick J. 
Wong +Reviewed-by: Allison Henderson +Signed-off-by: Dave Chinner +Signed-off-by: Eric Sandeen +--- + libxfs/xfs_inode_fork.c | 12 +++++++++--- + 1 file changed, 9 insertions(+), 3 deletions(-) + +diff --git a/libxfs/xfs_inode_fork.c b/libxfs/xfs_inode_fork.c +index da59232..ac3692b 100644 +--- a/libxfs/xfs_inode_fork.c ++++ b/libxfs/xfs_inode_fork.c +@@ -48,8 +48,13 @@ xfs_init_local_fork( + mem_size++; + + if (size) { ++ /* ++ * As we round up the allocation here, we need to ensure the ++ * bytes we don't copy data into are zeroed because the log ++ * vectors still copy them into the journal. ++ */ + real_size = roundup(mem_size, 4); +- ifp->if_u1.if_data = kmem_alloc(real_size, KM_NOFS); ++ ifp->if_u1.if_data = kmem_zalloc(real_size, KM_NOFS); + memcpy(ifp->if_u1.if_data, data, size); + if (zero_terminate) + ifp->if_u1.if_data[size] = '\0'; +@@ -498,10 +503,11 @@ xfs_idata_realloc( + /* + * For inline data, the underlying buffer must be a multiple of 4 bytes + * in size so that it can be logged and stay on word boundaries. +- * We enforce that here. ++ * We enforce that here, and use __GFP_ZERO to ensure that size ++ * extensions always zero the unused roundup area. 
+ */ + ifp->if_u1.if_data = krealloc(ifp->if_u1.if_data, roundup(new_size, 4), +- GFP_NOFS | __GFP_NOFAIL); ++ GFP_NOFS | __GFP_NOFAIL | __GFP_ZERO); + ifp->if_bytes = new_size; + } + +-- +1.8.3.1 + diff --git a/0031-xfs-detect-self-referencing-btree-sibling-pointers.patch b/0031-xfs-detect-self-referencing-btree-sibling-pointers.patch new file mode 100644 index 0000000000000000000000000000000000000000..226d1ce18eab3814dd00732bb10e54f9edcb9cfa --- /dev/null +++ b/0031-xfs-detect-self-referencing-btree-sibling-pointers.patch @@ -0,0 +1,237 @@ +From 393859c7a197c8187ffec131ec80cca697f8bf79 Mon Sep 17 00:00:00 2001 +From: Dave Chinner +Date: Wed, 22 Jun 2022 14:28:52 -0500 +Subject: [PATCH] xfs: detect self referencing btree sibling pointers + +Source kernel commit: dc04db2aa7c9307e740d6d0e173085301c173b1a + +To catch the obvious graph cycle problem and hence potential endless +looping. + +Signed-off-by: Dave Chinner +Reviewed-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Dave Chinner +Signed-off-by: Eric Sandeen +--- + libxfs/xfs_btree.c | 140 +++++++++++++++++++++++++++++++++++++++-------------- + 1 file changed, 105 insertions(+), 35 deletions(-) + +diff --git a/libxfs/xfs_btree.c b/libxfs/xfs_btree.c +index 8455f26..d9a82e7 100644 +--- a/libxfs/xfs_btree.c ++++ b/libxfs/xfs_btree.c +@@ -48,6 +48,52 @@ xfs_btree_magic( + return magic; + } + ++static xfs_failaddr_t ++xfs_btree_check_lblock_siblings( ++ struct xfs_mount *mp, ++ struct xfs_btree_cur *cur, ++ int level, ++ xfs_fsblock_t fsb, ++ xfs_fsblock_t sibling) ++{ ++ if (sibling == NULLFSBLOCK) ++ return NULL; ++ if (sibling == fsb) ++ return __this_address; ++ if (level >= 0) { ++ if (!xfs_btree_check_lptr(cur, sibling, level + 1)) ++ return __this_address; ++ } else { ++ if (!xfs_verify_fsbno(mp, sibling)) ++ return __this_address; ++ } ++ ++ return NULL; ++} ++ ++static xfs_failaddr_t ++xfs_btree_check_sblock_siblings( ++ struct xfs_mount *mp, ++ struct xfs_btree_cur *cur, ++ int level, 
++ xfs_agnumber_t agno, ++ xfs_agblock_t agbno, ++ xfs_agblock_t sibling) ++{ ++ if (sibling == NULLAGBLOCK) ++ return NULL; ++ if (sibling == agbno) ++ return __this_address; ++ if (level >= 0) { ++ if (!xfs_btree_check_sptr(cur, sibling, level + 1)) ++ return __this_address; ++ } else { ++ if (!xfs_verify_agbno(mp, agno, sibling)) ++ return __this_address; ++ } ++ return NULL; ++} ++ + /* + * Check a long btree block header. Return the address of the failing check, + * or NULL if everything is ok. +@@ -62,6 +108,8 @@ __xfs_btree_check_lblock( + struct xfs_mount *mp = cur->bc_mp; + xfs_btnum_t btnum = cur->bc_btnum; + int crc = xfs_sb_version_hascrc(&mp->m_sb); ++ xfs_failaddr_t fa; ++ xfs_fsblock_t fsb = NULLFSBLOCK; + + if (crc) { + if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)) +@@ -80,16 +128,16 @@ __xfs_btree_check_lblock( + if (be16_to_cpu(block->bb_numrecs) > + cur->bc_ops->get_maxrecs(cur, level)) + return __this_address; +- if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) && +- !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_leftsib), +- level + 1)) +- return __this_address; +- if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) && +- !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_rightsib), +- level + 1)) +- return __this_address; + +- return NULL; ++ if (bp) ++ fsb = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp)); ++ ++ fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb, ++ be64_to_cpu(block->bb_u.l.bb_leftsib)); ++ if (!fa) ++ fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb, ++ be64_to_cpu(block->bb_u.l.bb_rightsib)); ++ return fa; + } + + /* Check a long btree block header. 
*/ +@@ -127,6 +175,9 @@ __xfs_btree_check_sblock( + struct xfs_mount *mp = cur->bc_mp; + xfs_btnum_t btnum = cur->bc_btnum; + int crc = xfs_sb_version_hascrc(&mp->m_sb); ++ xfs_failaddr_t fa; ++ xfs_agblock_t agbno = NULLAGBLOCK; ++ xfs_agnumber_t agno = NULLAGNUMBER; + + if (crc) { + if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) +@@ -143,16 +194,18 @@ __xfs_btree_check_sblock( + if (be16_to_cpu(block->bb_numrecs) > + cur->bc_ops->get_maxrecs(cur, level)) + return __this_address; +- if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) && +- !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_leftsib), +- level + 1)) +- return __this_address; +- if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) && +- !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_rightsib), +- level + 1)) +- return __this_address; + +- return NULL; ++ if (bp) { ++ agbno = xfs_daddr_to_agbno(mp, XFS_BUF_ADDR(bp)); ++ agno = xfs_daddr_to_agno(mp, XFS_BUF_ADDR(bp)); ++ } ++ ++ fa = xfs_btree_check_sblock_siblings(mp, cur, level, agno, agbno, ++ be32_to_cpu(block->bb_u.s.bb_leftsib)); ++ if (!fa) ++ fa = xfs_btree_check_sblock_siblings(mp, cur, level, agno, ++ agbno, be32_to_cpu(block->bb_u.s.bb_rightsib)); ++ return fa; + } + + /* Check a short btree block header. */ +@@ -4265,6 +4318,21 @@ xfs_btree_visit_block( + if (xfs_btree_ptr_is_null(cur, &rptr)) + return -ENOENT; + ++ /* ++ * We only visit blocks once in this walk, so we have to avoid the ++ * internal xfs_btree_lookup_get_block() optimisation where it will ++ * return the same block without checking if the right sibling points ++ * back to us and creates a cyclic reference in the btree. 
++ */ ++ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { ++ if (be64_to_cpu(rptr.l) == XFS_DADDR_TO_FSB(cur->bc_mp, ++ XFS_BUF_ADDR(bp))) ++ return -EFSCORRUPTED; ++ } else { ++ if (be32_to_cpu(rptr.s) == xfs_daddr_to_agbno(cur->bc_mp, ++ XFS_BUF_ADDR(bp))) ++ return -EFSCORRUPTED; ++ } + return xfs_btree_lookup_get_block(cur, level, &rptr, &block); + } + +@@ -4439,20 +4507,21 @@ xfs_btree_lblock_verify( + { + struct xfs_mount *mp = bp->b_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); ++ xfs_fsblock_t fsb; ++ xfs_failaddr_t fa; + + /* numrecs verification */ + if (be16_to_cpu(block->bb_numrecs) > max_recs) + return __this_address; + + /* sibling pointer verification */ +- if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) && +- !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_leftsib))) +- return __this_address; +- if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) && +- !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_rightsib))) +- return __this_address; +- +- return NULL; ++ fsb = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp)); ++ fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb, ++ be64_to_cpu(block->bb_u.l.bb_leftsib)); ++ if (!fa) ++ fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb, ++ be64_to_cpu(block->bb_u.l.bb_rightsib)); ++ return fa; + } + + /** +@@ -4493,7 +4562,9 @@ xfs_btree_sblock_verify( + { + struct xfs_mount *mp = bp->b_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); +- xfs_agblock_t agno; ++ xfs_agnumber_t agno; ++ xfs_agblock_t agbno; ++ xfs_failaddr_t fa; + + /* numrecs verification */ + if (be16_to_cpu(block->bb_numrecs) > max_recs) +@@ -4501,14 +4572,13 @@ xfs_btree_sblock_verify( + + /* sibling pointer verification */ + agno = xfs_daddr_to_agno(mp, XFS_BUF_ADDR(bp)); +- if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) && +- !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_leftsib))) +- return __this_address; +- if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) && +- 
!xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_rightsib))) +- return __this_address; +- +- return NULL; ++ agbno = xfs_daddr_to_agbno(mp, XFS_BUF_ADDR(bp)); ++ fa = xfs_btree_check_sblock_siblings(mp, NULL, -1, agno, agbno, ++ be32_to_cpu(block->bb_u.s.bb_leftsib)); ++ if (!fa) ++ fa = xfs_btree_check_sblock_siblings(mp, NULL, -1, agno, agbno, ++ be32_to_cpu(block->bb_u.s.bb_rightsib)); ++ return fa; + } + + /* +-- +1.8.3.1 + diff --git a/0032-xfs-validate-inode-fork-size-against-fork-format.patch b/0032-xfs-validate-inode-fork-size-against-fork-format.patch new file mode 100644 index 0000000000000000000000000000000000000000..a995dac32106313a0b5525434180690e32c69dce --- /dev/null +++ b/0032-xfs-validate-inode-fork-size-against-fork-format.patch @@ -0,0 +1,84 @@ +From ff6aea290450f00e084cafe5b34901d26abdbc4a Mon Sep 17 00:00:00 2001 +From: Dave Chinner +Date: Wed, 22 Jun 2022 14:28:52 -0500 +Subject: [PATCH] xfs: validate inode fork size against fork format + +Source kernel commit: 1eb70f54c445fcbb25817841e774adb3d912f3e8 + +xfs_repair catches fork size/format mismatches, but the in-kernel +verifier doesn't, leading to null pointer failures when attempting +to perform operations on the fork. This can occur in the +xfs_dir_is_empty() where the in-memory fork format does not match +the size and so the fork data pointer is accessed incorrectly. + +Note: this causes new failures in xfs/348 which is testing mode vs +ftype mismatches. We now detect a regular file that has been changed +to a directory or symlink mode as being corrupt because the data +fork is for a symlink or directory should be in local form when +there are only 3 bytes of data in the data fork. Hence the inode +verify for the regular file now fires w/ -EFSCORRUPTED because +the inode fork format does not match the format the corrupted mode +says it should be in. + +Signed-off-by: Dave Chinner +Reviewed-by: Christoph Hellwig +Reviewed-by: Darrick J. 
Wong +Signed-off-by: Dave Chinner +Signed-off-by: Eric Sandeen +--- + libxfs/xfs_inode_buf.c | 35 ++++++++++++++++++++++++++--------- + 1 file changed, 26 insertions(+), 9 deletions(-) + +diff --git a/libxfs/xfs_inode_buf.c b/libxfs/xfs_inode_buf.c +index f98f5c4..7ecbfad 100644 +--- a/libxfs/xfs_inode_buf.c ++++ b/libxfs/xfs_inode_buf.c +@@ -334,19 +334,36 @@ xfs_dinode_verify_fork( + int whichfork) + { + uint32_t di_nextents = XFS_DFORK_NEXTENTS(dip, whichfork); ++ mode_t mode = be16_to_cpu(dip->di_mode); ++ uint32_t fork_size = XFS_DFORK_SIZE(dip, mp, whichfork); ++ uint32_t fork_format = XFS_DFORK_FORMAT(dip, whichfork); + +- switch (XFS_DFORK_FORMAT(dip, whichfork)) { ++ /* ++ * For fork types that can contain local data, check that the fork ++ * format matches the size of local data contained within the fork. ++ * ++ * For all types, check that when the size says the should be in extent ++ * or btree format, the inode isn't claiming it is in local format. ++ */ ++ if (whichfork == XFS_DATA_FORK) { ++ if (S_ISDIR(mode) || S_ISLNK(mode)) { ++ if (be64_to_cpu(dip->di_size) <= fork_size && ++ fork_format != XFS_DINODE_FMT_LOCAL) ++ return __this_address; ++ } ++ ++ if (be64_to_cpu(dip->di_size) > fork_size && ++ fork_format == XFS_DINODE_FMT_LOCAL) ++ return __this_address; ++ } ++ ++ switch (fork_format) { + case XFS_DINODE_FMT_LOCAL: + /* +- * no local regular files yet ++ * No local regular files yet + */ +- if (whichfork == XFS_DATA_FORK) { +- if (S_ISREG(be16_to_cpu(dip->di_mode))) +- return __this_address; +- if (be64_to_cpu(dip->di_size) > +- XFS_DFORK_SIZE(dip, mp, whichfork)) +- return __this_address; +- } ++ if (S_ISREG(mode) && whichfork == XFS_DATA_FORK) ++ return __this_address; + if (di_nextents) + return __this_address; + break; +-- +1.8.3.1 + diff --git a/0033-xfs_repair-always-rewrite-secondary-supers-when-need.patch b/0033-xfs_repair-always-rewrite-secondary-supers-when-need.patch new file mode 100644 index 
0000000000000000000000000000000000000000..e84ad1a4b5f09f47a8fc025c1b40242cd9947536 --- /dev/null +++ b/0033-xfs_repair-always-rewrite-secondary-supers-when-need.patch @@ -0,0 +1,110 @@ +From fa0f9232bd89e2955ee54e0be4adb6713a00d8b4 Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Tue, 12 Jul 2022 13:22:33 -0500 +Subject: [PATCH] xfs_repair: always rewrite secondary supers when needsrepair + is set + +Dave Chinner complained about xfs_scrub failures coming from xfs/158. +That test induces xfs_repair to fail while upgrading a filesystem to +have the inobtcount feature, and then restarts xfs_repair to finish the +upgrade. When the second xfs_repair run starts, it will find that the +primary super has NEEDSREPAIR set, along with whatever new feature that +we were trying to add to the filesystem. + +From there, repair completes the upgrade in much the same manner as the +first repair run would have, with one big exception -- it forgets to set +features_changed to trigger rewriting of the secondary supers at the end +of repair. This results in discrepancies between the supers: + +# XFS_REPAIR_FAIL_AFTER_PHASE=2 xfs_repair -c inobtcount=1 /dev/sdf +Phase 1 - find and verify superblock... +Phase 2 - using internal log + - zero log... + - scan filesystem freespace and inode maps... + - found root inode chunk +Adding inode btree counts to filesystem. +Killed +# xfs_repair /dev/sdf +Phase 1 - find and verify superblock... +Phase 2 - using internal log + - zero log... + - scan filesystem freespace and inode maps... +clearing needsrepair flag and regenerating metadata +bad inobt block count 0, saw 1 +bad finobt block count 0, saw 1 +bad inobt block count 0, saw 1 +bad finobt block count 0, saw 1 +bad inobt block count 0, saw 1 +bad finobt block count 0, saw 1 +bad inobt block count 0, saw 1 +bad finobt block count 0, saw 1 + - found root inode chunk +Phase 3 - for each AG... + - scan and clear agi unlinked lists... + - process known inodes and perform inode discovery... 
+ - agno = 0 + - agno = 1 + - agno = 2 + - agno = 3 + - process newly discovered inodes... +Phase 4 - check for duplicate blocks... + - setting up duplicate extent list... + - check for inodes claiming duplicate blocks... + - agno = 1 + - agno = 2 + - agno = 0 + - agno = 3 +Phase 5 - rebuild AG headers and trees... + - reset superblock... +Phase 6 - check inode connectivity... + - resetting contents of realtime bitmap and summary inodes + - traversing filesystem ... + - traversal finished ... + - moving disconnected inodes to lost+found ... +Phase 7 - verify and correct link counts... +done +# xfs_db -c 'sb 0' -c 'print' -c 'sb 1' -c 'print' /dev/sdf | \ + egrep '(features_ro_compat|features_incompat)' +features_ro_compat = 0xd +features_incompat = 0xb +features_ro_compat = 0x5 +features_incompat = 0xb + +Curiously, re-running xfs_repair will not trigger any warnings about the +featureset mismatch between the primary and secondary supers. xfs_scrub +immediately notices, which is what causes xfs/158 to fail. + +This discrepancy doesn't happen when the upgrade completes successfully +in a single repair run, so we need to teach repair to rewrite the +secondaries at the end of repair any time needsrepair was set. + +Reported-by: Dave Chinner +Signed-off-by: Darrick J. Wong +Reviewed-by: Dave Chinner +Signed-off-by: Eric Sandeen +--- + repair/agheader.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +diff --git a/repair/agheader.c b/repair/agheader.c +index 36da139..e91509d 100644 +--- a/repair/agheader.c ++++ b/repair/agheader.c +@@ -552,6 +552,14 @@ secondary_sb_whack( + else + do_warn( + _("would clear needsrepair flag and regenerate metadata\n")); ++ /* ++ * If needsrepair is set on the primary super, there's ++ * a possibility that repair crashed during an upgrade. ++ * Set features_changed to ensure that the secondary ++ * supers are rewritten with the new feature bits once ++ * we've finished the upgrade. 
++ */ ++ features_changed = true; + } else { + /* + * Quietly clear needsrepair on the secondary supers as +-- +1.8.3.1 + diff --git a/0034-xfs_repair-ignore-empty-xattr-leaf-blocks.patch b/0034-xfs_repair-ignore-empty-xattr-leaf-blocks.patch new file mode 100644 index 0000000000000000000000000000000000000000..65448ed4139356f51f350736416fe31a3c0d5287 --- /dev/null +++ b/0034-xfs_repair-ignore-empty-xattr-leaf-blocks.patch @@ -0,0 +1,55 @@ +From f50d3462c654acc484ab3ea68e75e8252b77e262 Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Wed, 13 Jul 2022 20:58:25 -0500 +Subject: [PATCH] xfs_repair: ignore empty xattr leaf blocks + +As detailed in the commit: + +5e572d1a xfs: empty xattr leaf header blocks are not corruption + +empty xattr leaf blocks can be the benign byproduct of the system +going down during the multi-step process of adding a large xattr +to a file that has no xattrs. If we find one at attr fork offset 0, +we should clear it, but this isn't a corruption. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Dave Chinner +Signed-off-by: Eric Sandeen +--- + repair/attr_repair.c | 20 ++++++++++++++++++++ + 1 file changed, 20 insertions(+) + +diff --git a/repair/attr_repair.c b/repair/attr_repair.c +index 2055d96..c3a6d50 100644 +--- a/repair/attr_repair.c ++++ b/repair/attr_repair.c +@@ -579,6 +579,26 @@ process_leaf_attr_block( + firstb = mp->m_sb.sb_blocksize; + stop = xfs_attr3_leaf_hdr_size(leaf); + ++ /* ++ * Empty leaf blocks at offset zero can occur as a race between ++ * setxattr and the system going down, so we only take action if we're ++ * running in modify mode. See xfs_attr3_leaf_verify for details of ++ * how we've screwed this up many times. 
++ */ ++ if (!leafhdr.count && da_bno == 0) { ++ if (no_modify) { ++ do_log( ++ _("would clear empty leaf attr block 0, inode %" PRIu64 "\n"), ++ ino); ++ return 0; ++ } ++ ++ do_warn( ++ _("will clear empty leaf attr block 0, inode %" PRIu64 "\n"), ++ ino); ++ return 1; ++ } ++ + /* does the count look sorta valid? */ + if (!leafhdr.count || + leafhdr.count * sizeof(xfs_attr_leaf_entry_t) + stop > +-- +1.8.3.1 + diff --git a/0035-xfs_repair-Search-for-conflicts-in-inode_tree_ptrs-w.patch b/0035-xfs_repair-Search-for-conflicts-in-inode_tree_ptrs-w.patch new file mode 100644 index 0000000000000000000000000000000000000000..dd45613af00429f75bb9e77ad519d960d488b665 --- /dev/null +++ b/0035-xfs_repair-Search-for-conflicts-in-inode_tree_ptrs-w.patch @@ -0,0 +1,45 @@ +From 91c1d0836aa4a228e76c0b8c5d83903f1f6bfdbb Mon Sep 17 00:00:00 2001 +From: Chandan Babu R +Date: Wed, 13 Jul 2022 20:58:27 -0500 +Subject: [PATCH] xfs_repair: Search for conflicts in inode_tree_ptrs[] when + processing uncertain inodes + +When processing an uncertain inode chunk record, if we lose 2 blocks worth of +inodes or 25% of the chunk, xfs_repair decides to ignore the chunk. Otherwise, +xfs_repair adds a new chunk record to inode_tree_ptrs[agno], marking each +inode as either free or used. However, before adding the new chunk record, +xfs_repair has to check for the existence of a conflicting record. + +The existing code incorrectly checks for the conflicting record in +inode_uncertain_tree_ptrs[agno]. This check will succeed since the inode chunk +record being processed was originally obtained from +inode_uncertain_tree_ptrs[agno]. + +This commit fixes the bug by changing xfs_repair to search +inode_tree_ptrs[agno] for conflicts. + +Signed-off-by: Chandan Babu R +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J.
Wong +Signed-off-by: Eric Sandeen +--- + repair/dino_chunks.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/repair/dino_chunks.c b/repair/dino_chunks.c +index 11b0eb5..80c52a4 100644 +--- a/repair/dino_chunks.c ++++ b/repair/dino_chunks.c +@@ -229,8 +229,7 @@ verify_inode_chunk(xfs_mount_t *mp, + /* + * ok, put the record into the tree, if no conflict. + */ +- if (find_uncertain_inode_rec(agno, +- XFS_AGB_TO_AGINO(mp, start_agbno))) ++ if (find_inode_rec(mp, agno, XFS_AGB_TO_AGINO(mp, start_agbno))) + return(0); + + start_agino = XFS_AGB_TO_AGINO(mp, start_agbno); +-- +1.8.3.1 + diff --git a/0036-mkfs-terminate-getsubopt-arrays-properly.patch b/0036-mkfs-terminate-getsubopt-arrays-properly.patch new file mode 100644 index 0000000000000000000000000000000000000000..394904016c9ac2a9ca9d4e9e134166c878d28c99 --- /dev/null +++ b/0036-mkfs-terminate-getsubopt-arrays-properly.patch @@ -0,0 +1,119 @@ +From cdf5cfe93ee14942665f3c6ae78a8bf1198e1798 Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Wed, 13 Jul 2022 20:58:28 -0500 +Subject: [PATCH] mkfs: terminate getsubopt arrays properly + +Having not drank any (or maybe too much) coffee this morning, I typed: + +$ mkfs.xfs -d agcount=3 -d nrext64=0 +Segmentation fault + +I traced this down to getsubopt walking off the end of the dopts.subopts +array. The manpage says you're supposed to terminate the suboptions +string array with a NULL entry, but the structure definition uses +MAX_SUBOPTS/D_MAX_OPTS directly, which means there is no terminator. + +Explicitly terminate each suboption array with a NULL entry after +making room for it. + +Signed-off-by: Darrick J. 
Wong +[sandeen: explicitly add NULL terminators & clarify comment] +Reviewed-by: Eric Sandeen +Signed-off-by: Eric Sandeen +--- + mkfs/xfs_mkfs.c | 16 ++++++++++++++-- + 1 file changed, 14 insertions(+), 2 deletions(-) + +diff --git a/mkfs/xfs_mkfs.c b/mkfs/xfs_mkfs.c +index fdf6d4a..5cd2f81 100644 +--- a/mkfs/xfs_mkfs.c ++++ b/mkfs/xfs_mkfs.c +@@ -132,8 +132,11 @@ enum { + M_MAX_OPTS, + }; + +-/* Just define the max options array size manually right now */ +-#define MAX_SUBOPTS D_MAX_OPTS ++/* ++ * Just define the max options array size manually to the largest ++ * enum right now, leaving room for a NULL terminator at the end ++ */ ++#define MAX_SUBOPTS (D_MAX_OPTS + 1) + + #define SUBOPT_NEEDS_VAL (-1LL) + #define MAX_CONFLICTS 8 +@@ -243,6 +246,7 @@ static struct opt_params bopts = { + .ini_section = "block", + .subopts = { + [B_SIZE] = "size", ++ [B_MAX_OPTS] = NULL, + }, + .subopt_params = { + { .index = B_SIZE, +@@ -269,6 +273,7 @@ static struct opt_params copts = { + .name = 'c', + .subopts = { + [C_OPTFILE] = "options", ++ [C_MAX_OPTS] = NULL, + }, + .subopt_params = { + { .index = C_OPTFILE, +@@ -298,6 +303,7 @@ static struct opt_params dopts = { + [D_EXTSZINHERIT] = "extszinherit", + [D_COWEXTSIZE] = "cowextsize", + [D_DAXINHERIT] = "daxinherit", ++ [D_MAX_OPTS] = NULL, + }, + .subopt_params = { + { .index = D_AGCOUNT, +@@ -434,6 +440,7 @@ static struct opt_params iopts = { + [I_ATTR] = "attr", + [I_PROJID32BIT] = "projid32bit", + [I_SPINODES] = "sparse", ++ [I_MAX_OPTS] = NULL, + }, + .subopt_params = { + { .index = I_ALIGN, +@@ -500,6 +507,7 @@ static struct opt_params lopts = { + [L_FILE] = "file", + [L_NAME] = "name", + [L_LAZYSBCNTR] = "lazy-count", ++ [L_MAX_OPTS] = NULL, + }, + .subopt_params = { + { .index = L_AGNUM, +@@ -592,6 +600,7 @@ static struct opt_params nopts = { + [N_SIZE] = "size", + [N_VERSION] = "version", + [N_FTYPE] = "ftype", ++ [N_MAX_OPTS] = NULL, + }, + .subopt_params = { + { .index = N_SIZE, +@@ -627,6 +636,7 @@ static struct 
opt_params ropts = { + [R_FILE] = "file", + [R_NAME] = "name", + [R_NOALIGN] = "noalign", ++ [R_MAX_OPTS] = NULL, + }, + .subopt_params = { + { .index = R_EXTSIZE, +@@ -674,6 +684,7 @@ static struct opt_params sopts = { + .subopts = { + [S_SIZE] = "size", + [S_SECTSIZE] = "sectsize", ++ [S_MAX_OPTS] = NULL, + }, + .subopt_params = { + { .index = S_SIZE, +@@ -710,6 +721,7 @@ static struct opt_params mopts = { + [M_REFLINK] = "reflink", + [M_INOBTCNT] = "inobtcount", + [M_BIGTIME] = "bigtime", ++ [M_MAX_OPTS] = NULL, + }, + .subopt_params = { + { .index = M_CRC, +-- +1.8.3.1 + diff --git a/0037-mkfs-complain-about-impossible-log-size-constraints.patch b/0037-mkfs-complain-about-impossible-log-size-constraints.patch new file mode 100644 index 0000000000000000000000000000000000000000..aa3ae6e404c1f78c463b5483b3f5ab84109fabba --- /dev/null +++ b/0037-mkfs-complain-about-impossible-log-size-constraints.patch @@ -0,0 +1,40 @@ +From db5b866537e78669f7b84590345b0c37f841f701 Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Thu, 4 Aug 2022 21:28:23 -0500 +Subject: [PATCH] mkfs: complain about impossible log size constraints + +xfs/042 trips over an impossible fs geometry when nrext64 is enabled. +The minimum log size calculation comes out to 4287 blocks, but the mkfs +parameters specify an AG size of 4096 blocks. This eventually causes +mkfs to complain that the autoselected log size doesn't meet the minimum +size, but we could be a little more explicit in pointing out that the +two size constraints make for an impossible geometry. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Eric Sandeen +Signed-off-by: Eric Sandeen +--- + mkfs/xfs_mkfs.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +diff --git a/mkfs/xfs_mkfs.c b/mkfs/xfs_mkfs.c +index 12994ed..9dd0e79 100644 +--- a/mkfs/xfs_mkfs.c ++++ b/mkfs/xfs_mkfs.c +@@ -3490,6 +3490,13 @@ _("external log device size %lld blocks too small, must be at least %lld blocks\ + * an AG. 
+ */ + max_logblocks = libxfs_alloc_ag_max_usable(mp) - 1; ++ if (max_logblocks < min_logblocks) { ++ fprintf(stderr, ++_("max log size %d smaller than min log size %d, filesystem is too small\n"), ++ max_logblocks, ++ min_logblocks); ++ usage(); ++ } + + /* internal log - if no size specified, calculate automatically */ + if (!cfg->logblocks) { +-- +1.8.3.1 + diff --git a/0038-xfs-trim-the-mapp-array-accordingly-in-xfs_da_grow_i.patch b/0038-xfs-trim-the-mapp-array-accordingly-in-xfs_da_grow_i.patch new file mode 100644 index 0000000000000000000000000000000000000000..4ae05a405a329c117d1a62951f074f7994359dbe --- /dev/null +++ b/0038-xfs-trim-the-mapp-array-accordingly-in-xfs_da_grow_i.patch @@ -0,0 +1,53 @@ +From 04d4c27afa3f2c0088e381102e68cfb6a96b3306 Mon Sep 17 00:00:00 2001 +From: Shida Zhang +Date: Fri, 18 Nov 2022 10:46:33 +0100 +Subject: [PATCH] xfs: trim the mapp array accordingly in xfs_da_grow_inode_int + +Source kernel commit: 44159659df8ca381b84261e11058b2176fa03ba0 + +Take a look at the for-loop in xfs_da_grow_inode_int: +====== +for(){ +nmap = min(XFS_BMAP_MAX_NMAP, count); +... +error = xfs_bmapi_write(...,&mapp[mapi], &nmap);//(..., $1, $2) +... +mapi += nmap; +} +===== +where $1 stands for the start address of the array, +while $2 is used to indicate the size of the array. + +The array $1 will advance by $nmap in each iteration after +the allocation of extents. +But the size $2 still remains unchanged, which is determined by +min(XFS_BMAP_MAX_NMAP, count). + +It seems that it has forgotten to trim the mapp array after each +iteration, so change it. + +Signed-off-by: Shida Zhang +Reviewed-by: Darrick J. 
Wong +Signed-off-by: Dave Chinner +Signed-off-by: Carlos Maiolino +--- + libxfs/xfs_da_btree.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/libxfs/xfs_da_btree.c b/libxfs/xfs_da_btree.c +index 9dc22f2..a068a01 100644 +--- a/libxfs/xfs_da_btree.c ++++ b/libxfs/xfs_da_btree.c +@@ -2188,8 +2188,8 @@ xfs_da_grow_inode_int( + */ + mapp = kmem_alloc(sizeof(*mapp) * count, 0); + for (b = *bno, mapi = 0; b < *bno + count; ) { +- nmap = min(XFS_BMAP_MAX_NMAP, count); + c = (int)(*bno + count - b); ++ nmap = min(XFS_BMAP_MAX_NMAP, c); + error = xfs_bmapi_write(tp, dp, b, c, + xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA, + args->total, &mapp[mapi], &nmap); +-- +1.8.3.1 + diff --git a/0039-xfs-fix-exception-caused-by-unexpected-illegal-bestc.patch b/0039-xfs-fix-exception-caused-by-unexpected-illegal-bestc.patch new file mode 100644 index 0000000000000000000000000000000000000000..b7f421038eb4945d57de0ba46dac514e57eb0bbe --- /dev/null +++ b/0039-xfs-fix-exception-caused-by-unexpected-illegal-bestc.patch @@ -0,0 +1,156 @@ +From 20798cc06315ec1581b87b3da7f868dff62a6efd Mon Sep 17 00:00:00 2001 +From: Guo Xuenan +Date: Fri, 18 Nov 2022 10:48:09 +0100 +Subject: [PATCH] xfs: fix exception caused by unexpected illegal bestcount in + leaf dir + +Source kernel commit: 13cf24e00665c9751951a422756d975812b71173 + +For leaf dir, in most cases, there should be as many bestfree slots +as the dir data blocks that can fit under i_size (except for [1]). + +Root cause is we don't examine the number of bestfree slots, when the slots +number less than dir data blocks, if we need to allocate new dir data +block and update the bestfree array, we will use the dir block number as +index to assign bestfree array, while we did not check the leaf buf +boundary which may cause UAF or other memory access problem. This issue +can also be triggered with test cases xfs/473 from fstests.
+ +According to Dave Chinner & Darrick's suggestion, adding buffer verifier +to detect this abnormal situation in time. +Simplify the testcase for fstest xfs/554 [1] + +The error log is shown as follows: +================================================================== +BUG: KASAN: use-after-free in xfs_dir2_leaf_addname+0x1995/0x1ac0 +Write of size 2 at addr ffff88810168b000 by task touch/1552 +CPU: 5 PID: 1552 Comm: touch Not tainted 6.0.0-rc3+ #101 +Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS +1.13.0-1ubuntu1.1 04/01/2014 +Call Trace: + +dump_stack_lvl+0x4d/0x66 +print_report.cold+0xf6/0x691 +kasan_report+0xa8/0x120 +xfs_dir2_leaf_addname+0x1995/0x1ac0 +xfs_dir_createname+0x58c/0x7f0 +xfs_create+0x7af/0x1010 +xfs_generic_create+0x270/0x5e0 +path_openat+0x270b/0x3450 +do_filp_open+0x1cf/0x2b0 +do_sys_openat2+0x46b/0x7a0 +do_sys_open+0xb7/0x130 +do_syscall_64+0x35/0x80 +entry_SYSCALL_64_after_hwframe+0x63/0xcd +RIP: 0033:0x7fe4d9e9312b +Code: 25 00 00 41 00 3d 00 00 41 00 74 4b 64 8b 04 25 18 00 00 00 85 c0 +75 67 44 89 e2 48 89 ee bf 9c ff ff ff b8 01 01 00 00 0f 05 <48> 3d 00 +f0 ff ff 0f 87 91 00 00 00 48 8b 4c 24 28 64 48 33 0c 25 +RSP: 002b:00007ffda4c16c20 EFLAGS: 00000246 ORIG_RAX: 0000000000000101 +RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007fe4d9e9312b +RDX: 0000000000000941 RSI: 00007ffda4c17f33 RDI: 00000000ffffff9c +RBP: 00007ffda4c17f33 R08: 0000000000000000 R09: 0000000000000000 +R10: 00000000000001b6 R11: 0000000000000246 R12: 0000000000000941 +R13: 00007fe4d9f631a4 R14: 00007ffda4c17f33 R15: 0000000000000000 + + +The buggy address belongs to the physical page: +page:ffffea000405a2c0 refcount:0 mapcount:0 mapping:0000000000000000 +index:0x0 pfn:0x10168b +flags: 0x2fffff80000000(node=0|zone=2|lastcpupid=0x1fffff) +raw: 002fffff80000000 ffffea0004057788 ffffea000402dbc8 0000000000000000 +raw: 0000000000000000 0000000000170000 00000000ffffffff 0000000000000000 +page dumped because: kasan: bad access detected + +Memory state 
around the buggy address: +ffff88810168af00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 +ffff88810168af80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 +>ffff88810168b000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff +^ +ffff88810168b080: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff +ffff88810168b100: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff +================================================================== +Disabling lock debugging due to kernel taint +00000000: 58 44 44 33 5b 53 35 c2 00 00 00 00 00 00 00 78 +XDD3[S5........x +XFS (sdb): Internal error xfs_dir2_data_use_free at line 1200 of file +fs/xfs/libxfs/xfs_dir2_data.c. Caller +xfs_dir2_data_use_free+0x28a/0xeb0 +CPU: 5 PID: 1552 Comm: touch Tainted: G B 6.0.0-rc3+ +Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS +1.13.0-1ubuntu1.1 04/01/2014 +Call Trace: + +dump_stack_lvl+0x4d/0x66 +xfs_corruption_error+0x132/0x150 +xfs_dir2_data_use_free+0x198/0xeb0 +xfs_dir2_leaf_addname+0xa59/0x1ac0 +xfs_dir_createname+0x58c/0x7f0 +xfs_create+0x7af/0x1010 +xfs_generic_create+0x270/0x5e0 +path_openat+0x270b/0x3450 +do_filp_open+0x1cf/0x2b0 +do_sys_openat2+0x46b/0x7a0 +do_sys_open+0xb7/0x130 +do_syscall_64+0x35/0x80 +entry_SYSCALL_64_after_hwframe+0x63/0xcd +RIP: 0033:0x7fe4d9e9312b +Code: 25 00 00 41 00 3d 00 00 41 00 74 4b 64 8b 04 25 18 00 00 00 85 c0 +75 67 44 89 e2 48 89 ee bf 9c ff ff ff b8 01 01 00 00 0f 05 <48> 3d 00 +f0 ff ff 0f 87 91 00 00 00 48 8b 4c 24 28 64 48 33 0c 25 +RSP: 002b:00007ffda4c16c20 EFLAGS: 00000246 ORIG_RAX: 0000000000000101 +RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007fe4d9e9312b +RDX: 0000000000000941 RSI: 00007ffda4c17f46 RDI: 00000000ffffff9c +RBP: 00007ffda4c17f46 R08: 0000000000000000 R09: 0000000000000001 +R10: 00000000000001b6 R11: 0000000000000246 R12: 0000000000000941 +R13: 00007fe4d9f631a4 R14: 00007ffda4c17f46 R15: 0000000000000000 + +XFS (sdb): Corruption detected. 
Unmount and run xfs_repair + +[1] https://lore.kernel.org/all/20220928095355.2074025-1-guoxuenan@huawei.com/ +Reviewed-by: Hou Tao +Signed-off-by: Guo Xuenan +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Carlos Maiolino +--- + libxfs/xfs_dir2_leaf.c | 9 +++++++-- + 1 file changed, 7 insertions(+), 2 deletions(-) + +diff --git a/libxfs/xfs_dir2_leaf.c b/libxfs/xfs_dir2_leaf.c +index 8827c96..5da6600 100644 +--- a/libxfs/xfs_dir2_leaf.c ++++ b/libxfs/xfs_dir2_leaf.c +@@ -144,6 +144,8 @@ xfs_dir3_leaf_check_int( + xfs_dir2_leaf_tail_t *ltp; + int stale; + int i; ++ bool isleaf1 = (hdr->magic == XFS_DIR2_LEAF1_MAGIC || ++ hdr->magic == XFS_DIR3_LEAF1_MAGIC); + + ltp = xfs_dir2_leaf_tail_p(geo, leaf); + +@@ -156,8 +158,7 @@ xfs_dir3_leaf_check_int( + return __this_address; + + /* Leaves and bests don't overlap in leaf format. */ +- if ((hdr->magic == XFS_DIR2_LEAF1_MAGIC || +- hdr->magic == XFS_DIR3_LEAF1_MAGIC) && ++ if (isleaf1 && + (char *)&hdr->ents[hdr->count] > (char *)xfs_dir2_leaf_bests_p(ltp)) + return __this_address; + +@@ -173,6 +174,10 @@ xfs_dir3_leaf_check_int( + } + if (hdr->ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) + stale++; ++ if (isleaf1 && xfs_dir2_dataptr_to_db(geo, ++ be32_to_cpu(hdr->ents[i].address)) >= ++ be32_to_cpu(ltp->bestcount)) ++ return __this_address; + } + if (hdr->stale != stale) + return __this_address; +-- +1.8.3.1 + diff --git a/0040-xfs-increase-rename-inode-reservation.patch b/0040-xfs-increase-rename-inode-reservation.patch new file mode 100644 index 0000000000000000000000000000000000000000..2b8a1b7ba65562392a1e35690a09514aa95b7b9e --- /dev/null +++ b/0040-xfs-increase-rename-inode-reservation.patch @@ -0,0 +1,43 @@ +From 227bc97f12f2df902ab776fe038dc6d065f03c58 Mon Sep 17 00:00:00 2001 +From: Allison Henderson +Date: Fri, 18 Nov 2022 10:48:26 +0100 +Subject: [PATCH] xfs: increase rename inode reservation + +Source kernel commit: e07ee6fe21f47cfd72ae566395c67a80e7c66163 + +xfs_rename 
can update up to 5 inodes: src_dp, target_dp, src_ip, target_ip +and wip. So we need to increase the inode reservation to match. + +Signed-off-by: Allison Henderson +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Carlos Maiolino +--- + libxfs/xfs_trans_resv.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/libxfs/xfs_trans_resv.c b/libxfs/xfs_trans_resv.c +index 797176d..04c4448 100644 +--- a/libxfs/xfs_trans_resv.c ++++ b/libxfs/xfs_trans_resv.c +@@ -421,7 +421,7 @@ xfs_calc_itruncate_reservation_minlogsize( + + /* + * In renaming a files we can modify: +- * the four inodes involved: 4 * inode size ++ * the five inodes involved: 5 * inode size + * the two directory btrees: 2 * (max depth + v2) * dir block size + * the two directory bmap btrees: 2 * max depth * block size + * And the bmap_finish transaction can free dir and bmap blocks (two sets +@@ -436,7 +436,7 @@ xfs_calc_rename_reservation( + struct xfs_mount *mp) + { + return XFS_DQUOT_LOGRES(mp) + +- max((xfs_calc_inode_res(mp, 4) + ++ max((xfs_calc_inode_res(mp, 5) + + xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp), + XFS_FSB_TO_B(mp, 1))), + (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) + +-- +1.8.3.1 + diff --git a/0041-xfs-fix-sb-write-verify-for-lazysbcount.patch b/0041-xfs-fix-sb-write-verify-for-lazysbcount.patch new file mode 100644 index 0000000000000000000000000000000000000000..5eb1b3b13e84577df8efc246313c55ad2bcd6b62 --- /dev/null +++ b/0041-xfs-fix-sb-write-verify-for-lazysbcount.patch @@ -0,0 +1,118 @@ +From 4b593a7b25ce7cd155614006a943ddd53ca47669 Mon Sep 17 00:00:00 2001 +From: Long Li +Date: Fri, 18 Nov 2022 12:23:57 +0100 +Subject: [PATCH] xfs: fix sb write verify for lazysbcount + +Source kernel commit: 7cecd500d90164419add650e26cc1de03a7a66cb + +When lazysbcount is enabled, fsstress and loop mount/unmount test report +the following problems: + +XFS (loop0): SB summary counter sanity check failed +XFS (loop0): Metadata corruption detected 
at xfs_sb_write_verify+0x13b/0x460, +xfs_sb block 0x0 +XFS (loop0): Unmount and run xfs_repair +XFS (loop0): First 128 bytes of corrupted metadata buffer: +00000000: 58 46 53 42 00 00 10 00 00 00 00 00 00 28 00 00 XFSB.........(.. +00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ +00000020: 69 fb 7c cd 5f dc 44 af 85 74 e0 cc d4 e3 34 5a i.|._.D..t....4Z +00000030: 00 00 00 00 00 20 00 06 00 00 00 00 00 00 00 80 ..... .......... +00000040: 00 00 00 00 00 00 00 81 00 00 00 00 00 00 00 82 ................ +00000050: 00 00 00 01 00 0a 00 00 00 00 00 04 00 00 00 00 ................ +00000060: 00 00 0a 00 b4 b5 02 00 02 00 00 08 00 00 00 00 ................ +00000070: 00 00 00 00 00 00 00 00 0c 09 09 03 14 00 00 19 ................ +XFS (loop0): Corruption of in-memory data (0x8) detected at _xfs_buf_ioapply ++0xe1e/0x10e0 (fs/xfs/xfs_buf.c:1580). Shutting down filesystem. +XFS (loop0): Please unmount the filesystem and rectify the problem(s) +XFS (loop0): log mount/recovery failed: error -117 +XFS (loop0): log mount failed + +This corruption will shut down the file system and the file system will +no longer be mountable. The following script can reproduce the problem, +but it may take a long time. + +#!/bin/bash + +device=/dev/sda +testdir=/mnt/test +round=0 + +function fail() +{ +echo "$*" +exit 1 +} + +mkdir -p $testdir +while [ $round -lt 10000 ] +do +echo "******* round $round ********" +mkfs.xfs -f $device +mount $device $testdir || fail "mount failed!" +fsstress -d $testdir -l 0 -n 10000 -p 4 >/dev/null & +sleep 4 +killall -w fsstress +umount $testdir +xfs_repair -e $device > /dev/null +if [ $? -eq 2 ];then +echo "ERR CODE 2: Dirty log exception during repair." +exit 1 +fi +round=$(($round+1)) +done + +With lazysbcount enabled, there is no additional lock protection for +reading m_ifree and m_icount in xfs_log_sb(); if another CPU modifies +m_ifree, this can make m_ifree greater than m_icount.
For example, +consider the following sequence and ifreedelta is positive: + +CPU0 CPU1 +xfs_log_sb xfs_trans_unreserve_and_mod_sb +---------- ------------------------------ +percpu_counter_sum(&mp->m_icount) +percpu_counter_add_batch(&mp->m_icount, +idelta, XFS_ICOUNT_BATCH) +percpu_counter_add(&mp->m_ifree, ifreedelta); +percpu_counter_sum(&mp->m_ifree) + +After this, incorrect inode count (sb_ifree > sb_icount) will be written to +the log. In the subsequent writing of sb, incorrect inode count (sb_ifree > +sb_icount) will fail to pass the boundary check in xfs_validate_sb_write() +that causes the file system to shut down. + +When lazysbcount is enabled, we don't need to guarantee that Lazy sb +counters are completely correct, but we do need to guarantee that sb_ifree +<= sb_icount. On the other hand, the constraint that m_ifree <= m_icount +must be satisfied any time that there /cannot/ be other threads allocating +or freeing inode chunks. If the constraint is violated under these +circumstances, sb_i{count,free} (the ondisk superblock inode counters) +may be incorrect and need to be marked sick at unmount, the count will +be rebuilt on the next mount. + +Fixes: 8756a5af1819 ("libxfs: add more bounds checking to sb sanity checks") +Signed-off-by: Long Li +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J.
Wong +Signed-off-by: Carlos Maiolino +--- + libxfs/xfs_sb.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/libxfs/xfs_sb.c b/libxfs/xfs_sb.c +index cfa44eb..624bfbf 100644 +--- a/libxfs/xfs_sb.c ++++ b/libxfs/xfs_sb.c +@@ -804,7 +804,9 @@ xfs_log_sb( + */ + if (xfs_sb_version_haslazysbcount(&mp->m_sb)) { + mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount); +- mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree); ++ mp->m_sb.sb_ifree = min_t(uint64_t, ++ percpu_counter_sum(&mp->m_ifree), ++ mp->m_sb.sb_icount); + mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks); + } + +-- +1.8.3.1 + diff --git a/0042-xfs_repair-don-t-crash-on-unknown-inode-parents-in-d.patch b/0042-xfs_repair-don-t-crash-on-unknown-inode-parents-in-d.patch new file mode 100644 index 0000000000000000000000000000000000000000..67d7dab813151538ed8fc672d37a47e5549ecb97 --- /dev/null +++ b/0042-xfs_repair-don-t-crash-on-unknown-inode-parents-in-d.patch @@ -0,0 +1,58 @@ +From 978c3087b6afa56986ac3e5a52131d73d28253ca Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Wed, 23 Nov 2022 09:09:28 -0800 +Subject: [PATCH] xfs_repair: don't crash on unknown inode parents in dry run + mode + +Fuzz testing of directory block headers exposed a debug assertion vector +in xfs_repair. In normal (aka fixit) mode, if a single-block directory +has a totally trashed block, repair will zap the entire directory. +Phase 4 ignores any dirents pointing to the zapped directory, phase 6 +ignores the freed directory, and everything is good. + +However, in dry run mode, we don't actually free the inode. Phase 4 +still ignores any dirents pointing to the zapped directory, but phase 6 +thinks the inode is still live and tries to walk it. xfs_repair doesn't +know of any parents for the zapped directory and so trips the assertion. + +The assertion is critical for fixit mode because we need all the parent +information to ensure consistency of the directory tree. 
In dry run +mode we don't care, because we only have to print inconsistencies and +return 1. Worse yet, (our) customers file bugs when xfs_repair crashes +during a -n scan, so this will generate support calls. + +Make everyone's life easier by downgrading the assertion to a warning if +we're running in dry run mode. + +Found by fuzzing bhdr.hdr.bno = zeroes in xfs/471. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Carlos Maiolino +Signed-off-by: Carlos Maiolino +--- + repair/phase6.c | 9 ++++++++- + 1 file changed, 8 insertions(+), 1 deletion(-) + +diff --git a/repair/phase6.c b/repair/phase6.c +index 1f9f8de..0be2c9c 100644 +--- a/repair/phase6.c ++++ b/repair/phase6.c +@@ -1836,7 +1836,14 @@ longform_dir2_entry_check_data( + continue; + } + parent = get_inode_parent(irec, ino_offset); +- ASSERT(parent != 0); ++ if (parent == 0) { ++ if (no_modify) ++ do_warn( ++ _("unknown parent for inode %" PRIu64 "\n"), ++ inum); ++ else ++ ASSERT(parent != 0); ++ } + junkit = 0; + /* + * bump up the link counts in parent and child +-- +1.8.3.1 + diff --git a/0043-xfs_repair-retain-superblock-buffer-to-avoid-write-h.patch b/0043-xfs_repair-retain-superblock-buffer-to-avoid-write-h.patch new file mode 100644 index 0000000000000000000000000000000000000000..c84ff6e2753d042c1a6f692fad506db4f711356d --- /dev/null +++ b/0043-xfs_repair-retain-superblock-buffer-to-avoid-write-h.patch @@ -0,0 +1,275 @@ +From a5915eb4be5c2070adce092e6fff1fd9c906dc7e Mon Sep 17 00:00:00 2001 +From: "Darrick J. 
Wong" +Date: Wed, 23 Nov 2022 09:09:33 -0800 +Subject: [PATCH] xfs_repair: retain superblock buffer to avoid write hook + deadlock + +Every now and then I experience the following deadlock in xfs_repair +when I'm running the offline repair fuzz tests: + +#0 futex_wait (private=0, expected=2, futex_word=0x55555566df70) at ../sysdeps/nptl/futex-internal.h:146 +#1 __GI___lll_lock_wait (futex=futex@entry=0x55555566df70, private=0) at ./nptl/lowlevellock.c:49 +#2 lll_mutex_lock_optimized (mutex=0x55555566df70) at ./nptl/pthread_mutex_lock.c:48 +#3 ___pthread_mutex_lock (mutex=mutex@entry=0x55555566df70) at ./nptl/pthread_mutex_lock.c:93 +#4 cache_shake (cache=cache@entry=0x55555566de60, priority=priority@entry=2, purge=purge@entry=false) at cache.c:231 +#5 cache_node_get (cache=cache@entry=0x55555566de60, key=key@entry=0x7fffe55e01b0, nodep=nodep@entry=0x7fffe55e0168) at cache.c:452 +#6 __cache_lookup (key=key@entry=0x7fffe55e01b0, flags=0, bpp=bpp@entry=0x7fffe55e0228) at rdwr.c:405 +#7 libxfs_getbuf_flags (btp=0x55555566de00, blkno=0, len=, flags=, bpp=0x7fffe55e0228) at rdwr.c:457 +#8 libxfs_buf_read_map (btp=0x55555566de00, map=map@entry=0x7fffe55e0280, nmaps=nmaps@entry=1, flags=flags@entry=0, bpp=bpp@entry=0x7fffe55e0278, ops=0x5555556233e0 ) + at rdwr.c:704 +#9 libxfs_buf_read (ops=, bpp=0x7fffe55e0278, flags=0, numblks=, blkno=0, target=) + at /storage/home/djwong/cdev/work/xfsprogs/build-x86_64/libxfs/libxfs_io.h:195 +#10 libxfs_getsb (mp=mp@entry=0x7fffffffd690) at rdwr.c:162 +#11 force_needsrepair (mp=0x7fffffffd690) at xfs_repair.c:924 +#12 repair_capture_writeback (bp=) at xfs_repair.c:1000 +#13 libxfs_bwrite (bp=0x7fffe011e530) at rdwr.c:869 +#14 cache_shake (cache=cache@entry=0x55555566de60, priority=priority@entry=2, purge=purge@entry=false) at cache.c:240 +#15 cache_node_get (cache=cache@entry=0x55555566de60, key=key@entry=0x7fffe55e0470, nodep=nodep@entry=0x7fffe55e0428) at cache.c:452 +#16 __cache_lookup (key=key@entry=0x7fffe55e0470, flags=1, 
bpp=bpp@entry=0x7fffe55e0538) at rdwr.c:405 +#17 libxfs_getbuf_flags (btp=0x55555566de00, blkno=12736, len=, flags=, bpp=0x7fffe55e0538) at rdwr.c:457 +#18 __libxfs_buf_get_map (btp=, map=map@entry=0x7fffe55e05b0, nmaps=, flags=flags@entry=1, bpp=bpp@entry=0x7fffe55e0538) at rdwr.c:501 +#19 libxfs_buf_get_map (btp=, map=map@entry=0x7fffe55e05b0, nmaps=, flags=flags@entry=1, bpp=bpp@entry=0x7fffe55e0538) at rdwr.c:525 +#20 pf_queue_io (args=args@entry=0x5555556722c0, map=map@entry=0x7fffe55e05b0, nmaps=, flag=flag@entry=11) at prefetch.c:124 +#21 pf_read_bmbt_reclist (args=0x5555556722c0, rp=, numrecs=78) at prefetch.c:220 +#22 pf_scan_lbtree (dbno=dbno@entry=1211, level=level@entry=1, isadir=isadir@entry=1, args=args@entry=0x5555556722c0, func=0x55555557f240 ) at prefetch.c:298 +#23 pf_read_btinode (isadir=1, dino=, args=0x5555556722c0) at prefetch.c:385 +#24 pf_read_inode_dirs (args=args@entry=0x5555556722c0, bp=bp@entry=0x7fffdc023790) at prefetch.c:459 +#25 pf_read_inode_dirs (bp=, args=0x5555556722c0) at prefetch.c:411 +#26 pf_batch_read (args=args@entry=0x5555556722c0, which=which@entry=PF_PRIMARY, buf=buf@entry=0x7fffd001d000) at prefetch.c:609 +#27 pf_io_worker (param=0x5555556722c0) at prefetch.c:673 +#28 start_thread (arg=) at ./nptl/pthread_create.c:442 +#29 clone3 () at ../sysdeps/unix/sysv/linux/x86_64/clone3.S:81 + +>From this stack trace, we see that xfs_repair's prefetch module is +getting some xfs_buf objects ahead of initiating a read (#19). The +buffer cache has hit its limit, so it calls cache_shake (#14) to free +some unused xfs_bufs. The buffer it finds is a dirty buffer, so it +calls libxfs_bwrite to flush it out to disk, which in turn invokes the +buffer write hook that xfs_repair set up in 3b7667cb to mark the ondisk +filesystem's superblock as NEEDSREPAIR until repair actually completes. 
+ +Unfortunately, the NEEDSREPAIR handler itself needs to grab the +superblock buffer, so it makes another call into the buffer cache (#9), +which sees that the cache is full and tries to shake it (#4). Hence we +deadlock on cm_mutex because shaking is not reentrant. + +Fix this by retaining a reference to the superblock buffer when possible +so that the writeback hook doesn't have to access the buffer cache to +set NEEDSREPAIR. + +Fixes: 3b7667cb ("xfs_repair: set NEEDSREPAIR the first time we write to a filesystem") +Signed-off-by: Darrick J. Wong +Reviewed-by: Carlos Maiolino +Signed-off-by: Carlos Maiolino +--- + libxfs/libxfs_api_defs.h | 2 ++ + libxfs/libxfs_io.h | 3 ++ + libxfs/rdwr.c | 16 +++++++++++ + repair/phase2.c | 8 ++++++ + repair/protos.h | 1 + + repair/xfs_repair.c | 75 ++++++++++++++++++++++++++++++++++++++++++------ + 6 files changed, 96 insertions(+), 9 deletions(-) + +diff --git a/libxfs/libxfs_api_defs.h b/libxfs/libxfs_api_defs.h +index 9aea8c7..4fe4e75 100644 +--- a/libxfs/libxfs_api_defs.h ++++ b/libxfs/libxfs_api_defs.h +@@ -49,9 +49,11 @@ + #define xfs_buf_delwri_submit libxfs_buf_delwri_submit + #define xfs_buf_get libxfs_buf_get + #define xfs_buf_get_uncached libxfs_buf_get_uncached ++#define xfs_buf_lock libxfs_buf_lock + #define xfs_buf_read libxfs_buf_read + #define xfs_buf_read_uncached libxfs_buf_read_uncached + #define xfs_buf_relse libxfs_buf_relse ++#define xfs_buf_unlock libxfs_buf_unlock + #define xfs_bunmapi libxfs_bunmapi + #define xfs_bwrite libxfs_bwrite + #define xfs_calc_dquots_per_chunk libxfs_calc_dquots_per_chunk +diff --git a/libxfs/libxfs_io.h b/libxfs/libxfs_io.h +index 3cc4f4e..0e444e2 100644 +--- a/libxfs/libxfs_io.h ++++ b/libxfs/libxfs_io.h +@@ -217,6 +217,9 @@ xfs_buf_hold(struct xfs_buf *bp) + bp->b_node.cn_count++; + } + ++void xfs_buf_lock(struct xfs_buf *bp); ++void xfs_buf_unlock(struct xfs_buf *bp); ++ + int libxfs_buf_get_uncached(struct xfs_buftarg *targ, size_t bblen, int flags, + struct xfs_buf
**bpp); + int libxfs_buf_read_uncached(struct xfs_buftarg *targ, xfs_daddr_t daddr, +diff --git a/libxfs/rdwr.c b/libxfs/rdwr.c +index 128367e..2c1162b 100644 +--- a/libxfs/rdwr.c ++++ b/libxfs/rdwr.c +@@ -376,6 +376,22 @@ libxfs_getbufr_map(struct xfs_buftarg *btp, xfs_daddr_t blkno, int bblen, + return bp; + } + ++void ++xfs_buf_lock( ++ struct xfs_buf *bp) ++{ ++ if (use_xfs_buf_lock) ++ pthread_mutex_lock(&bp->b_lock); ++} ++ ++void ++xfs_buf_unlock( ++ struct xfs_buf *bp) ++{ ++ if (use_xfs_buf_lock) ++ pthread_mutex_unlock(&bp->b_lock); ++} ++ + static int + __cache_lookup( + struct xfs_bufkey *key, +diff --git a/repair/phase2.c b/repair/phase2.c +index ab53ee0..7441451 100644 +--- a/repair/phase2.c ++++ b/repair/phase2.c +@@ -250,6 +250,14 @@ phase2( + } else + do_log(_("Phase 2 - using internal log\n")); + ++ /* ++ * Now that we've set up the buffer cache the way we want it, try to ++ * grab our own reference to the primary sb so that the hooks will not ++ * have to call out to the buffer cache. ++ */ ++ if (mp->m_buf_writeback_fn) ++ retain_primary_sb(mp); ++ + /* Zero log if applicable */ + do_log(_(" - zero log...\n")); + +diff --git a/repair/protos.h b/repair/protos.h +index 83734e8..7cdc3a1 100644 +--- a/repair/protos.h ++++ b/repair/protos.h +@@ -16,6 +16,7 @@ int get_sb(xfs_sb_t *sbp, + xfs_off_t off, + int size, + xfs_agnumber_t agno); ++int retain_primary_sb(struct xfs_mount *mp); + void write_primary_sb(xfs_sb_t *sbp, + int size); + +diff --git a/repair/xfs_repair.c b/repair/xfs_repair.c +index e44aa40..d043643 100644 +--- a/repair/xfs_repair.c ++++ b/repair/xfs_repair.c +@@ -738,6 +738,63 @@ check_fs_vs_host_sectsize( + } + } + ++/* ++ * If we set up a writeback function to set NEEDSREPAIR while the filesystem is ++ * dirty, there's a chance that calling libxfs_getsb could deadlock the buffer ++ * cache while trying to get the primary sb buffer if the first non-sb write to ++ * the filesystem is the result of a cache shake. 
Retain a reference to the ++ * primary sb buffer to avoid all that. ++ */ ++static struct xfs_buf *primary_sb_bp; /* buffer for superblock */ ++ ++int ++retain_primary_sb( ++ struct xfs_mount *mp) ++{ ++ int error; ++ ++ error = -libxfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR, ++ XFS_FSS_TO_BB(mp, 1), 0, &primary_sb_bp, ++ &xfs_sb_buf_ops); ++ if (error) ++ return error; ++ ++ libxfs_buf_unlock(primary_sb_bp); ++ return 0; ++} ++ ++static void ++drop_primary_sb(void) ++{ ++ if (!primary_sb_bp) ++ return; ++ ++ libxfs_buf_lock(primary_sb_bp); ++ libxfs_buf_relse(primary_sb_bp); ++ primary_sb_bp = NULL; ++} ++ ++static int ++get_primary_sb( ++ struct xfs_mount *mp, ++ struct xfs_buf **bpp) ++{ ++ int error; ++ ++ *bpp = NULL; ++ ++ if (!primary_sb_bp) { ++ error = retain_primary_sb(mp); ++ if (error) ++ return error; ++ } ++ ++ libxfs_buf_lock(primary_sb_bp); ++ xfs_buf_hold(primary_sb_bp); ++ *bpp = primary_sb_bp; ++ return 0; ++} ++ + /* Clear needsrepair after a successful repair run. */ + void + clear_needsrepair( +@@ -758,15 +815,14 @@ clear_needsrepair( + do_warn( + _("Cannot clear needsrepair due to flush failure, err=%d.\n"), + error); +- return; ++ goto drop; + } + + /* Clear needsrepair from the superblock. */ +- bp = libxfs_getsb(mp); +- if (!bp || bp->b_error) { ++ error = get_primary_sb(mp, &bp); ++ if (error) { + do_warn( +- _("Cannot clear needsrepair from primary super, err=%d.\n"), +- bp ? 
bp->b_error : ENOMEM); ++ _("Cannot clear needsrepair from primary super, err=%d.\n"), error); + } else { + mp->m_sb.sb_features_incompat &= + ~XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR; +@@ -775,6 +831,8 @@ clear_needsrepair( + } + if (bp) + libxfs_buf_relse(bp); ++drop: ++ drop_primary_sb(); + } + + static void +@@ -797,11 +855,10 @@ force_needsrepair( + xfs_sb_version_needsrepair(&mp->m_sb)) + return; + +- bp = libxfs_getsb(mp); +- if (!bp || bp->b_error) { ++ error = get_primary_sb(mp, &bp); ++ if (error) { + do_log( +- _("couldn't get superblock to set needsrepair, err=%d\n"), +- bp ? bp->b_error : ENOMEM); ++ _("couldn't get superblock to set needsrepair, err=%d\n"), error); + } else { + /* + * It's possible that we need to set NEEDSREPAIR before we've +-- +1.8.3.1 + diff --git a/0044-fsck.xfs-mount-umount-xfs-fs-to-replay-log-before-ru.patch b/0044-fsck.xfs-mount-umount-xfs-fs-to-replay-log-before-ru.patch new file mode 100644 index 0000000000000000000000000000000000000000..0b01d31650c2b3b8dbd4d12d4e315d891ff94549 --- /dev/null +++ b/0044-fsck.xfs-mount-umount-xfs-fs-to-replay-log-before-ru.patch @@ -0,0 +1,90 @@ +From 79ba1e15d80eba3aff4396f44629eb8960722d36 Mon Sep 17 00:00:00 2001 +From: Srikanth C S +Date: Tue, 13 Dec 2022 22:45:43 +0530 +Subject: [PATCH] fsck.xfs: mount/umount xfs fs to replay log before running + xfs_repair + +After a recent data center crash, we had to recover root filesystems +on several thousands of VMs via a boot time fsck. Since these +machines are remotely manageable, support can inject the kernel +command line with 'fsck.mode=force fsck.repair=yes' to kick off +xfs_repair if the machine won't come up or if they suspect there +might be deeper issues with latent errors in the fs metadata, which +is what they did to try to get everyone running ASAP while +anticipating any future problems. But, fsck.xfs does not address the +journal replay in case of a crash. + +fsck.xfs does xfs_repair -e if fsck.mode=force is set. 
It is +possible that when the machine crashes, the fs is in an inconsistent +state with the journal log not yet replayed. This can drop the machine +into the rescue shell because xfs_fsck.sh does not know how to clean the +log. Since the administrator told us to force repairs, address the +deficiency by cleaning the log and rerunning xfs_repair. + +Run xfs_repair -e when fsck.mode=force and fsck.repair=auto or yes. +Replay the logs only if fsck.mode=force and fsck.repair=yes. For the +other options (-fa and -f), drop to the rescue shell if repair detects +any corruption. + +Signed-off-by: Srikanth C S +Reviewed-by: Carlos Maiolino +Signed-off-by: Carlos Maiolino +--- + fsck/xfs_fsck.sh | 31 +++++++++++++++++++++++++++++-- + 1 file changed, 29 insertions(+), 2 deletions(-) + +diff --git a/fsck/xfs_fsck.sh b/fsck/xfs_fsck.sh +index 6af0f22..62a1e0b 100755 +--- a/fsck/xfs_fsck.sh ++++ b/fsck/xfs_fsck.sh +@@ -31,10 +31,12 @@ repair2fsck_code() { + + AUTO=false + FORCE=false ++REPAIR=false + while getopts ":aApyf" c + do + case $c in +- a|A|p|y) AUTO=true;; ++ a|A|p) AUTO=true;; ++ y) REPAIR=true;; + f) FORCE=true;; + esac + done +@@ -64,7 +66,32 @@ fi + + if $FORCE; then + xfs_repair -e $DEV +- repair2fsck_code $? ++ error=$? ++ if [ $error -eq 2 ] && [ $REPAIR = true ]; then ++ echo "Replaying log for $DEV" ++ mkdir -p /tmp/repair_mnt || exit 1 ++ for x in $(cat /proc/cmdline); do ++ case $x in ++ root=*) ++ ROOT="${x#root=}" ++ ;; ++ rootflags=*) ++ ROOTFLAGS="-o ${x#rootflags=}" ++ ;; ++ esac ++ done ++ test -b "$ROOT" || ROOT=$(blkid -t "$ROOT" -o device) ++ if [ $(basename $DEV) = $(basename $ROOT) ]; then ++ mount $DEV /tmp/repair_mnt $ROOTFLAGS || exit 1 ++ else ++ mount $DEV /tmp/repair_mnt || exit 1 ++ fi ++ umount /tmp/repair_mnt ++ xfs_repair -e $DEV ++ error=$? ++ rm -d /tmp/repair_mnt ++ fi ++ repair2fsck_code $error + exit $?
+ fi + +-- +1.8.3.1 + diff --git a/0045-xfs_db-fix-dir3-block-magic-check.patch b/0045-xfs_db-fix-dir3-block-magic-check.patch new file mode 100644 index 0000000000000000000000000000000000000000..987f69f2b701d6392c943045f0fba8dea4b62dc0 --- /dev/null +++ b/0045-xfs_db-fix-dir3-block-magic-check.patch @@ -0,0 +1,33 @@ +From 7374f58bfeb38467bab6552a47a5cd6bbe3c2e2e Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Tue, 20 Dec 2022 16:53:34 -0800 +Subject: [PATCH] xfs_db: fix dir3 block magic check + +Fix this broken check, which (amazingly) went unnoticed until I cranked +up the warning level /and/ built the system for s390x. + +Fixes: e96864ff4d4 ("xfs_db: enable blockget for v5 filesystems") +Signed-off-by: Darrick J. Wong +Reviewed-by: Dave Chinner +Reviewed-by: Carlos Maiolino +Signed-off-by: Carlos Maiolino +--- + db/check.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/db/check.c b/db/check.c +index bb27ce5..964756d 100644 +--- a/db/check.c ++++ b/db/check.c +@@ -2578,7 +2578,7 @@ process_data_dir_v2( + error++; + } + if ((be32_to_cpu(data->magic) == XFS_DIR2_BLOCK_MAGIC || +- be32_to_cpu(data->magic) == XFS_DIR2_BLOCK_MAGIC) && ++ be32_to_cpu(data->magic) == XFS_DIR3_BLOCK_MAGIC) && + stale != be32_to_cpu(btp->stale)) { + if (!sflag || v) + dbprintf(_("dir %lld block %d bad stale tail count %d\n"), +-- +1.8.3.1 + diff --git a/0046-xfs_repair-fix-incorrect-dabtree-hashval-comparison.patch b/0046-xfs_repair-fix-incorrect-dabtree-hashval-comparison.patch new file mode 100644 index 0000000000000000000000000000000000000000..97ad2c927b35fa1417b8bb29656014a4ebea1951 --- /dev/null +++ b/0046-xfs_repair-fix-incorrect-dabtree-hashval-comparison.patch @@ -0,0 +1,36 @@ +From b7b81f336ac02f4e4f24e0844a7fb3023c489667 Mon Sep 17 00:00:00 2001 +From: "Darrick J. 
Wong" +Date: Tue, 14 Mar 2023 18:01:55 -0700 +Subject: [PATCH] xfs_repair: fix incorrect dabtree hashval comparison + +If an xattr structure contains enough names with the same hash value to +fill multiple xattr leaf blocks with names all hashing to the same +value, then the dabtree nodes will contain consecutive entries with the +same hash value. + +This causes false corruption reports in xfs_repair because it's not +expecting such a huge same-hashing structure. Fix that. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Carlos Maiolino +Signed-off-by: Carlos Maiolino +--- + repair/da_util.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/repair/da_util.c b/repair/da_util.c +index 7239c2e..b229422 100644 +--- a/repair/da_util.c ++++ b/repair/da_util.c +@@ -330,7 +330,7 @@ _("%s block used/count inconsistency - %d/%hu\n"), + /* + * hash values monotonically increasing ??? + */ +- if (cursor->level[this_level].hashval >= ++ if (cursor->level[this_level].hashval > + be32_to_cpu(nodehdr.btree[entry].hashval)) { + do_warn( + _("%s block hashvalue inconsistency, expected > %u / saw %u\n"), +-- +1.8.3.1 + diff --git a/xfsprogs.spec b/xfsprogs.spec index 5faa62305a7d6300b4c573ea237f609267c56f9e..aeba965725587a608faab2af8da6e86a3e1ce130 100644 --- a/xfsprogs.spec +++ b/xfsprogs.spec @@ -1,6 +1,6 @@ Name: xfsprogs Version: 5.14.1 -Release: 14 +Release: 15 Summary: Administration and debugging tools for the XFS file system License: GPL+ and LGPLv2+ URL: https://xfs.wiki.kernel.org @@ -29,7 +29,43 @@ Patch7: 0007-libxcmd-add-return-value-check-for-dynamic-memory-fu.patch Patch8: 0008-xfs_repair-fix-the-problem-of-repair-failure-caused-.patch Patch9: 0009-mkfs.xfs-fix-segmentation-fault-caused-by-accessing-.patch Patch10: 0010-xfs_repair-fix-warn-in-xfs_buf_find-when-growfs-fails.patch -Patch11: 0011-xfs_copy-don-t-use-cached-buffer-reads-until-after-l.patch +Patch11: 0011-xfs_copy-don-t-use-cached-buffer-reads-until-after-l.patch +Patch12: 
0012-xfs-sb-verifier-doesn-t-handle-uncached-sb-buffer.patch +Patch13: 0013-libxfs-always-initialize-internal-buffer-map.patch +Patch14: 0014-libxfs-shut-down-filesystem-if-we-xfs_trans_cancel-w.patch +Patch15: 0015-xfs_db-fix-nbits-parameter-in-fa_ino-48-functions.patch +Patch16: 0016-xfs_repair-update-secondary-superblocks-after-changi.patch +Patch17: 0017-xfs_repair-fix-AG-header-btree-level-comparisons.patch +Patch18: 0018-xfs-fix-maxlevels-comparisons-in-the-btree-staging-c.patch +Patch19: 0019-xfs-fold-perag-loop-iteration-logic-into-helper-func.patch +Patch20: 0020-xfs-rename-the-next_agno-perag-iteration-variable.patch +Patch21: 0021-xfs-terminate-perag-iteration-reliably-on-agcount.patch +Patch22: 0022-xfs-fix-perag-reference-leak-on-iteration-race-with-.patch +Patch23: 0023-mkfs-fix-missing-validation-of-l-size-against-maximu.patch +Patch24: 0024-mkfs-reduce-internal-log-size-when-log-stripe-units-.patch +Patch25: 0025-mkfs-don-t-let-internal-logs-bump-the-root-dir-inode.patch +Patch26: 0026-mkfs-improve-log-extent-validation.patch +Patch27: 0027-xfs_repair-detect-v5-featureset-mismatches-in-second.patch +Patch28: 0028-xfs_repair-check-the-ftype-of-dot-and-dotdot-directo.patch +Patch29: 0029-mkfs-Fix-memory-leak.patch +Patch30: 0030-xfs-zero-inode-fork-buffer-at-allocation.patch +Patch31: 0031-xfs-detect-self-referencing-btree-sibling-pointers.patch +Patch32: 0032-xfs-validate-inode-fork-size-against-fork-format.patch +Patch33: 0033-xfs_repair-always-rewrite-secondary-supers-when-need.patch +Patch34: 0034-xfs_repair-ignore-empty-xattr-leaf-blocks.patch +Patch35: 0035-xfs_repair-Search-for-conflicts-in-inode_tree_ptrs-w.patch +Patch36: 0036-mkfs-terminate-getsubopt-arrays-properly.patch +Patch37: 0037-mkfs-complain-about-impossible-log-size-constraints.patch +Patch38: 0038-xfs-trim-the-mapp-array-accordingly-in-xfs_da_grow_i.patch +Patch39: 0039-xfs-fix-exception-caused-by-unexpected-illegal-bestc.patch +Patch40: 
0040-xfs-increase-rename-inode-reservation.patch +Patch41: 0041-xfs-fix-sb-write-verify-for-lazysbcount.patch +Patch42: 0042-xfs_repair-don-t-crash-on-unknown-inode-parents-in-d.patch +Patch43: 0043-xfs_repair-retain-superblock-buffer-to-avoid-write-h.patch +Patch44: 0044-fsck.xfs-mount-umount-xfs-fs-to-replay-log-before-ru.patch +Patch45: 0045-xfs_db-fix-dir3-block-magic-check.patch +Patch46: 0046-xfs_repair-fix-incorrect-dabtree-hashval-comparison.patch + %description xfsprogs are the userspace utilities that manage XFS filesystems. @@ -113,6 +149,9 @@ rm -rf %{buildroot}%{_datadir}/doc/xfsprogs/ %changelog +* Wed Dec 27 2023 wuguanghao - 5.14.1-15 +- backport patches from community + * Fri Nov 3 2023 wuguanghao - 5.14.1-14 - xfs_copy: don't use cached buffer reads until after libxfs_mount