WARNING: THIS SITE IS A MIRROR OF GITHUB.COM / IT CANNOT LOGIN OR REGISTER ACCOUNTS / THE CONTENTS ARE PROVIDED AS-IS / THIS SITE ASSUMES NO RESPONSIBILITY FOR ANY DISPLAYED CONTENT OR LINKS / IF YOU FOUND SOMETHING MAY NOT GOOD FOR EVERYONE, CONTACT ADMIN AT ilovescratch@foxmail.com
Skip to content

Commit c0c4866

Browse files
amotintonyhutter
authored andcommitted
dmu: Allow buffer fills to fail
When ZFS overwrites a whole block, it does not bother to read the old content from disk. It is a good optimization, but if the buffer fill fails due to page fault or something else, the buffer ends up corrupted, neither keeping old content, nor getting the new one. On FreeBSD this is additionally complicated by page faults being blocked by VFS layer, always returning EFAULT on attempt to write from mmap()'ed but not yet cached address range. Normally it is not a big problem, since after original failure VFS will retry the write after reading the required data. The problem becomes worse in specific case when somebody tries to write into a file its own mmap()'ed content from the same location. In that situation the only copy of the data is getting corrupted on the page fault and the following retries only fixate the status quo. Block cloning makes this issue easier to reproduce, since it does not read the old data, unlike traditional file copy, that may work by chance. This patch provides the fill status to dmu_buf_fill_done(), that in case of error can destroy the corrupted buffer as if no write happened. One more complication in case of block cloning is that if error is possible during fill, dmu_buf_will_fill() must read the data via fall-back to dmu_buf_will_dirty(). It is required to allow in case of error restoring the buffer to a state after the cloning, not not before it, that would happen if we just call dbuf_undirty(). Reviewed-by: Brian Behlendorf <[email protected]> Reviewed-by: Rob Norris <[email protected]> Signed-off-by: Alexander Motin <[email protected]> Sponsored by: iXsystems, Inc. Closes #15665
1 parent b62fd2c commit c0c4866

File tree

8 files changed

+41
-29
lines changed

8 files changed

+41
-29
lines changed

include/os/freebsd/spl/sys/uio.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ zfs_uio_setoffset(zfs_uio_t *uio, offset_t off)
6262
}
6363

6464
static inline void
65-
zfs_uio_advance(zfs_uio_t *uio, size_t size)
65+
zfs_uio_advance(zfs_uio_t *uio, ssize_t size)
6666
{
6767
zfs_uio_resid(uio) -= size;
6868
zfs_uio_offset(uio) += size;

include/os/linux/spl/sys/uio.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ zfs_uio_setoffset(zfs_uio_t *uio, offset_t off)
9595
}
9696

9797
static inline void
98-
zfs_uio_advance(zfs_uio_t *uio, size_t size)
98+
zfs_uio_advance(zfs_uio_t *uio, ssize_t size)
9999
{
100100
uio->uio_resid -= size;
101101
uio->uio_loffset += size;

include/sys/dbuf.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -379,8 +379,8 @@ dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level,
379379
int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags);
380380
void dmu_buf_will_clone(dmu_buf_t *db, dmu_tx_t *tx);
381381
void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx);
382-
void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
383-
void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
382+
void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx, boolean_t canfail);
383+
boolean_t dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx, boolean_t failed);
384384
void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx);
385385
dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
386386
dbuf_dirty_record_t *dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid,

lib/libspl/include/sys/uio.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ zfs_uio_iov_at_index(zfs_uio_t *uio, uint_t idx, void **base, uint64_t *len)
9090
}
9191

9292
static inline void
93-
zfs_uio_advance(zfs_uio_t *uio, size_t size)
93+
zfs_uio_advance(zfs_uio_t *uio, ssize_t size)
9494
{
9595
uio->uio_resid -= size;
9696
uio->uio_loffset += size;

module/os/freebsd/zfs/dmu_os.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
110110
ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
111111

112112
if (tocpy == db->db_size)
113-
dmu_buf_will_fill(db, tx);
113+
dmu_buf_will_fill(db, tx, B_FALSE);
114114
else
115115
dmu_buf_will_dirty(db, tx);
116116

@@ -126,7 +126,7 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
126126
}
127127

128128
if (tocpy == db->db_size)
129-
dmu_buf_fill_done(db, tx);
129+
dmu_buf_fill_done(db, tx, B_FALSE);
130130

131131
offset += tocpy;
132132
size -= tocpy;

module/zfs/dbuf.c

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2734,7 +2734,7 @@ dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
27342734
}
27352735

27362736
void
2737-
dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
2737+
dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail)
27382738
{
27392739
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
27402740

@@ -2752,8 +2752,14 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
27522752
* Block cloning: We will be completely overwriting a block
27532753
* cloned in this transaction group, so let's undirty the
27542754
* pending clone and mark the block as uncached. This will be
2755-
* as if the clone was never done.
2755+
* as if the clone was never done. But if the fill can fail
2756+
* we should have a way to return back to the cloned data.
27562757
*/
2758+
if (canfail && dbuf_find_dirty_eq(db, tx->tx_txg) != NULL) {
2759+
mutex_exit(&db->db_mtx);
2760+
dmu_buf_will_dirty(db_fake, tx);
2761+
return;
2762+
}
27572763
VERIFY(!dbuf_undirty(db, tx));
27582764
db->db_state = DB_UNCACHED;
27592765
}
@@ -2814,32 +2820,41 @@ dbuf_override_impl(dmu_buf_impl_t *db, const blkptr_t *bp, dmu_tx_t *tx)
28142820
dl->dr_overridden_by.blk_birth = dr->dr_txg;
28152821
}
28162822

2817-
void
2818-
dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx)
2823+
boolean_t
2824+
dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx, boolean_t failed)
28192825
{
28202826
(void) tx;
28212827
dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
2822-
dbuf_states_t old_state;
28232828
mutex_enter(&db->db_mtx);
28242829
DBUF_VERIFY(db);
28252830

2826-
old_state = db->db_state;
2827-
db->db_state = DB_CACHED;
2828-
if (old_state == DB_FILL) {
2831+
if (db->db_state == DB_FILL) {
28292832
if (db->db_level == 0 && db->db_freed_in_flight) {
28302833
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
28312834
/* we were freed while filling */
28322835
/* XXX dbuf_undirty? */
28332836
memset(db->db.db_data, 0, db->db.db_size);
28342837
db->db_freed_in_flight = FALSE;
2838+
db->db_state = DB_CACHED;
28352839
DTRACE_SET_STATE(db,
28362840
"fill done handling freed in flight");
2841+
failed = B_FALSE;
2842+
} else if (failed) {
2843+
VERIFY(!dbuf_undirty(db, tx));
2844+
db->db_buf = NULL;
2845+
dbuf_clear_data(db);
2846+
DTRACE_SET_STATE(db, "fill failed");
28372847
} else {
2848+
db->db_state = DB_CACHED;
28382849
DTRACE_SET_STATE(db, "fill done");
28392850
}
28402851
cv_broadcast(&db->db_changed);
2852+
} else {
2853+
db->db_state = DB_CACHED;
2854+
failed = B_FALSE;
28412855
}
28422856
mutex_exit(&db->db_mtx);
2857+
return (failed);
28432858
}
28442859

28452860
void
@@ -2984,7 +2999,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
29842999
DTRACE_SET_STATE(db, "filling assigned arcbuf");
29853000
mutex_exit(&db->db_mtx);
29863001
(void) dbuf_dirty(db, tx);
2987-
dmu_buf_fill_done(&db->db, tx);
3002+
dmu_buf_fill_done(&db->db, tx, B_FALSE);
29883003
}
29893004

29903005
void

module/zfs/dmu.c

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1115,14 +1115,14 @@ dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size,
11151115
ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
11161116

11171117
if (tocpy == db->db_size)
1118-
dmu_buf_will_fill(db, tx);
1118+
dmu_buf_will_fill(db, tx, B_FALSE);
11191119
else
11201120
dmu_buf_will_dirty(db, tx);
11211121

11221122
(void) memcpy((char *)db->db_data + bufoff, buf, tocpy);
11231123

11241124
if (tocpy == db->db_size)
1125-
dmu_buf_fill_done(db, tx);
1125+
dmu_buf_fill_done(db, tx, B_FALSE);
11261126

11271127
offset += tocpy;
11281128
size -= tocpy;
@@ -1330,27 +1330,24 @@ dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx)
13301330

13311331
ASSERT(size > 0);
13321332

1333-
bufoff = zfs_uio_offset(uio) - db->db_offset;
1333+
offset_t off = zfs_uio_offset(uio);
1334+
bufoff = off - db->db_offset;
13341335
tocpy = MIN(db->db_size - bufoff, size);
13351336

13361337
ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
13371338

13381339
if (tocpy == db->db_size)
1339-
dmu_buf_will_fill(db, tx);
1340+
dmu_buf_will_fill(db, tx, B_TRUE);
13401341
else
13411342
dmu_buf_will_dirty(db, tx);
13421343

1343-
/*
1344-
* XXX zfs_uiomove could block forever (eg.nfs-backed
1345-
* pages). There needs to be a uiolockdown() function
1346-
* to lock the pages in memory, so that zfs_uiomove won't
1347-
* block.
1348-
*/
13491344
err = zfs_uio_fault_move((char *)db->db_data + bufoff,
13501345
tocpy, UIO_WRITE, uio);
13511346

1352-
if (tocpy == db->db_size)
1353-
dmu_buf_fill_done(db, tx);
1347+
if (tocpy == db->db_size && dmu_buf_fill_done(db, tx, err)) {
1348+
/* The fill was reverted. Undo any uio progress. */
1349+
zfs_uio_advance(uio, off - zfs_uio_offset(uio));
1350+
}
13541351

13551352
if (err)
13561353
break;

module/zfs/dmu_recv.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2532,7 +2532,7 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
25322532
* size of the provided arc_buf_t.
25332533
*/
25342534
if (db_spill->db_size != drrs->drr_length) {
2535-
dmu_buf_will_fill(db_spill, tx);
2535+
dmu_buf_will_fill(db_spill, tx, B_FALSE);
25362536
VERIFY0(dbuf_spill_set_blksz(db_spill,
25372537
drrs->drr_length, tx));
25382538
}

0 commit comments

Comments
 (0)