Author: Noah Misch Commit: Noah Misch On sparc Linux, write each WAL byte just once. On sparc64, an ext4 filesystem bug can make readers see zeros if another process simultaneously wrote the same offsets. Per buildfarm member kittiwake, this could make a streaming standby fail to advance, reporting corrupt WAL. It could make COMMIT PREPARED fail with "could not read two-phase state from WAL". Non-WAL PostgreSQL reads haven't been affected, likely because those readers and writers have buffering systems in common. Fix this by writing each WAL byte just once, instead of rewriting every byte of a page whenever writing some new byte of that page. This problem is not new, but buildfarm failures became frequent when commits 3cd9c3b921977272e6650a5efbeade4203c4bca2 and f47ed79cc8a0cfa154dc7f01faaf59822552363f added tests with concurrency. Back-patch to v10 (all supported versions). Reviewed by FIXME. Discussion: https://postgr.es/m/20220116210241.GC756210@rfd.leadboat.com diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index e073121..68fe510 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -130,6 +130,19 @@ int wal_segment_size = DEFAULT_XLOG_SEG_SIZE; #define NUM_XLOGINSERT_LOCKS 8 /* + * Should XLogWrite() skip the previously-written prefix of the block when + * LogwrtResult falls in the middle of a block, or should it write whole + * blocks always? The latter is traditional PostgreSQL behavior, but the + * former avoids a sparc64+ext4 bug extant as of Linux 5.15.5: + * https://postgr.es/m/20220116210241.GC756210@rfd.leadboat.com + */ +#if defined(__sparc__) && defined(__linux__) +#define SKIP_PREVIOUSLY_WRITTEN 1 +#else +#define SKIP_PREVIOUSLY_WRITTEN 1 /* FIXME change to 0 before commit */ +#endif + +/* * Max distance from last checkpoint, before triggering a new xlog-based * checkpoint. */ @@ -2468,6 +2481,7 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible) bool last_iteration; bool finishing_seg; int curridx; + Size skip; int npages; int startidx; uint32 startoffset; @@ -2483,12 +2497,15 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible) /* * Since successive pages in the xlog cache are consecutively allocated, * we can usually gather multiple pages together and issue just one - * write() call. npages is the number of pages we have determined can be - * written together; startidx is the cache block index of the first one, - * and startoffset is the file offset at which it should go. The latter - * two variables are only valid when npages > 0, but we must initialize - * all of them to keep the compiler quiet. - */ + * write() call. skip is the already-written portion of the first page. + * npages is the number of pages we have determined can be written + * together; startidx is the cache block index of the first one, and + * startoffset is the file offset at which the first byte (possibly + * mid-page) should go. The latter two variables are only valid when + * npages > 0, but we must initialize all of them to keep the compiler + * quiet. + */ + skip = 0; npages = 0; startidx = 0; startoffset = 0; @@ -2514,6 +2531,11 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible) LSN_FORMAT_ARGS(LogwrtResult.Write), LSN_FORMAT_ARGS(EndPtr)); +#if SKIP_PREVIOUSLY_WRITTEN + if (npages == 0) + skip = LogwrtResult.Write % (Size) XLOG_BLCKSZ; +#endif + /* Advance LogwrtResult.Write to end of current buffer page */ LogwrtResult.Write = EndPtr; ispartialpage = WriteRqst.Write < LogwrtResult.Write; @@ -2552,8 +2574,9 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible) { /* first of group */ startidx = curridx; - startoffset = XLogSegmentOffset(LogwrtResult.Write - XLOG_BLCKSZ, - wal_segment_size); + startoffset = + XLogSegmentOffset(LogwrtResult.Write - XLOG_BLCKSZ + skip, + wal_segment_size); } npages++; @@ -2579,8 +2602,8 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible) instr_time start; /* OK to write the page(s) */ - from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ; - nbytes = npages * (Size) XLOG_BLCKSZ; + from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ + skip; + nbytes = npages * (Size) XLOG_BLCKSZ - skip; nleft = nbytes; do {