diff --git a/src/include/port/atomics/generic-xlc.h b/src/include/port/atomics/generic-xlc.h
index 5ddccff..8b5c732 100644
--- a/src/include/port/atomics/generic-xlc.h
+++ b/src/include/port/atomics/generic-xlc.h
@@ -73,11 +73,27 @@ pg_atomic_compare_exchange_u32_impl(volatile pg_atomic_uint32 *ptr,
 static inline uint32
 pg_atomic_fetch_add_u32_impl(volatile pg_atomic_uint32 *ptr, int32 add_)
 {
+	uint32 _t;
+	uint32 res;
+
 	/*
-	 * __fetch_and_add() emits a leading "sync" and trailing "isync", thereby
-	 * providing sequential consistency.  This is undocumented.
+	 * xlc has a no-longer-documented __fetch_and_add() intrinsic.  In xlc
+	 * 12.01.0000.0000, it emits a leading "sync" and trailing "isync".  In
+	 * xlc 13.01.0003.0004, it emits neither.  Hence, using the intrinsic
+	 * would add redundant syncs on xlc 12.
 	 */
-	return __fetch_and_add((volatile int *)&ptr->value, add_);
+	__asm__ __volatile__(
+		"	sync				\n"
+		"	lwarx	%1,0,%4		\n"
+		"	add		%0,%1,%3	\n"
+		"	stwcx.	%0,0,%4		\n"
+		"	bne		$-12		\n"		/* branch to lwarx */
+		"	isync				\n"
+:		"=&r"(_t), "=&r"(res), "+m"(ptr->value)
+:		"r"(add_), "r"(&ptr->value)
+:		"memory", "cc");
+
+	return res;
 }
 
 #ifdef PG_HAVE_ATOMIC_U64_SUPPORT
@@ -103,7 +119,22 @@ pg_atomic_compare_exchange_u64_impl(volatile pg_atomic_uint64 *ptr,
 static inline uint64
 pg_atomic_fetch_add_u64_impl(volatile pg_atomic_uint64 *ptr, int64 add_)
 {
-	return __fetch_and_addlp((volatile long *)&ptr->value, add_);
+	uint64 _t;
+	uint64 res;
+
+	/* Like u32, but s/lwarx/ldarx/; s/stwcx/stdcx/ */
+	__asm__ __volatile__(
+		"	sync				\n"
+		"	ldarx	%1,0,%4		\n"
+		"	add		%0,%1,%3	\n"
+		"	stdcx.	%0,0,%4		\n"
+		"	bne		$-12		\n"		/* branch to ldarx */
+		"	isync				\n"
+:		"=&r"(_t), "=&r"(res), "+m"(ptr->value)
+:		"r"(add_), "r"(&ptr->value)
+:		"memory", "cc");
+
+	return res;
 }
 
 #endif /* PG_HAVE_ATOMIC_U64_SUPPORT */
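
For reviewers unfamiliar with the POWER load-reserve/store-conditional idiom, here is a
standalone sketch of the loop the patch introduces.  It is not part of the patch: the file
and function names are hypothetical, it assumes a GCC-compatible compiler targeting POWER,
and it uses a "1:"/"bne 1b" local label in place of the xlc assembler's "$-12" backward
offset.

/*
 * Sketch of the lwarx/stwcx. fetch-and-add loop, with the leading "sync"
 * and trailing "isync" that give the full-barrier semantics
 * pg_atomic_fetch_add_u32_impl() promises.
 */
#include <stdint.h>
#include <stdio.h>

static inline uint32_t
fetch_add_u32(volatile uint32_t *value, int32_t add_)
{
	uint32_t	_t;
	uint32_t	res;

	__asm__ __volatile__(
		"	sync				\n"	/* full barrier before the update */
		"1:	lwarx	%1,0,%4		\n"	/* load old value, take reservation */
		"	add		%0,%1,%3	\n"	/* _t = old + add_ */
		"	stwcx.	%0,0,%4		\n"	/* store iff reservation still held */
		"	bne		1b			\n"	/* reservation lost: retry */
		"	isync				\n"	/* keep later accesses after the loop */
:		"=&r"(_t), "=&r"(res), "+m"(*value)
:		"r"(add_), "r"(value)
:		"memory", "cc");

	return res;					/* value observed before the addition */
}

int
main(void)
{
	volatile uint32_t counter = 40;
	uint32_t	old = fetch_add_u32(&counter, 2);

	printf("old=%u new=%u\n", old, counter);
	return 0;
}

Running this on a POWER machine should print "old=40 new=42".  The u64 variant in the
patch is identical apart from the doubleword mnemonics (ldarx/stdcx.), as its comment
notes.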