BUG #14345: run multi-PG9.6 on One Host, hang when checkpoint

From: digoal(at)126(dot)com
To: pgsql-bugs(at)postgresql(dot)org
Subject: BUG #14345: run multi-PG9.6 on One Host, hang when checkpoint
Date: 2016-09-29 05:35:45
Message-ID: 20160929053545.20224.32009@wrigleys.postgresql.org
Views: Raw Message | Whole Thread | Download mbox | Resend email
Thread:
Lists: pgsql-bugs

The following bug has been logged on the website:

Bug reference: 14345
Logged by: Zhou Digoal
Email address: digoal(at)126(dot)com
PostgreSQL version: 9.6rc1
Operating system: CentOS 6.x x64
Description:


When I was using the PostgreSQL 9.6 test machine multiple instances,
encountered the problem of Hang checkpoint.
The following is a test procedure
LVM deployment :
Optional, if your host has multiple pieces of equipment, you can use the LVM
to improve the overall throughput and IOPS capabilities, of course, you can
also use a number of table space to make use of different block devices.
Give an example
```
4 * (3 DISK RAID 5)

4: -i (4 groups, so the band width is set to 4)
16: -I (3-1) * 8K (each group of 3 disk, the actual data disk is 2, so the
strip size is set to 2*8KB (8K index block size, or WAL block size)
Partition and align for example (assuming SSD, need to align))

parted -s /dev/sdb mklable gpt
parted -s /dev/sdc mklable gpt
parted -s /dev/sdd mklable gpt
parted -s /dev/sde mklable gpt
parted -s /dev/sdb mkpart primary 1MiB xxxxGB
parted -s /dev/sdc mkpart primary 1MiB xxxxGB
parted -s /dev/sdd mkpart primary 1MiB xxxxGB
parted -s /dev/sde mkpart primary 1MiB xxxxGB

pvcreate /dev/sd[bcde]1
vgcreate -s 128MB vgdata01 /dev/sd[bcde]1
lvcreate -i 4 -I 16 -n lv01 -L 5T vgdata01
lvcreate -i 4 -I 16 -n lv02 -l 100%FREE vgdata01

mkfs.ext4 /dev/mapper/vgdata01-lv01 -m 0 -O extent,uninit_bg -E
lazy_itable_init=1 -T largefile -L lv01
mkfs.ext4 /dev/mapper/vgdata01-lv02 -m 0 -O extent,uninit_bg -E
lazy_itable_init=1 -T largefile -L lv02
mkdir /u01
mkdir /u02
mount -o defaults,noatime,nodiratime,nodelalloc,barrier=0,data=writeback
LABEL=lv01 /u01
mount -o defaults,noatime,nodiratime,nodelalloc,barrier=0,data=writeback
LABEL=lv02 /u02

mkdir -p /data01/digoal
mkdir -p /data02/digoal
chown digoal /data01/digoal
chown digoal /data02/digoal

echo deadline > /sys/block/sdb/queue/scheduler
echo deadline > /sys/block/sdc/queue/scheduler
echo deadline > /sys/block/sdd/queue/scheduler
echo deadline > /sys/block/sde/queue/scheduler
```

OS parameter
```
1. /etc/sysctl.conf

fs.aio-max-nr = 1048576
fs.file-max = 76724600
kernel.core_pattern= /data01/corefiles/core_%e_%u_%t_%s.%p
kernel.sem = 4096 2147483647 2147483646 512000
kernel.shmall = 107374182
kernel.shmmax = 274877906944
kernel.shmmni = 819200
net.core.netdev_max_backlog = 10000
net.core.rmem_default = 262144
net.core.rmem_max = 4194304
net.core.wmem_default = 262144
net.core.wmem_max = 4194304
net.core.somaxconn = 4096
net.ipv4.tcp_max_syn_backlog = 4096
net.ipv4.tcp_keepalive_intvl = 20
net.ipv4.tcp_keepalive_probes = 3
net.ipv4.tcp_keepalive_time = 60
net.ipv4.tcp_mem = 8388608 12582912 16777216
net.ipv4.tcp_fin_timeout = 5
net.ipv4.tcp_synack_retries = 2
net.ipv4.tcp_syncookies = 1
net.ipv4.tcp_timestamps = 1
net.ipv4.tcp_tw_recycle = 0
net.ipv4.tcp_tw_reuse = 1
net.ipv4.tcp_max_tw_buckets = 262144
net.ipv4.tcp_rmem = 8192 87380 16777216
net.ipv4.tcp_wmem = 8192 65536 16777216
net.nf_conntrack_max = 1200000
net.netfilter.nf_conntrack_max = 1200000
vm.dirty_background_bytes = 409600000
vm.dirty_expire_centisecs = 3000
vm.dirty_ratio = 95
vm.dirty_writeback_centisecs = 100
vm.extra_free_kbytes = 4096000
vm.min_free_kbytes = 2097152
vm.mmap_min_addr = 65536
vm.overcommit_memory = 0
vm.overcommit_ratio = 90
vm.swappiness = 0
vm.zone_reclaim_mode = 0
net.ipv4.ip_local_port_range = 40000 65535
2. /etc/security/limits.conf

* soft nofile 655360
* hard nofile 655360
* soft nproc 655360
* hard nproc 655360
* soft memlock unlimited
* hard memlock unlimited
* soft core unlimited
* hard core unlimited
3. disable thp

cat /sys/kernel/mm/transparent_hugepage/enabled
cat /sys/kernel/mm/transparent_hugepage/defrag
cat /sys/kernel/mm/redhat_transparent_hugepage/enabled
cat /sys/kernel/mm/redhat_transparent_hugepage/defrag

[never]
4. io schedular

/boot/grub/grub.conf

title xxxxxxxxx
root(0,0)
kernel /vmlinuz-............. numa=off elevator=deadline
5. disable selinux

cat /etc/selinux/config

SELINUX=disabled
SELINUXTYPE=targeted
6. cgroup

yum install -y libcgroup

mkdir -p /cgroup/cpu
mkdir -p /cgroup/cpuacct
mkdir -p /cgroup/memory
mkdir -p /cgroup/blkio

mount -t cgroup -o cpu cpu /cgroup/cpu
mount -t cgroup -o cpuacct cpuacct /cgroup/cpuacct
mount -t cgroup -o memory memory /cgroup/memory
mount -t cgroup -o blkio blkio /cgroup/blkio
```

test

100 PostgreSQL clusters, 100 million records for each instance.
Pressure measurement on conflict update insert (pre insertion, late full
update, a larger test of the IO)
Each instance shared buffer 2GB.
(using CGROUP)
Be tested for machine deployment

```

yum -y install rsync coreutils glib2 lrzsz sysstat e4fsprogs xfsprogs ntp
readline-devel zlib zlib-devel openssl openssl-devel pam-devel libxml2-devel
libxslt-devel python-devel tcl-devel gcc make smartmontools flex bison perl
perl-devel perl-ExtUtils* OpenIPMI-tools openldap openldap-devel

tar -jxvf postgresql-9.6rc1.tar.bz2
cd postgresql-9.6rc1
./configure --prefix=/home/digoal/pgsql9.6rc1
make world -j 32
make install-world

$ vi ~/env.sh
export PS1="$USER(at)`/bin/hostname -s`-> "
export PGPORT=$1
export PGDATA=/data02/digoal/pg_root$PGPORT
export LANG=en_US.utf8
export PGHOME=/home/digoal/pgsql9.6rc1
export
LD_LIBRARY_PATH=$PGHOME/lib:/lib64:/usr/lib64:/usr/local/lib64:/lib:/usr/lib:/usr/local/lib:$LD_LIBRARY_PATH
export DATE=`date +"%Y%m%d%H%M"`
export PATH=$PGHOME/bin:$PATH:.
export MANPATH=$PGHOME/share/man:$MANPATH
export PGHOST=$PGDATA
export PGUSER=postgres
export PGDATABASE=postgres
alias rm='rm -i'
alias ll='ls -lh'
unalias vi

$ vi ~/init.sh

for ((i=1921;i<1921+$1;i++))
do
. ~/env.sh $i
initdb -D $PGDATA -E UTF8 --locale=C -U postgres -X
/data01/digoal/pg_xlog$i
echo "local all all
trust" > $PGDATA/pg_hba.conf
echo "host all all 127.0.0.1/32
trust" >> $PGDATA/pg_hba.conf
echo "host all all ::1/128
trust" >> $PGDATA/pg_hba.conf
echo "host all all 0.0.0.0/0
trust" >> $PGDATA/pg_hba.conf
done

$ rm -rf /data01/digoal/*
$ rm -rf /data02/digoal/*
$ cd ~
$ . ~/init.sh 100

# dmsetup ls
vgdata01-lv02 (253, 1)
vgdata01-lv01 (253, 0)

xlog iops 4000
data iops 800
cpu 0.7core, cycle 1sec, quota 0.7sec
memory, 4G

$ vi ~/start.sh

for ((i=1921;i<1921+$1;i++))
do
. /home/digoal/env.sh $i
cgcreate -g cpu:RULE$i
cgcreate -g cpuacct:RULE$i
cgcreate -g memory:RULE$i
cgcreate -g blkio:RULE$i

echo "253:0 4000" >
/cgroup/blkio/RULE$i/blkio.throttle.write_iops_device
echo "253:0 4000" > /cgroup/blkio/RULE$i/blkio.throttle.read_iops_device
echo "253:1 800" > /cgroup/blkio/RULE$i/blkio.throttle.write_iops_device
echo "253:1 800" > /cgroup/blkio/RULE$i/blkio.throttle.read_iops_device
echo "70" > /cgroup/cpu/RULE$i/cpu.shares
echo "1000000" > /cgroup/cpu/RULE$i/cpu.cfs_period_us
echo "700000" > /cgroup/cpu/RULE$i/cpu.cfs_quota_us
echo "1000000" > /cgroup/cpu/RULE$i/cpu.rt_period_us
echo "1000" > /cgroup/cpu/RULE$i/cpu.rt_runtime_us
echo "4294967296" > /cgroup/memory/RULE$i/memory.limit_in_bytes

cgexec -g cpu:RULE$i -g cpuacct:RULE$i -g memory:RULE$i -g blkio:RULE$i su
- digoal -c ". ~/env.sh $i ; nohup postgres -B 1GB -c port=$i -c
listen_addresses='0.0.0.0' -c synchronous_commit=on -c full_page_writes=on
-c wal_buffers=128MB -c wal_writer_flush_after=0 -c bgwriter_delay=10ms -c
max_connections=100 -c bgwriter_lru_maxpages=1000 -c
bgwriter_lru_multiplier=10.0 -c unix_socket_directories='.' -c
max_wal_size=4GB -c checkpoint_timeout=30min -c
checkpoint_completion_target=0.5 -c log_checkpoints=on -c log_connections=on
-c log_disconnections=on -c log_error_verbosity=verbose -c
autovacuum_vacuum_scale_factor=0.002 -c autovacuum_max_workers=4 -c
autovacuum_naptime=5s -c random_page_cost=1.0 -c constraint_exclusion=on -c
log_destination='csvlog' -c logging_collector=on -c
maintenance_work_mem=256MB -c autovacuum_work_mem=256MB -D $PGDATA -k
$PGDATA >/dev/null 2>&1 &"
done

$ sudo bash -c "bash"
# . /home/digoal/start.sh 100

$ vi ~/stop.sh

for ((i=1921;i<1921+$1;i++))
do
. /home/digoal/env.sh $i
cgdelete cpu:RULE$i
cgdelete cpuacct:RULE$i
cgdelete blkio:RULE$i
su - digoal -c ". ~/env.sh $i ; pg_ctl stop -m fast -w -t 600 -D
$PGDATA"
echo "0" > /cgroup/memory/RULE$i/memory.force_empty
cgdelete memory:RULE$i
done

$ sudo bash -c "bash"
# . /home/digoal/stop.sh 100
```

test client
```

mkdir /data01/digoal
chown digoal /data01/digoal

$ vi ~/env.sh

export PS1="$USER(at)`/bin/hostname -s`-> "
export PGPORT=$1
export PGDATA=/data02/digoal/pg_root$PGPORT
export LANG=en_US.utf8
export PGHOME=/home/digoal/pgsql9.6rc1
export
LD_LIBRARY_PATH=$PGHOME/lib:/lib64:/usr/lib64:/usr/local/lib64:/lib:/usr/lib:/usr/local/lib:$LD_LIBRARY_PATH
export DATE=`date +"%Y%m%d%H%M"`
export PATH=$PGHOME/bin:$PATH:.
export MANPATH=$PGHOME/share/man:$MANPATH
export PGHOST=$PGDATA
export PGUSER=postgres
export PGDATABASE=postgres
alias rm='rm -i'
alias ll='ls -lh'
unalias vi

$ vi ~/pgbench_init.sh

for ((i=1921;i<1921+$1;i++))
do
. ~/env.sh $i
export PGHOST=$2
psql -c "drop table if exists test; create table test(id int primary key,
info text, crt_time timestamp);"
echo "\set id random(1,100000000)" > ~/test$i.sql
echo "insert into test (id,info,crt_time) values (:id,
md5(random()::text), now()) on conflict on constraint test_pkey do update
set info=excluded.info, crt_time=excluded.crt_time;" >> ~/test$i.sql
done

$ vi ~/pgbench.sh

mkdir -p /data01/digoal/log
for ((i=1921;i<1921+$1;i++))
do
. ~/env.sh $i
export PGHOST=$2
nohup pgbench -M prepared -f ~/test$i.sql -n -r -P 1 -c 4 -j 4 -T 2592000
>>/data01/digoal/log/$2_$i.log 2>&1 & # 每个实例测试4个连接
done

cd ~
. ~/pgbench.sh 100 $dest_ip

$ vi test.sh

#!/bin/bash

cd ~
. ~/pgbench_init.sh 100 $1

for ((i=1;i>0;i=1))
do
sleep 1
CNT=`ps -ewf|grep pgbench|grep -c -v grep`
if [ $CNT -eq 0 ]; then
. ~/pgbench.sh 100 $1
exit
fi
sleep 1
done

$ chmod 500 test.sh

nohup ./test.sh $dest_ip >/dev/null 2>&1 &
```

result :
HANG when checkpoint (fsync pharse):
```
$ cat /data01/digoal/log/$2_$i.log

$ head -n 30000 /data01/digoal/log/$2_$i.log |tail -n 7200 > /tmp/1
$ cat /tmp/1|awk '{print $4 "," $7 "," $10}' >/tmp/2

while(true) ; do cat /proc/meminfo |grep -E "Dirty|Writeback"; sleep 0.5;
done

Dirty: 24752872 kB
Writeback: 11312408 kB
WritebackTmp: 0 kB

progress: 1321.0 s, 0.0 tps, lat -nan ms stddev -nan
progress: 1322.0 s, 0.0 tps, lat -nan ms stddev -nan
progress: 1323.0 s, 0.0 tps, lat -nan ms stddev -nan
progress: 1324.0 s, 0.0 tps, lat -nan ms stddev -nan
progress: 1325.0 s, 0.0 tps, lat -nan ms stddev -nan
progress: 1326.0 s, 0.0 tps, lat -nan ms stddev -nan
progress: 1327.0 s, 0.0 tps, lat -nan ms stddev -nan
progress: 1328.0 s, 0.0 tps, lat -nan ms stddev -nan
progress: 1329.0 s, 0.0 tps, lat -nan ms stddev -nan
progress: 1330.0 s, 0.0 tps, lat -nan ms stddev -nan
```

process stats
```

PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
49799 digoal 20 0 1300m 155m 155m S 0.0 0.0 0:00.59 postgres -B 1GB
-c port=1922 -c listen_addresses=0.0.0.0 -c synchronous_commit=on -c
full_page_writes=on -c wal_buffers=128MB -c wal_writer_flush_after=0 -c
bgwriter_delay=10ms
49844 digoal 20 0 1300m 129m 128m S 0.0 0.0 0:09.01 postgres: wal
writer process


49845 digoal 20 0 1300m 1952 1224 S 0.0 0.0 0:05.71 postgres:
autovacuum launcher process


49838 digoal 20 0 113m 892 460 S 0.0 0.0 0:00.03 postgres: logger
process


16531 digoal 20 0 1300m 1.1g 1.1g D 0.0 0.2 1:22.71 postgres:
postgres postgres 127.0.0.1(49777) INSERT


16534 digoal 20 0 1300m 1.1g 1.1g D 0.0 0.2 1:22.32 postgres:
postgres postgres 127.0.0.1(49778) INSERT


16535 digoal 20 0 1300m 1.1g 1.1g D 0.0 0.2 1:22.73 postgres:
postgres postgres 127.0.0.1(49780) INSERT


16537 digoal 20 0 1300m 1.1g 1.1g D 0.0 0.2 1:22.43 postgres:
postgres postgres 127.0.0.1(49781) INSERT


49842 digoal 20 0 1301m 1.0g 1.0g D 0.0 0.2 0:23.70 postgres:
checkpointer process


49846 digoal 20 0 115m 1048 552 D 0.0 0.0 0:12.83 postgres: stats
collector process


49843 digoal 20 0 1300m 978m 977m D 0.0 0.2 0:46.35 postgres: writer
process
```

pstack
```

checkpointer

cat /proc/49842/stack
[<ffffffff81121281>] generic_file_aio_write+0x71/0x100
[<ffffffffa00c0463>] ext4_file_write+0x43/0xe0 [ext4]
[<ffffffff8118863a>] do_sync_write+0xfa/0x140
[<ffffffff81188938>] vfs_write+0xb8/0x1a0
[<ffffffff81189231>] sys_write+0x51/0x90
[<ffffffff8100c072>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff

stats

cat /proc/49846/stack
[<ffffffffa00a708a>] start_this_handle+0x25a/0x480 [jbd2]
[<ffffffffa00a7495>] jbd2_journal_start+0xb5/0x100 [jbd2]
[<ffffffffa00e4b24>] ext4_journal_start_sb+0x74/0x140 [ext4]
[<ffffffffa00d20ba>] ext4_create+0x7a/0x150 [ext4]
[<ffffffff811972c4>] vfs_create+0xb4/0xe0
[<ffffffff8119ad90>] do_filp_open+0xb10/0xdd0
[<ffffffff81185829>] do_sys_open+0x69/0x140
[<ffffffff81185940>] sys_open+0x20/0x30
[<ffffffff8100c072>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff

bgwriter

cat /proc/49843/stack
[<ffffffffa00a708a>] start_this_handle+0x25a/0x480 [jbd2]
[<ffffffffa00a7495>] jbd2_journal_start+0xb5/0x100 [jbd2]
[<ffffffffa00e4b24>] ext4_journal_start_sb+0x74/0x140 [ext4]
[<ffffffffa00c896a>] ext4_dirty_inode+0x2a/0x60 [ext4]
[<ffffffff811b461b>] __mark_inode_dirty+0x3b/0x160
[<ffffffff811a3e12>] file_update_time+0xf2/0x170
[<ffffffff81120fb0>] __generic_file_aio_write+0x230/0x490
[<ffffffff81121298>] generic_file_aio_write+0x88/0x100
[<ffffffffa00c0463>] ext4_file_write+0x43/0xe0 [ext4]
[<ffffffff8118863a>] do_sync_write+0xfa/0x140
[<ffffffff81188938>] vfs_write+0xb8/0x1a0
[<ffffffff81189231>] sys_write+0x51/0x90
[<ffffffff8100c072>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff

backend process

cat /proc/16537/stack
[<ffffffffa00bfff0>] ext4_llseek+0x60/0x110 [ext4]
[<ffffffff81186eda>] vfs_llseek+0x3a/0x40
[<ffffffff81188b96>] sys_lseek+0x66/0x80
[<ffffffff8100c072>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff

logger

cat /proc/49838/stack
[<ffffffffa00a708a>] start_this_handle+0x25a/0x480 [jbd2]
[<ffffffffa00a7495>] jbd2_journal_start+0xb5/0x100 [jbd2]
[<ffffffffa00e4b24>] ext4_journal_start_sb+0x74/0x140 [ext4]
[<ffffffffa00c896a>] ext4_dirty_inode+0x2a/0x60 [ext4]
[<ffffffff811b461b>] __mark_inode_dirty+0x3b/0x160
[<ffffffff811a3e12>] file_update_time+0xf2/0x170
[<ffffffff81120fb0>] __generic_file_aio_write+0x230/0x490
[<ffffffff81121298>] generic_file_aio_write+0x88/0x100
[<ffffffffa00c0463>] ext4_file_write+0x43/0xe0 [ext4]
[<ffffffff8118863a>] do_sync_write+0xfa/0x140
[<ffffffff81188938>] vfs_write+0xb8/0x1a0
[<ffffffff81189231>] sys_write+0x51/0x90
[<ffffffff8100c072>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff

wal writer

cat /proc/49844/stack
[<ffffffff811d0bfd>] ep_poll+0x2ad/0x330
[<ffffffff811d0d45>] sys_epoll_wait+0xc5/0xe0
[<ffffffff8100c072>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff

```

file system & os dirty page flush parameter
```
/dev/mapper/vgdata01-lv01 on /u01 type ext4
(rw,noatime,nodiratime,nodelalloc,barrier=0,data=writeback)
/dev/mapper/vgdata01-lv02 on /u02 type ext4
(rw,noatime,nodiratime,nodelalloc,barrier=0,data=writeback)

vm.dirty_background_ratio = 0
vm.dirty_background_bytes = 102400000
vm.dirty_ratio = 95
vm.dirty_bytes = 0
vm.dirty_writeback_centisecs = 100
vm.dirty_expire_centisecs = 3000
```

Responses

Browse pgsql-bugs by date

  From Date Subject
Next Message David Rowley 2016-09-29 05:41:21 Re: BUG #14344: string_agg(DISTINCT ..) crash
Previous Message Michael Paquier 2016-09-29 04:36:25 Re: BUG #14344: string_agg(DISTINCT ..) crash