Re: BUG #16691: Autovacuum stops processing certain databases until postgresql rebooted

From: Max Vikharev <bm(dot)kinder(at)gmail(dot)com>
To: Tom Lane <tgl(at)sss(dot)pgh(dot)pa(dot)us>
Cc: Jeff Janes <jeff(dot)janes(at)gmail(dot)com>, pgsql-bugs <pgsql-bugs(at)lists(dot)postgresql(dot)org>
Subject: Re: BUG #16691: Autovacuum stops processing certain databases until postgresql rebooted
Date: 2020-11-03 11:14:32
Message-ID: CACRpr1N3xndgt8Sz6x1b2ChzDk_cQTSk5_C2qSZM0BvngzrhGw@mail.gmail.com
Lists: pgsql-bugs

We are facing the problem right now.

In the logs we see that autovacuum is processing mostly one table in the
database, on a roughly one-second cycle:

automatic vacuum of table "DATABASE.pg_catalog.pg_shdepend"

We ran strace on "postgres: autovacuum launcher". It reads from
global/679366931 in a loop:

open("global/679366931", O_RDWR) = 6
lseek(6, 0, SEEK_END) = 16384
kill(11798, SIGUSR1) = 0
open("pg_stat_tmp/global.stat", O_RDONLY) = 7
fstat(7, {st_mode=S_IFREG|0600, st_size=11292, ...}) = 0
read(7, "\235\274\245\1\312\177n\322.V\2\0'\v\0\0\0\0\0\0\4U\0\0\0\0\0\0u\326\313\26"...,
4096) = 4096
read(7, "iZ\272\17\344\0\0\0w\337\334}\343\0\0\0000\362\255\0\0\0\0\0\265\241\332\0\0\0\0\0"...,
4096) = 4096
read(7, "\0\0\0\0\0\0\0\0\0\0\0\0U\2\0\0\0\0\0\0\0\0B\277\0\0\0\0\1\0\0\0"...,
4096) = 3100
close(7) = 0
sendto(11, "\1\0\0\0
\0\0\0\6\270v\322.V\2\0\346\26o\322.V\2\0\0\0\0\0\1\0\0\0", 32, 0,
NULL, 0) = 32
select(0, NULL, NULL, NULL, {0, 10000}) = ? ERESTARTNOHAND (To be
restarted if no handler)
--- SIGUSR2 {si_signo=SIGUSR2, si_code=SI_USER, si_pid=12052, si_uid=114} ---
rt_sigreturn({mask=[]}) = -1 EINTR (Interrupted system call)
open("pg_stat_tmp/global.stat", O_RDONLY) = 7
fstat(7, {st_mode=S_IFREG|0600, st_size=11292, ...}) = 0
read(7, "\235\274\245\1z\270v\322.V\2\0'\v\0\0\0\0\0\0\4U\0\0\0\0\0\0u\326\313\26"...,
4096) = 4096
read(7, "iZ\272\17\344\0\0\0w\337\334}\343\0\0\0000\362\255\0\0\0\0\0\265\241\332\0\0\0\0\0"...,
4096) = 4096
read(7, "\0\0\0\0\0\0\0\0\0\0\0\0U\2\0\0\0\0\0\0\0\0B\277\0\0\0\0\1\0\0\0"...,
4096) = 3100
close(7) = 0
open("pg_stat_tmp/global.stat", O_RDONLY) = 7
fstat(7, {st_mode=S_IFREG|0600, st_size=11292, ...}) = 0
read(7, "\235\274\245\1z\270v\322.V\2\0'\v\0\0\0\0\0\0\4U\0\0\0\0\0\0u\326\313\26"...,
4096) = 4096
read(7, "iZ\272\17\344\0\0\0w\337\334}\343\0\0\0000\362\255\0\0\0\0\0\265\241\332\0\0\0\0\0"...,
4096) = 4096
read(7, "\0\0\0\0\0\0\0\0\0\0\0\0U\2\0\0\0\0\0\0\0\0B\277\0\0\0\0\1\0\0\0"...,
4096) = 3100
close(7) = 0
close(6) = 0
open("global/679366931", O_RDWR) = 6
lseek(6, 0, SEEK_END) = 16384
open("pg_stat_tmp/global.stat", O_RDONLY) = 7
fstat(7, {st_mode=S_IFREG|0600, st_size=11292, ...}) = 0
read(7, "\235\274\245\1z\270v\322.V\2\0'\v\0\0\0\0\0\0\4U\0\0\0\0\0\0u\326\313\26"...,
4096) = 4096
read(7, "iZ\272\17\344\0\0\0w\337\334}\343\0\0\0000\362\255\0\0\0\0\0\265\241\332\0\0\0\0\0"...,
4096) = 4096
read(7, "\0\0\0\0\0\0\0\0\0\0\0\0U\2\0\0\0\0\0\0\0\0B\277\0\0\0\0\1\0\0\0"...,
4096) = 3100
close(7) = 0
open("pg_stat_tmp/global.stat", O_RDONLY) = 7
fstat(7, {st_mode=S_IFREG|0600, st_size=11292, ...}) = 0
read(7, "\235\274\245\1z\270v\322.V\2\0'\v\0\0\0\0\0\0\4U\0\0\0\0\0\0u\326\313\26"...,
4096) = 4096
read(7, "iZ\272\17\344\0\0\0w\337\334}\343\0\0\0000\362\255\0\0\0\0\0\265\241\332\0\0\0\0\0"...,
4096) = 4096
read(7, "\0\0\0\0\0\0\0\0\0\0\0\0U\2\0\0\0\0\0\0\0\0B\277\0\0\0\0\1\0\0\0"...,
4096) = 3100
close(7) = 0
epoll_create1(EPOLL_CLOEXEC) = 7
epoll_ctl(7, EPOLL_CTL_ADD, 12, {EPOLLIN|EPOLLERR|EPOLLHUP,
{u32=772071960, u64=94859419771416}}) = 0
epoll_ctl(7, EPOLL_CTL_ADD, 9, {EPOLLIN|EPOLLERR|EPOLLHUP,
{u32=772071984, u64=94859419771440}}) = 0
close(7) = 0
epoll_create1(EPOLL_CLOEXEC) = 7
epoll_ctl(7, EPOLL_CTL_ADD, 12, {EPOLLIN|EPOLLERR|EPOLLHUP,
{u32=772071960, u64=94859419771416}}) = 0
epoll_ctl(7, EPOLL_CTL_ADD, 9, {EPOLLIN|EPOLLERR|EPOLLHUP,
{u32=772071984, u64=94859419771440}}) = 0
epoll_wait(7, 0x56462e04e260, 1, 554) = -1 EINTR (Interrupted system call)

lsof

postgres 12202 postgres 6u REG 8,3
16384 47329946 /var/lib/postgresql/12/alytics_main/global/679366931
postgres 30607 postgres 239u REG 8,3
16384 47329946 /var/lib/postgresql/12/alytics_main/global/679366931

DATABASE=# SELECT pg_relation_filepath('pg_database');
pg_relation_filepath
----------------------
global/679366931
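
(The reverse lookup is also possible: pg_filenode_relation() maps a
tablespace/filenode pair back to a relation. A sketch, using the pg_global
tablespace OID because the file lives under global/, which should likewise
point at pg_database:)

SELECT pg_filenode_relation(ts.oid, 679366931) AS rel
FROM pg_tablespace ts
WHERE ts.spcname = 'pg_global';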

> It sounds like it's failing to advance that database's datfrozenxid
> and/or datminmxid. You might try to figure out which table is holding
> those values back by comparing pg_class.relfrozenxid and relminmxid
> to those pg_database columns. Then look to see if autovacuum is
> failing on that table, and/or try a manual vacuum to see what happens.
We found tables with pg_database.datfrozenxid = pg_class.relfrozenxid or
pg_database.datminmxid = pg_class.relminmxid. We executed "vacuum verbose"
on those tables and it did the work correctly: the output showed 0 dead
tuples in all of them, and there were no ERROR- or WARNING-level messages
in the logs.
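
The comparison can be expressed with a query along these lines (a sketch,
run in the affected database; the relkind filter is only there to skip
relations without storage):

SELECT c.oid::regclass AS rel,
       c.relfrozenxid, d.datfrozenxid,
       c.relminmxid, d.datminmxid
FROM pg_class c, pg_database d
WHERE d.datname = current_database()
  AND c.relkind IN ('r', 't', 'm')
  AND (c.relfrozenxid = d.datfrozenxid OR c.relminmxid = d.datminmxid);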

In the PostgreSQL logs we see that after the pg_toast tables were vacuumed,
autovacuum started to process the next tables:

2020-11-03 12:54:31 MSK DATABASE [local] [38649]: [2] STATEMENT:
vacuum verbose pg_toast.pg_toast_687578781;
2020-11-03 12:55:24 MSK DATABASE [local] [38649]: [4] STATEMENT:
vacuum verbose pg_toast.pg_toast_687578787;
2020-11-03 12:56:10 MSK DATABASE [local] [38649]: [6] STATEMENT:
vacuum verbose pg_toast.pg_toast_687578793;
2020-11-03 12:56:32 MSK DATABASE [local] [38649]: [8] STATEMENT:
vacuum verbose pg_toast.pg_toast_687578799;
2020-11-03 12:59:50 MSK DATABASE [33266]: [2] LOG: automatic vacuum
of table "DATABASE.public.another_table": index scans: 1

But it still processes only one database in a loop. I mean that there are
tables in other databases that should be processed, and there are free
autovacuum workers, but those queued tables are not being processed.
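
A quick way to see which databases still have tables due for wraparound
vacuuming is something like this (a sketch; mxid_age() is available in
PostgreSQL 9.5 and later):

SELECT datname,
       age(datfrozenxid) AS xid_age,
       mxid_age(datminmxid) AS mxid_age
FROM pg_database
ORDER BY age(datfrozenxid) DESC;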

On Sun, 1 Nov 2020 at 20:01, Tom Lane <tgl(at)sss(dot)pgh(dot)pa(dot)us> wrote:

> Max Vikharev <bm(dot)kinder(at)gmail(dot)com> writes:
> >> finishing that table and moving to another in the same database
>
> > Yes, that case. Vacuum processes relations in one database in a loop.
>
> It sounds like it's failing to advance that database's datfrozenxid
> and/or datminmxid. You might try to figure out which table is holding
> those values back by comparing pg_class.relfrozenxid and relminmxid
> to those pg_database columns. Then look to see if autovacuum is
> failing on that table, and/or try a manual vacuum to see what happens.
>
> regards, tom lane
>
