Re: BUG #16691: Autovacuum stops processing certain databases until postgresql rebooted

From: Max Vikharev <bm(dot)kinder(at)gmail(dot)com>
To: Tom Lane <tgl(at)sss(dot)pgh(dot)pa(dot)us>
Cc: Jeff Janes <jeff(dot)janes(at)gmail(dot)com>, pgsql-bugs <pgsql-bugs(at)lists(dot)postgresql(dot)org>
Subject: Re: BUG #16691: Autovacuum stops processing certain databases until postgresql rebooted
Date: 2020-12-17 13:26:38
Message-ID: CACRpr1NSNNR77jZ5oY0viv1tm-jvAvtPE1cwwXZv2pWs7qER5Q@mail.gmail.com
Views: Raw Message | Whole Thread | Download mbox | Resend email
Thread:
Lists: pgsql-bugs

Hi, Dear PostgreSQL Developers.
We are still facing this issue.
We reboot the cluster every 5-7 days. Any help, please?

вт, 3 нояб. 2020 г. в 14:14, Max Vikharev <bm(dot)kinder(at)gmail(dot)com>:

> We are facing a problem right now.
>
> In the logs we see that autovacuum is processing mostly one table in the
> database in a 1-second cycle:
>
> automatic vacuum of table "DATABASE.pg_catalog.pg_shdepend"
>
> We ran strace on "postgres: autovacuum launcher". It tries to read from
> global/679366931 in a cycle.
>
> open("global/679366931", O_RDWR) = 6
> lseek(6, 0, SEEK_END) = 16384
> kill(11798, SIGUSR1) = 0
> open("pg_stat_tmp/global.stat", O_RDONLY) = 7
> fstat(7, {st_mode=S_IFREG|0600, st_size=11292, ...}) = 0
> read(7, "\235\274\245\1\312\177n\322.V\2\0'\v\0\0\0\0\0\0\4U\0\0\0\0\0\0u\326\313\26"..., 4096) = 4096
> read(7, "iZ\272\17\344\0\0\0w\337\334}\343\0\0\0000\362\255\0\0\0\0\0\265\241\332\0\0\0\0\0"..., 4096) = 4096
> read(7, "\0\0\0\0\0\0\0\0\0\0\0\0U\2\0\0\0\0\0\0\0\0B\277\0\0\0\0\1\0\0\0"..., 4096) = 3100
> close(7) = 0
> sendto(11, "\1\0\0\0 \0\0\0\6\270v\322.V\2\0\346\26o\322.V\2\0\0\0\0\0\1\0\0\0", 32, 0, NULL, 0) = 32
> select(0, NULL, NULL, NULL, {0, 10000}) = ? ERESTARTNOHAND (To be restarted if no handler)
> --- SIGUSR2 {si_signo=SIGUSR2, si_code=SI_USER, si_pid=12052, si_uid=114} ---
> rt_sigreturn({mask=[]}) = -1 EINTR (Interrupted system call)
> open("pg_stat_tmp/global.stat", O_RDONLY) = 7
> fstat(7, {st_mode=S_IFREG|0600, st_size=11292, ...}) = 0
> read(7, "\235\274\245\1z\270v\322.V\2\0'\v\0\0\0\0\0\0\4U\0\0\0\0\0\0u\326\313\26"..., 4096) = 4096
> read(7, "iZ\272\17\344\0\0\0w\337\334}\343\0\0\0000\362\255\0\0\0\0\0\265\241\332\0\0\0\0\0"..., 4096) = 4096
> read(7, "\0\0\0\0\0\0\0\0\0\0\0\0U\2\0\0\0\0\0\0\0\0B\277\0\0\0\0\1\0\0\0"..., 4096) = 3100
> close(7) = 0
> open("pg_stat_tmp/global.stat", O_RDONLY) = 7
> fstat(7, {st_mode=S_IFREG|0600, st_size=11292, ...}) = 0
> read(7, "\235\274\245\1z\270v\322.V\2\0'\v\0\0\0\0\0\0\4U\0\0\0\0\0\0u\326\313\26"..., 4096) = 4096
> read(7, "iZ\272\17\344\0\0\0w\337\334}\343\0\0\0000\362\255\0\0\0\0\0\265\241\332\0\0\0\0\0"..., 4096) = 4096
> read(7, "\0\0\0\0\0\0\0\0\0\0\0\0U\2\0\0\0\0\0\0\0\0B\277\0\0\0\0\1\0\0\0"..., 4096) = 3100
> close(7) = 0
> close(6) = 0
> open("global/679366931", O_RDWR) = 6
> lseek(6, 0, SEEK_END) = 16384
> open("pg_stat_tmp/global.stat", O_RDONLY) = 7
> fstat(7, {st_mode=S_IFREG|0600, st_size=11292, ...}) = 0
> read(7, "\235\274\245\1z\270v\322.V\2\0'\v\0\0\0\0\0\0\4U\0\0\0\0\0\0u\326\313\26"..., 4096) = 4096
> read(7, "iZ\272\17\344\0\0\0w\337\334}\343\0\0\0000\362\255\0\0\0\0\0\265\241\332\0\0\0\0\0"..., 4096) = 4096
> read(7, "\0\0\0\0\0\0\0\0\0\0\0\0U\2\0\0\0\0\0\0\0\0B\277\0\0\0\0\1\0\0\0"..., 4096) = 3100
> close(7) = 0
> open("pg_stat_tmp/global.stat", O_RDONLY) = 7
> fstat(7, {st_mode=S_IFREG|0600, st_size=11292, ...}) = 0
> read(7, "\235\274\245\1z\270v\322.V\2\0'\v\0\0\0\0\0\0\4U\0\0\0\0\0\0u\326\313\26"..., 4096) = 4096
> read(7, "iZ\272\17\344\0\0\0w\337\334}\343\0\0\0000\362\255\0\0\0\0\0\265\241\332\0\0\0\0\0"..., 4096) = 4096
> read(7, "\0\0\0\0\0\0\0\0\0\0\0\0U\2\0\0\0\0\0\0\0\0B\277\0\0\0\0\1\0\0\0"..., 4096) = 3100
> close(7) = 0
> epoll_create1(EPOLL_CLOEXEC) = 7
> epoll_ctl(7, EPOLL_CTL_ADD, 12, {EPOLLIN|EPOLLERR|EPOLLHUP, {u32=772071960, u64=94859419771416}}) = 0
> epoll_ctl(7, EPOLL_CTL_ADD, 9, {EPOLLIN|EPOLLERR|EPOLLHUP, {u32=772071984, u64=94859419771440}}) = 0
> close(7) = 0
> epoll_create1(EPOLL_CLOEXEC) = 7
> epoll_ctl(7, EPOLL_CTL_ADD, 12, {EPOLLIN|EPOLLERR|EPOLLHUP, {u32=772071960, u64=94859419771416}}) = 0
> epoll_ctl(7, EPOLL_CTL_ADD, 9, {EPOLLIN|EPOLLERR|EPOLLHUP, {u32=772071984, u64=94859419771440}}) = 0
> epoll_wait(7, 0x56462e04e260, 1, 554) = -1 EINTR (Interrupted system call)
>
> lsof
>
> postgres 12202 postgres 6u REG 8,3 16384 47329946 /var/lib/postgresql/12/alytics_main/global/679366931
> postgres 30607 postgres 239u REG 8,3 16384 47329946 /var/lib/postgresql/12/alytics_main/global/679366931
>
> DATABASE=# SELECT pg_relation_filepath('pg_database');
> pg_relation_filepath
> ----------------------
> global/679366931
>
>
> > It sounds like it's failing to advance that database's datfrozenxid
> > and/or datminmxid. You might try to figure out which table is holding
> > those values back by comparing pg_class.relfrozenxid and relminmxid
> > to those pg_database columns. Then look to see if autovacuum is
> > failing on that table, and/or try a manual vacuum to see what happens.
> We found tables with pg_database.datfrozenxid = pg_class.relfrozenxid or
> pg_database.datminmxid = pg_class.relminmxid. We executed "vacuum verbose"
> on those tables and it did the work correctly; in the output we see that
> there were 0 dead tuples in all tables, and there are logs with
> ERROR/WARNING log level.
>
> In postgresql logs we see that after pg_toast were vacuumed autovacuum
> started to process next tables
>
> 2020-11-03 12:54:31 MSK DATABASE [local] [38649]: [2] STATEMENT: vacuum verbose pg_toast.pg_toast_687578781;
> 2020-11-03 12:55:24 MSK DATABASE [local] [38649]: [4] STATEMENT: vacuum verbose pg_toast.pg_toast_687578787;
> 2020-11-03 12:56:10 MSK DATABASE [local] [38649]: [6] STATEMENT: vacuum verbose pg_toast.pg_toast_687578793;
> 2020-11-03 12:56:32 MSK DATABASE [local] [38649]: [8] STATEMENT: vacuum verbose pg_toast.pg_toast_687578799;
> 2020-11-03 12:59:50 MSK DATABASE [33266]: [2] LOG: automatic vacuum of table "DATABASE.public.another_table": index scans: 1
>
> But it still processes only one database in a cycle. I mean that there are
> tables in other databases that should be processed and there are free
> autovacuum workers, but the queued tables are not processed.
>
> вс, 1 нояб. 2020 г. в 20:01, Tom Lane <tgl(at)sss(dot)pgh(dot)pa(dot)us>:
>
>> Max Vikharev <bm(dot)kinder(at)gmail(dot)com> writes:
>> >> finishing that table and moving to another in the same database
>>
>> > Yes, that is the case. Vacuum processes relations in one database in a cycle
>>
>> It sounds like it's failing to advance that database's datfrozenxid
>> and/or datminmxid. You might try to figure out which table is holding
>> those values back by comparing pg_class.relfrozenxid and relminmxid
>> to those pg_database columns. Then look to see if autovacuum is
>> failing on that table, and/or try a manual vacuum to see what happens.
>>
>> regards, tom lane
>>
>

In response to

Responses

Browse pgsql-bugs by date

  From Date Subject
Next Message Bertrand Guillaumin 2020-12-17 17:36:45 Re: BUG #16759: Estimation of the planner is wrong for hash join
Previous Message Michael Paquier 2020-12-17 07:19:45 Re: BUG #16758: create temporary table with the same name loses defaults, indexes