Re: pg_basebackup: return value 1: reason?

From: Jerry Sievers <gsievers19(at)comcast(dot)net>
To: Andrej Vanek <andrej(dot)vanek(dot)sk(at)gmail(dot)com>
Cc: pgsql-general(at)postgresql(dot)org
Subject: Re: pg_basebackup: return value 1: reason?
Date: 2016-04-15 23:18:15
Message-ID: 86inzimpfs.fsf@jerry.enova.com
Views: Raw Message | Whole Thread | Download mbox | Resend email
Thread:
Lists: pgsql-general

Andrej Vanek <andrej(dot)vanek(dot)sk(at)gmail(dot)com> writes:

> Hello, 
>
> I tried to run pg_basebackup. Return value is 1.
>
> How to find out its reason?
> (I suspect that some wal after backup is missing- but how to find out the real reason? How to fix it?)

Well there are more than 80 cases of exit 1 in a couple .c files and
most/all are preceeded unsurprisingly with an fprintf(stderr.

Are you not getting something informative on your display and/or sending
stderr to somewher else?

$ echo $*
./src/bin/pg_basebackup/pg_basebackup.c ./src/backend/replication/basebackup.c

$ grep -h -B 4 -i 'exit.*(1' $* | egrep -hi 'fprintf\(stderr|exit'
fprintf(stderr, _("%s: directory name too long\n"), progname);
exit(1);
fprintf(stderr, _("%s: multiple \"=\" signs in tablespace mapping\n"), progname);
exit(1);
fprintf(stderr,
exit(1);
fprintf(stderr, _("%s: old directory is not an absolute path in tablespace mapping: %s\n"),
exit(1);
fprintf(stderr, _("%s: new directory is not an absolute path in tablespace mapping: %s\n"),
exit(1);
fprintf(stderr, _("%s: could not read from ready pipe: %s\n"),
exit(1);
fprintf(stderr,
exit(1);
fprintf(stderr,
disconnect_and_exit(1);
fprintf(stderr,
disconnect_and_exit(1);
exit(1);
fprintf(stderr,
disconnect_and_exit(1);
fprintf(stderr, _("%s: could not create background process: %s\n"),
disconnect_and_exit(1);
fprintf(stderr, _("%s: could not create background thread: %s\n"),
disconnect_and_exit(1);
fprintf(stderr,
disconnect_and_exit(1);
fprintf(stderr,
disconnect_and_exit(1);
fprintf(stderr, _("%s: could not access directory \"%s\": %s\n"),
disconnect_and_exit(1);
fprintf(stderr,
exit(1);
fprintf(stderr,
exit(1);
fprintf(stderr, _("%s: transfer rate must be greater than zero\n"),
exit(1);
fprintf(stderr,
exit(1);
fprintf(stderr,
exit(1);
fprintf(stderr,
exit(1);
fprintf(stderr,
disconnect_and_exit(1);
fprintf(stderr, _("%s: could not write to file \"%s\": %s\n"),
disconnect_and_exit(1);
fprintf(stderr,
disconnect_and_exit(1);
fprintf(stderr,
disconnect_and_exit(1);
fprintf(stderr,
disconnect_and_exit(1);
fprintf(stderr,
disconnect_and_exit(1);
fprintf(stderr, _("%s: could not create file \"%s\": %s\n"),
disconnect_and_exit(1);
fprintf(stderr, _("%s: could not get COPY data stream: %s"),
disconnect_and_exit(1);
fprintf(stderr,
disconnect_and_exit(1);
fprintf(stderr,
disconnect_and_exit(1);
fprintf(stderr, _("%s: could not read COPY data: %s"),
disconnect_and_exit(1);
fprintf(stderr, _("%s: could not get COPY data stream: %s"),
disconnect_and_exit(1);
fprintf(stderr, _("%s: could not read COPY data: %s"),
disconnect_and_exit(1);
fprintf(stderr, _("%s: invalid tar block header size: %d\n"),
disconnect_and_exit(1);
fprintf(stderr,
disconnect_and_exit(1);
fprintf(stderr,
disconnect_and_exit(1);
fprintf(stderr,
disconnect_and_exit(1);
fprintf(stderr, _("%s: could not create file \"%s\": %s\n"),
disconnect_and_exit(1);
fprintf(stderr, _("%s: could not write to file \"%s\": %s\n"),
disconnect_and_exit(1);
fprintf(stderr,
disconnect_and_exit(1);
fprintf(stderr, _("%s: out of memory\n"), progname);
exit(1);
fprintf(stderr, _("%s: out of memory\n"), progname);
disconnect_and_exit(1);
fprintf(stderr, _("%s: out of memory\n"), progname);
disconnect_and_exit(1);
fprintf(stderr, _("%s: out of memory\n"), progname);
disconnect_and_exit(1);
fprintf(stderr, _("%s: could not create file \"%s\": %s\n"), progname, filename, strerror(errno));
disconnect_and_exit(1);
fprintf(stderr,
disconnect_and_exit(1);
exit(1);
fprintf(stderr, _("%s: incompatible server version %s\n"),
disconnect_and_exit(1);
disconnect_and_exit(1);
disconnect_and_exit(1);
fprintf(stderr, _("%s: could not send replication command \"%s\": %s"),
disconnect_and_exit(1);
fprintf(stderr, _("%s: could not initiate base backup: %s"),
disconnect_and_exit(1);
fprintf(stderr,
disconnect_and_exit(1);
fprintf(stderr, _("%s: could not get backup header: %s"),
disconnect_and_exit(1);
fprintf(stderr, _("%s: no data returned from server\n"), progname);
disconnect_and_exit(1);
fprintf(stderr,
disconnect_and_exit(1);
fprintf(stderr,
disconnect_and_exit(1);
fprintf(stderr,
disconnect_and_exit(1);
fprintf(stderr, _("%s: final receive failed: %s"),
disconnect_and_exit(1);
fprintf(stderr,
disconnect_and_exit(1);
fprintf(stderr, _("%s: could not wait for child process: %s\n"),
disconnect_and_exit(1);
fprintf(stderr, _("%s: child %d died, expected %d\n"),
disconnect_and_exit(1);
if (!WIFEXITED(status))
fprintf(stderr, _("%s: child process did not exit normally\n"),
disconnect_and_exit(1);
if (WEXITSTATUS(status) != 0)
fprintf(stderr, _("%s: child process exited with error %d\n"),
progname, WEXITSTATUS(status));
disconnect_and_exit(1);
fprintf(stderr,
disconnect_and_exit(1);
fprintf(stderr, _("%s: could not wait for child thread: %s\n"),
disconnect_and_exit(1);
fprintf(stderr, _("%s: could not get child thread exit status: %s\n"),
disconnect_and_exit(1);
fprintf(stderr, _("%s: child thread exited with error %u\n"),
disconnect_and_exit(1);
fprintf(stderr,
exit(1);
fprintf(stderr,
exit(1);
fprintf(stderr,
exit(1);
fprintf(stderr,
exit(1);
fprintf(stderr, _("%s: invalid compression level \"%s\"\n"),
exit(1);
fprintf(stderr, _("%s: invalid checkpoint argument \"%s\", must be \"fast\" or \"spread\"\n"),
exit(1);
fprintf(stderr, _("%s: invalid status interval \"%s\"\n"),
exit(1);
fprintf(stderr, _("Try \"%s --help\" for more information.\n"),
exit(1);
fprintf(stderr, _("Try \"%s --help\" for more information.\n"),
exit(1);
fprintf(stderr, _("%s: no target directory specified\n"), progname);
fprintf(stderr, _("Try \"%s --help\" for more information.\n"),
exit(1);
fprintf(stderr, _("Try \"%s --help\" for more information.\n"),
exit(1);
fprintf(stderr, _("Try \"%s --help\" for more information.\n"),
exit(1);
fprintf(stderr, _("Try \"%s --help\" for more information.\n"),
exit(1);
fprintf(stderr, _("%s: transaction log directory location must be "
fprintf(stderr, _("Try \"%s --help\" for more information.\n"),
exit(1);
fprintf(stderr,
exit(1);
fprintf(stderr, _("%s: could not create symbolic link \"%s\": %s\n"),
exit(1);
fprintf(stderr, _("%s: symlinks are not supported on this platform\n"));
exit(1);

>
> thanks, Andrej 
> --------------details:
> environment: CentOS 6.7, postgres 9.5.1
> ( PostgreSQL 9.5.1 on x86_64-pc-linux-gnu, compiled by gcc (GCC) 4.4.7 20120313 (Red Hat 4.4.7-16), 64-bit)
>
> I tried 2 forms of pg_basebackup (-X fetch and -X stream). Both were issued from a script:
> # su - postgres -c "/usr/pgsql-9.5/bin/pg_basebackup -h ${DB_MASTER_IP} -D ${GEO_STDBY_DATA} -U pgreplic -P -v -X fetch" 2>${LOG_FILE}.stderr  >> ${LOG_FILE}
> # echo $?
> 1             <--------------pg_basebackup failed!
> # cat log.stderr
> # cat /var/log/cluster/geo_repair.log.err
> transaction log start point: 0/E3000028 on timeline 1
> WARNING:  skipping special file "./pg_hba.conf"
> WARNING:  skipping special file "./pg_hba.conf.save"
> transaction log end point: 0/E30000F8
> pg_basebackup: base backup completed            <------------------no reason for pg_basebackup failure!
> # cp /tmp/pg_hba.conf /tmp/postgresql.conf /pg_data/
> # su - postgres -c "/usr/pgsql-9.5/bin/pg_ctl -D /pg_data/ start"
> # tail /pg_data/pg_log/postgresql-Fri.log
> `pg_xlog/0000000100000000000000E2' -> `../backups/arc/0000000100000000000000E2'
> 2016-04-15 23:15:10 CEST:pgreplic(at)[unknown]:[10667] WARNING:  skipping special file "./pg_hba.conf"
> 2016-04-15 23:15:10 CEST:pgreplic(at)[unknown]:[10667] WARNING:  skipping special file "./pg_hba.conf.save"         <---------------recorded in pg_log on master node and
> copied by pg_basebackup (note time difference between two servers)
> 2016-04-15 23:15:02 CEST:@:[23321] LOG:  database system was interrupted; last known up at 2016-04-15 23:15:10 CEST
> 2016-04-15 23:15:02 CEST:postgres(at)postgres:[23329] FATAL:  the database system is starting up
> 2016-04-15 23:15:03 CEST:@:[23321] LOG:  entering standby mode
> 2016-04-15 23:15:03 CEST:@:[23321] LOG:  database system was not properly shut down; automatic recovery in progress <---------something missing from pg_basebackup
> 2016-04-15 23:15:03 CEST:@:[23321] LOG:  redo starts at 0/E3000028
> 2016-04-15 23:15:03 CEST:@:[23321] LOG:  consistent recovery state reached at 0/E4000000
> 2016-04-15 23:15:03 CEST:@:[23295] LOG:  database system is ready to accept read only connections
> 2016-04-15 23:15:03 CEST:@:[23356] LOG:  started streaming WAL from primary at 0/E4000000 on timeline 1
> -------second trial
> # su - postgres -c "/usr/pgsql-9.5/bin/pg_basebackup -h ${DB_MASTER_IP} -D ${GEO_STDBY_DATA} -U pgreplic -P -v -X stream" 
> # echo $?
> 1
> #  cat /var/log/cluster/geo_repair.log.err 
> transaction log start point: 0/E5000028 on timeline 1
> pg_basebackup: starting background WAL receiver
> WARNING:  skipping special file "./pg_hba.conf"
> WARNING:  skipping special file "./pg_hba.conf.save"
> transaction log end point: 0/E50000F8
> pg_basebackup: waiting for background process to finish streaming ...
> pg_basebackup: could not wait for child process: No child processes        <----what does this mean? I think it failed to start process to fetching wal logs created
> during backup: but neither on master node neither on pg_basebackup output here is any information about reason.. (max_wal_senders on master is 10: I see no reason to
> fail).
>
> postgres logs:
> `pg_xlog/0000000100000000000000E4' -> `../backups/arc/0000000100000000000000E4'
> 2016-04-15 23:35:09 CEST:pgreplic(at)[unknown]:[29035] WARNING:  skipping special file "./pg_hba.conf"
> 2016-04-15 23:35:09 CEST:pgreplic(at)[unknown]:[29035] WARNING:  skipping special file "./pg_hba.conf.save"
> 2016-04-15 23:35:01 CEST:@:[28926] LOG:  database system was interrupted; last known up at 2016-04-15 23:35:09 CEST
> 2016-04-15 23:35:01 CEST:postgres(at)postgres:[28938] FATAL:  the database system is starting up
> 2016-04-15 23:35:02 CEST:@:[28926] LOG:  entering standby mode
> 2016-04-15 23:35:02 CEST:@:[28926] LOG:  database system was not properly shut down; automatic recovery in progress  <------------this means something missing from
> pg_basebackup
> 2016-04-15 23:35:02 CEST:@:[28926] LOG:  redo starts at 0/E5000028
> 2016-04-15 23:35:02 CEST:@:[28926] LOG:  consistent recovery state reached at 0/E6000000
> 2016-04-15 23:35:02 CEST:@:[28904] LOG:  database system is ready to accept read only connections
> 2016-04-15 23:35:02 CEST:@:[28989] LOG:  started streaming WAL from primary at 0/E6000000 on timeline 1
>
> postgres params on master node:
> log_line_prefix = '%t:%u(at)%d:[%p] '
> logging_collector = on
> wal_buffers = 16MB
> max_wal_size = 200MB
> log_temp_files = 1MB
> max_connections = 170
> shared_buffers = 512MB
> effective_cache_size = 1500MB
> work_mem = 48MB
> log_lock_waits = on
> log_min_duration_statement = 10000
> shared_preload_libraries = 'pg_stat_statements'
> include '/var/lib/pgsql/tmp/rep_mode.conf' # added by pgsql RA
> wal_level = hot_standby
> archive_mode = on
> max_wal_senders = 10
> hot_standby = on
> wal_keep_segments = 128
> archive_command = '/opt/postgres/dbconf/archive_command.sh %p %f'
> wal_receiver_status_interval = 2
> max_standby_streaming_delay = -1
> max_standby_archive_delay = -1
> restart_after_crash = off
> hot_standby_feedback = on
>

--
Jerry Sievers
Postgres DBA/Development Consulting
e: postgres(dot)consulting(at)comcast(dot)net
p: 312.241.7800

In response to

Responses

Browse pgsql-general by date

  From Date Subject
Next Message Melvin Davidson 2016-04-15 23:35:16 $foo $bar is BAD
Previous Message Adrian Klaver 2016-04-15 23:17:23 Re: pg_basebackup: return value 1: reason?