/*-------------------------------------------------------------------------
 *
 * walsender.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <signal.h>
#include <unistd.h>

#include "access/xlog_internal.h"
#include "libpq/libpq.h"
#include "libpq/pqformat.h"
#include "libpq/pqsignal.h"
#include "miscadmin.h"
#include "postmaster/walsender.h"
#include "storage/bufmgr.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/lwlock.h"
#include "storage/pmsignal.h"
#include "storage/proc.h"
#include "storage/smgr.h"
#include "storage/spin.h"
#include "utils/guc.h"
#include "utils/hsearch.h"
#include "utils/memutils.h"
#include "utils/ps_status.h"
#include "utils/resowner.h"

/*
 * Shared memory area for communication between walsender and the other
 * processes (backends and auxiliary processes).
 */
typedef struct
{
	/*
	 * PID of the requesting process.  It is signaled with
	 * PROCSIGNAL_COMPLETE_REPLICATION once the request is satisfied (or
	 * when walsender exits).
	 */
	pid_t	pid;	/* PID of backend, for requesting */
	
	/*
	 * wait_sending selects which position requestLSN is compared against:
	 * true means the requester waits only until walsender has sent xlog up
	 * to requestLSN (LogsndResult.Send); false means it waits until the
	 * standby's reply shows xlog was received up to requestLSN
	 * (LogsndResult.Recv).
	 */
	bool	wait_sending;

	/*
	 * requestLSN is the xlog byte position that must have been sent (or
	 * acknowledged by the standby, see wait_sending) for the request to be
	 * considered complete.
	 */
	XLogRecPtr	requestLSN;
} WalSenderRequest;

/*
 * Header of the walsender shared-memory area, followed by the array of
 * pending requests.  The area is sized by WalSenderShmemSize() so that
 * requests[] really has max_requests entries.
 */
typedef struct
{
	pid_t	walsender_pid;	/* PID of walsender (0 if not started) */

	/* Protected by WalSenderCommLock */
	int			num_requests;	/* current # of requests */
	int			max_requests;	/* allocated array size */
	WalSenderRequest requests[1];	/* VARIABLE LENGTH ARRAY */
} WalSenderShmemStruct;

/* Pointer to the shared-memory area; assigned by WalSenderShmemInit() */
static WalSenderShmemStruct *WalSenderShmem;

/*
 * Global state
 *
 * am_walsender is set to true by DeclareWalSender() once this process has
 * registered itself as the walsender.
 */
bool	am_walsender	= false;

/*
 * User-settable parameters for replication
 *
 * XLogReplicationTimeout: a value <= 0 disables the timeout check done by
 * CheckReplicationTimeout(); otherwise it is compared against the caller's
 * elapsed time (units are whatever the caller uses -- not visible here).
 *
 * WalSenderDelay: maximum nap time of the main loop, in milliseconds (it
 * is multiplied by 1000 to obtain microseconds in WalSenderLoop()).
 */
bool	XLogSyncReplication		= true;
int		XLogReplicationTimeout	= 0;
int		WalSenderDelay 			= 200;	/* max sleep time between some actions */

/*
 * Flags set by interrupt handlers for later service in the main loop.
 *
 * got_SIGHUP:				set on SIGHUP, triggers a config file reload
 * shutdown_requested:		set on SIGTERM, triggers a normal exit
 * replication_requested:	set on SIGUSR1, cuts the main loop's nap short
 */
static volatile sig_atomic_t got_SIGHUP = false;
static volatile sig_atomic_t shutdown_requested = false;
static volatile sig_atomic_t replication_requested = false;

/* Signal handlers */
static void RepSigHupHandler(SIGNAL_ARGS);
static void RepTimeoutHandler(SIGNAL_ARGS);
static void RepShutdownHandler(SIGNAL_ARGS);
static void RepQuickDieHandler(SIGNAL_ARGS);
static void RepRequestHandler(SIGNAL_ARGS);

/* Prototypes for private functions */
static int	WalSenderLoop(void);
static void WalSenderParseInput(void);
static void DeclareWalSender(void);
static void WalSenderShmemCleanup(int, Datum);
static void WalSenderRequestComplete(void);

/*
 * Main entry point for walsender process
 *
 * Registers this process as the walsender, installs the walsender signal
 * handlers, sets up a resource owner, a private memory context and an
 * error-recovery point, sends the current timeline ID to the standby,
 * forces an xlog segment switch and finally enters WalSenderLoop().
 *
 * The return value is whatever WalSenderLoop() returns; that loop is not
 * expected to return.
 */
int
WalSenderMain(void)
{
	sigjmp_buf	local_sigjmp_buf;
	MemoryContext walsender_context;

	/* Declare that I am walsender first */
	DeclareWalSender();

	/*
	 * Set up walsender-exit callback to clean the shmem up.
	 *
	 * This is registered immediately after DeclareWalSender() has published
	 * our PID in shared memory.  If it were registered later, a failure in
	 * the setup steps below (e.g. out of memory while creating the resource
	 * owner or memory context) would terminate this process with
	 * walsender_pid still set, and no new walsender could start.
	 */
	on_shmem_exit(WalSenderShmemCleanup, 0);

	/*
	 * Set up signal handlers and masks again.
	 */
	pqsignal(SIGHUP, RepSigHupHandler); /* set flag to read config file */
	pqsignal(SIGINT, RepTimeoutHandler);		/* detect timeout */
	pqsignal(SIGTERM, RepShutdownHandler);		/* request shutdown */
	pqsignal(SIGQUIT, RepQuickDieHandler);	/* hard crash time */
	pqsignal(SIGALRM, SIG_IGN);
	pqsignal(SIGPIPE, SIG_IGN);
	pqsignal(SIGUSR1, RepRequestHandler); /* request replication */
	pqsignal(SIGUSR2, SIG_IGN); /* not used */

	/*
	 * Reset some signals
	 */
	pqsignal(SIGCHLD, SIG_DFL);
	pqsignal(SIGTTIN, SIG_DFL);
	pqsignal(SIGTTOU, SIG_DFL);
	pqsignal(SIGCONT, SIG_DFL);
	pqsignal(SIGWINCH, SIG_DFL);

	/* We allow SIGQUIT (quickdie) at all times */
#ifdef HAVE_SIGPROCMASK
	sigdelset(&BlockSig, SIGQUIT);
#else
	BlockSig &= ~(sigmask(SIGQUIT));
#endif

	/*
	 * Create a resource owner to keep track of our resources (not clear that
	 * we need this, but may as well have one).
	 */
	CurrentResourceOwner = ResourceOwnerCreate(NULL, "Wal Sender");

	/*
	 * Create a memory context that we will do all our work in.  We do this so
	 * that we can reset the context during error recovery and thereby avoid
	 * possible memory leaks.  Formerly this code just ran in
	 * TopMemoryContext, but resetting that would be a really bad idea.
	 */
	walsender_context = AllocSetContextCreate(TopMemoryContext,
											  "Wal Sender",
											  ALLOCSET_DEFAULT_MINSIZE,
											  ALLOCSET_DEFAULT_INITSIZE,
											  ALLOCSET_DEFAULT_MAXSIZE);
	MemoryContextSwitchTo(walsender_context);

	/*
	 * Save stack context for handling an exception again.
	 *
	 * This code is heavily based on bgwriter.c, q.v.
	 *
	 * NOTE(review): after an error has been recovered here, execution falls
	 * through to the code below, so the timeline message and the xlog
	 * switch are repeated before the main loop is re-entered -- confirm
	 * that this is intended.
	 */
	if (sigsetjmp(local_sigjmp_buf, 1) != 0)
	{
		/* Since not using PG_TRY, must reset error stack by hand */
		error_context_stack = NULL;

		/* Prevent interrupts while cleaning up */
		HOLD_INTERRUPTS();

		/* Report the error to the server log */
		EmitErrorReport();

		/*
		 * These operations are really just a minimal subset of
		 * AbortTransaction().	We don't have very many resources to worry
		 * about in walsender, but we do have LWLocks, and perhaps buffers?
		 */
		LWLockReleaseAll();
		AbortBufferIO();
		UnlockBuffers();
		/* buffer pins are released here: */
		ResourceOwnerRelease(CurrentResourceOwner,
							 RESOURCE_RELEASE_BEFORE_LOCKS,
							 false, true);
		/* we needn't bother with the other ResourceOwnerRelease phases */
		AtEOXact_Buffers(false);
		AtEOXact_Files();
		AtEOXact_HashTables(false);

		/*
		 * Now return to normal top-level context and clear ErrorContext for
		 * next time.
		 */
		MemoryContextSwitchTo(walsender_context);
		FlushErrorState();

		/* Flush any leaked data in the top-level context */
		MemoryContextResetAndDeleteChildren(walsender_context);

		/* Now we can allow interrupts again */
		RESUME_INTERRUPTS();

		/*
		 * Sleep at least 1 second after any error.  A write error is likely
		 * to be repeated, and we don't want to be filling the error logs as
		 * fast as we can.
		 */
		pg_usleep(1000000L);

		/*
		 * Close all open files after any error.  This is helpful on Windows,
		 * where holding deleted files open causes various strange errors.
		 * It's not clear we need it elsewhere, but shouldn't hurt.
		 */
		smgrcloseall();
	}

	/* We can now handle ereport(ERROR) */
	PG_exception_stack = &local_sigjmp_buf;

	/*
	 * Unblock signals (they were blocked when the postmaster forked us)
	 */
	PG_SETMASK(&UnBlockSig);

	/*
	 * Send current timeline ID to the standby
	 */
	{
		StringInfoData buf;

		pq_beginmessage(&buf, 'l');
		pq_sendint(&buf, (int32) ThisTimeLineID, sizeof(int32));
		pq_endmessage(&buf);
		/* Need not flush since xlog will be sent. */
	}

	/*
	 * Switch to a new xlog segment and start replication from the head of
	 * it, because the standby cannot create a complete xlog segment if
	 * replication starts from the middle of a segment.
	 */
	RequestXLogSwitch();

	/*
	 * Report the position where replication starts: the byte following
	 * LogsndResult.Send, wrapping to the next xlogid when the offset is at
	 * XLogFileSize.
	 */
	{
		XLogRecPtr recptr = LogsndResult.Send;

		if (recptr.xrecoff == XLogFileSize)
		{
			recptr.xlogid  += 1;
			recptr.xrecoff  = 0;
		}
		recptr.xrecoff += 1;

		ereport(LOG,
				(errmsg("replication starts at %X/%X",
						recptr.xlogid, recptr.xrecoff)));
	}

	return WalSenderLoop();
}

/*
 * Main loop of walsender
 *
 * Each iteration services pending signal flags, sends xlog, and then naps
 * for up to WalSenderDelay milliseconds.  The nap ends early when a signal
 * flag is set or when data from the standby arrives on the connection; in
 * the latter case the data is parsed and satisfied requests are completed.
 *
 * Never returns normally; the trailing "return 1" only pacifies compilers.
 */
static int
WalSenderLoop(void)
{
	/*
	 * Loop forever
	 */
	for (;;)
	{
		long	udelay;	/* in microseconds */

		/*
		 * Emergency bailout if postmaster has died.  This is to avoid the
		 * necessity for manual cleanup of all postmaster children.
		 */
		if (!PostmasterIsAlive(true))
			exit(1);

		/*
		 * Process any requests or signals received recently.
		 */
		if (got_SIGHUP)
		{
			got_SIGHUP = false;
			ProcessConfigFile(PGC_SIGHUP);
		}
		if (shutdown_requested)
		{
			/* Normal exit from the walsender is here */
			proc_exit(0);		/* done */
		}

		/*
		 * Send xlog periodically.
		 *
		 * The request flag must be cleared BEFORE sending.  If it were
		 * cleared afterwards, a SIGUSR1 arriving while XLogSend() is running
		 * would be wiped out, and the request behind it would not be served
		 * until the nap below expires.  Clearing first means such a signal
		 * leaves the flag set, so the nap is skipped and we send again at
		 * once.
		 */
		replication_requested = false;
		if (XLogSend())
			WalSenderRequestComplete();

		/*
		 * Nap for the configured time or until a request arrives.
		 *
		 * On some platforms, signals won't interrupt the sleep.  To ensure we
		 * respond reasonably promptly when someone signals us, break down the
		 * sleep into 1-second increments, and check for interrupts after each
		 * nap.
		 */
		udelay = WalSenderDelay * 1000L;
		while (udelay > 0)
		{
			int		selres;
			fd_set	rmask;
			struct timeval timeout;

			if (got_SIGHUP || shutdown_requested || replication_requested)
				break;

			FD_ZERO(&rmask);
			FD_SET(MyProcPort->sock, &rmask);

			if (udelay < 1000000L)
			{
				timeout.tv_sec	= 0;
				timeout.tv_usec	= udelay;
			}
			else
			{
				timeout.tv_sec	= 1;
				timeout.tv_usec	= 0;
			}

			selres = select(MyProcPort->sock + 1, &rmask, NULL, NULL, &timeout);

			if (selres < 0)
			{
				/* An interrupted wait is fine; anything else is fatal */
				if (errno != EINTR && errno != EWOULDBLOCK)
					ereport(FATAL,
							(errcode_for_socket_access(),
							 errmsg("select() failed in walsender: %m")));
			}

			/* rmask is only meaningful when select() reported readiness */
			if (selres > 0)
			{
				if (FD_ISSET(MyProcPort->sock, &rmask))
				{
					/*
					 * Load some bytes into the input buffer from the
					 * connection since select(2) ensures that some data has
					 * arrived.
					 */
					if (pq_recvbuf() == EOF)			/* standby disconnected */
						ereport(FATAL,
								(errcode(ERRCODE_PROTOCOL_VIOLATION),
								 errmsg("unexpected EOF on replication connection")));

					/* Parse any available data */
					WalSenderParseInput();
					WalSenderRequestComplete();
				}
				break;
			}

			udelay -= 1000000L;
		}
	}

	/* can't get here because the above loop never exits */
	return 1;
}

/*
 * Parse input data until input is exhausted.
 *
 * Handles two message types from the standby:
 *	'r'	- reply carrying the xlog position the standby has received;
 *		  the position is passed to UpdateLogsndStatus()
 *	'X'	- termination request; walsender exits normally
 * Any other type, or an 'r' message with an unexpected length, is a
 * protocol violation and terminates walsender with FATAL.
 *
 * NOTE: this function will NOT attempt to read more data from the standby
 */
static void
WalSenderParseInput(void)
{
	/*
	 * qtype must be an int, not a char: EOF is a negative int.  With a
	 * plain char, EOF is never detected where char is unsigned, and a
	 * legitimate 0xFF byte is mistaken for EOF where char is signed.
	 * (Assumes pq_peekbufbyte() returns int, as its EOF return suggests.)
	 */
	int		qtype;
	int32	len;

	/*
	 * Loop to parse successive complete messages available in the buffer.
	 */
	for (;;)
	{
		/*
		 * If we cannot read both message type and body at the same time,
		 * we give up getting them this time. Leave them on the buffer
		 * till next time.
		 */
		qtype = pq_peekbufbyte();
		if (qtype == EOF)
			return;		/* exhausted */

		switch (qtype)
		{
			case 'r':
			{
				StringInfoData buf;
				XLogRecPtr  recptr;

				initStringInfo(&buf);

				/*
				 * The whole message is 13 bytes: 1 type byte, a 4-byte
				 * length (which counts itself and the payload, i.e. 12)
				 * and two 4-byte halves of the xlog position.
				 *
				 * It's assumed that buffer size is larger than message
				 * length. Otherwise, we can never get complete message.
				 */
				if (pq_getbufbytes((char *) buf.data, 13) == EOF)
				{
					/* incomplete message: don't leak the scratch buffer */
					pfree(buf.data);
					return;		/* exhausted */
				}

				buf.len = 13;
				buf.data[buf.len] = '\0';

				qtype	= pq_getmsgbyte(&buf);
				len		= pq_getmsgint(&buf, 4);
				if (len != 12)	/* validate message length */
					ereport(FATAL,
							(errcode(ERRCODE_PROTOCOL_VIOLATION),
							 errmsg("invalid standby message length %d, type %d",
									len, qtype)));
				/*
				 * Update XLogCtl->LogsndResult.Recv to received LSN
				 */
				recptr.xlogid	= pq_getmsgint(&buf, 4);
				recptr.xrecoff	= pq_getmsgint(&buf, 4);
				UpdateLogsndStatus(recptr);

				pfree(buf.data);
				buf.data = NULL;

				break;
			}

			case 'X':
				/* Normal exit from the walsender */
				proc_exit(0);

			default:
				ereport(FATAL,
						(errcode(ERRCODE_PROTOCOL_VIOLATION),
						 errmsg("invalid standby message type %d",
								qtype)));
		}
	}
}


/* --------------------------------
 *		routines for mimic
 * --------------------------------
 */

/*
 * Declare that I am walsender, tell it to postmaster.
 *
 * Called at the start of WalSenderMain().  Fails with FATAL if another
 * walsender is already registered in shared memory.  Otherwise it
 * initializes the xlog send positions, publishes our PID, updates the ps
 * display, sets am_walsender and notifies the postmaster.
 */
static void
DeclareWalSender(void)
{
	char remote_ps_data[NI_MAXHOST];
	XLogRecPtr recptr;

	/*
	 * NOTE(review): this check and the assignment of walsender_pid below are
	 * not done under WalSenderCommLock, so two processes declaring
	 * themselves at the same time could both pass the check -- confirm
	 * whether callers guarantee this cannot happen.
	 */
	if (WalSenderShmem->walsender_pid != 0)
		ereport(FATAL,
				(errmsg("sorry, too many walsenders already"),
				 errdetail("walsender (pid = %d) is already started",
						 (int) WalSenderShmem->walsender_pid)));

	/*
	 * Setting up walsender_pid triggers backends to start requesting
	 * replication. So, we initialize the xlog positions for replication
	 * to the max number, ignore those requests till walsender switches
	 * xlog segments.
	 */
	recptr.xlogid = recptr.xrecoff = 0xFFFFFFFF;
	InitXLogSend(recptr);

	WalSenderShmem->walsender_pid = MyProcPid;

	/* identify myself via ps */
	snprintf(remote_ps_data, sizeof(remote_ps_data),
			 MyProcPort->remote_port[0] == '\0' ? "%s" : "%s(%s)",
			 MyProcPort->remote_host, MyProcPort->remote_port);
	init_ps_display("wal sender process", MyProcPort->user_name,
					remote_ps_data, "");

	am_walsender = true;

	SendPostmasterSignal(PMSIGNAL_DECLARE_WALSENDER);
}


/* --------------------------------
 *		signal handler routines
 * --------------------------------
 */

/*
 * SIGHUP: set flag to re-read config file at next convenient time
 *
 * The flag is consumed by WalSenderLoop(), which then calls
 * ProcessConfigFile(PGC_SIGHUP).
 */
static void
RepSigHupHandler(SIGNAL_ARGS)
{
	got_SIGHUP = true;
}

/*
 * SIGINT: terminate walsender due to replication timeout
 *
 * SIGINT is sent by CheckReplicationTimeout() when the timeout has expired.
 * The error detail reports the received (Recv) and sent (Send) xlog
 * positions from LogsndResult.
 *
 * NOTE(review): ereport(FATAL) runs directly inside the signal handler and
 * can therefore interrupt the main loop at an arbitrary point; consider
 * setting a flag here and raising the error from WalSenderLoop() instead.
 */
static void
RepTimeoutHandler(SIGNAL_ARGS)
{
	ereport(FATAL,
			(errmsg("terminating walsender due to replication timeout"),
			 errdetail("xlog has been replicated to %X/%X, possibly %X/%X",
					   LogsndResult.Recv.xlogid, LogsndResult.Recv.xrecoff,
					   LogsndResult.Send.xlogid, LogsndResult.Send.xrecoff)));
}

/*
 * SIGTERM: set flag to exit normally
 *
 * The flag is consumed by WalSenderLoop(), which then calls proc_exit(0).
 */
static void
RepShutdownHandler(SIGNAL_ARGS)
{
	shutdown_requested = true;
}

/*
 * SIGQUIT: exit quickly
 *
 * Some backend has bought the farm,
 * so we need to stop what we're doing and exit.
 */
static void
RepQuickDieHandler(SIGNAL_ARGS)
{
	PG_SETMASK(&BlockSig);

	/*
	 * DO NOT proc_exit() -- we're here because shared memory may be
	 * corrupted, so we don't want to try to clean up our transaction. Just
	 * nail the windows shut and get out of town.
	 *
	 * Use _exit() rather than exit(): exit() runs atexit handlers and
	 * flushes stdio buffers, which is neither async-signal-safe nor
	 * consistent with the "no cleanup" intent above, whereas _exit()
	 * terminates the process immediately.
	 *
	 * Note we do _exit(2) not _exit(0).  This is to force the postmaster
	 * into a system reset cycle if some idiot DBA sends a manual SIGQUIT to
	 * a random backend.  This is necessary precisely because we don't clean
	 * up our shared memory state.
	 */
	_exit(2);
}

/*
 * SIGUSR1: set flag to do replication
 *
 * SIGUSR1 is sent by WalSenderWakeup().  The flag makes WalSenderLoop()
 * cut its nap short and send xlog again.
 */
static void
RepRequestHandler(SIGNAL_ARGS)
{
	replication_requested = true;
}


/* --------------------------------
 *		communication with the other processes
 * --------------------------------
 */

/*
 * Compute space needed for walsender-related shared memory
 *
 * The result is the fixed header plus one WalSenderRequest slot for every
 * auxiliary process and every backend (MaxBackends, which includes
 * autovacuum).
 */
Size
WalSenderShmemSize(void)
{
	/* fixed-size header, up to the start of the request array */
	Size total = offsetof(WalSenderShmemStruct, requests);

	/* one slot per auxiliary process */
	total = add_size(total,
					 mul_size(NUM_AUXILIARY_PROCS, sizeof(WalSenderRequest)));

	/* one slot per backend, including autovacuum */
	total = add_size(total,
					 mul_size(MaxBackends, sizeof(WalSenderRequest)));

	return total;
}

/*
 * Allocate and initialize walsender-related shared memory
 *
 * Attaches to (or creates) the "Wal Sender Data" area.  Only the process
 * that actually creates the area zeroes its header and records the
 * capacity of the request array.
 */
void
WalSenderShmemInit(void)
{
	bool already_exists;

	WalSenderShmem = (WalSenderShmemStruct *)
		ShmemInitStruct("Wal Sender Data",
						WalSenderShmemSize(),
						&already_exists);

	if (WalSenderShmem == NULL)
		ereport(FATAL,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("not enough shared memory for wal sender")));

	if (!already_exists)
	{
		/* first time through: start with an empty request table */
		MemSet(WalSenderShmem, 0, sizeof(WalSenderShmemStruct));
		WalSenderShmem->max_requests = NUM_AUXILIARY_PROCS + MaxBackends;
	}
}

/*
 * Clean the shmem up.
 *
 * This is called during walsender shutdown (it is registered with
 * on_shmem_exit).  It marks walsender as not running and releases every
 * process still waiting on a request by signaling it, emptying the request
 * table from the end.
 */
static void
WalSenderShmemCleanup(int code, Datum arg)
{
	WalSenderShmemStruct *shm = WalSenderShmem;

	LWLockAcquire(WalSenderCommLock, LW_EXCLUSIVE);

	shm->walsender_pid = 0;

	while (shm->num_requests > 0)
	{
		int		last;

		/* drop the last entry, then wake its owner */
		shm->num_requests--;
		last = shm->num_requests;
		SendProcSignal(shm->requests[last].pid,
					   PROCSIGNAL_COMPLETE_REPLICATION);
	}

	LWLockRelease(WalSenderCommLock);
}

/*
 * Add the specified request to the shmem.
 *
 * pid is the process to signal on completion; wait_sending and requestLSN
 * describe the condition it waits for (see WalSenderRequest).
 *
 * Returns false if walsender is already dead (i.e. we don't need to add
 * new request any longer), true otherwise.  Fails with FATAL if the
 * request table is full.
 */
bool
WalSenderRequestAdd(pid_t pid, bool wait_sending, XLogRecPtr requestLSN)
{
	WalSenderShmemStruct *shm = WalSenderShmem;
	WalSenderRequest *slot;
	bool		walsender_running;

	LWLockAcquire(WalSenderCommLock, LW_EXCLUSIVE);

	walsender_running = (shm->walsender_pid != 0);

	if (walsender_running && shm->num_requests < shm->max_requests)
	{
		/* normal case: append the request at the end of the table */
		slot = &shm->requests[shm->num_requests];
		shm->num_requests++;

		slot->pid = pid;
		slot->wait_sending = wait_sending;
		slot->requestLSN = requestLSN;

		LWLockRelease(WalSenderCommLock);
		return true;
	}

	LWLockRelease(WalSenderCommLock);

	if (walsender_running)
	{
		/*
		 * No room for new request. (This really shouldn't happen, since
		 * there is a fixed supply of requests too, and so we should have
		 * failed earlier.)
		 */
		ereport(FATAL,
				(errcode(ERRCODE_TOO_MANY_CONNECTIONS),
				 errmsg("sorry, too many requests from backends already")));
	}

	/* walsender is gone; nothing to wait for */
	return false;
}

/*
 * Remove the request for a given pid from the shmem.
 *
 * Only the first matching entry is removed; the hole is filled with the
 * last entry of the table, so the order of requests is not preserved.
 * Does nothing if no entry matches.
 */
void
WalSenderRequestRemove(pid_t pid)
{
	WalSenderShmemStruct *shm = WalSenderShmem;
	int pos;

	LWLockAcquire(WalSenderCommLock, LW_EXCLUSIVE);

	/* locate the first entry owned by pid */
	pos = 0;
	while (pos < shm->num_requests && shm->requests[pos].pid != pid)
		pos++;

	if (pos < shm->num_requests)
	{
		/* shrink the table and move its last entry into the hole */
		shm->num_requests--;
		shm->requests[pos] = shm->requests[shm->num_requests];
	}

	LWLockRelease(WalSenderCommLock);
}

/*
 * Complete the request whose conditions were satisfied
 *
 * Walks the request table once.  A request is satisfied when its
 * requestLSN is <= LogsndResult.Send (if wait_sending) or <=
 * LogsndResult.Recv (otherwise); its owner is then signaled and the entry
 * is dropped.  Unsatisfied requests are compacted towards the front of the
 * table, preserving their order.
 */
static void
WalSenderRequestComplete(void)
{
	WalSenderShmemStruct *wss = WalSenderShmem;
	int curridx;	/* entry being examined */
	int freeidx;	/* next slot to keep an unsatisfied entry in */

	LWLockAcquire(WalSenderCommLock, LW_EXCLUSIVE);

	for (curridx = 0, freeidx = 0;
		 curridx < wss->num_requests; curridx++)
	{
		WalSenderRequest *request = &wss->requests[curridx];

		if (XLByteLE(request->requestLSN, request->wait_sending ?
					 LogsndResult.Send : LogsndResult.Recv))
		{
			/* satisfied: wake the owner and drop the entry */
			SendProcSignal(request->pid, PROCSIGNAL_COMPLETE_REPLICATION);
			continue;
		}

		/*
		 * Unsatisfied: keep it.  The slot counter must advance for EVERY
		 * kept entry, including one that is already in its final position
		 * (curridx == freeidx); otherwise such entries would be overwritten
		 * or cut off when num_requests is updated below.
		 */
		if (curridx != freeidx)
			wss->requests[freeidx] = *request;
		freeidx++;
	}
	wss->num_requests = freeidx;

	LWLockRelease(WalSenderCommLock);
}


/* --------------------------------
 *		common routines
 * --------------------------------
 */

/*
 * Get PID of walsender
 *
 * Returns the PID recorded in shared memory, or 0 if no walsender is
 * registered.  The value is read without taking WalSenderCommLock.
 */
pid_t
GetWalSenderPid(void)
{
	pid_t	current_pid = WalSenderShmem->walsender_pid;

	return current_pid;
}

/*
 * Check whether walsender is still alive
 */
bool
WalSenderIsAlive(void)
{
	return (WalSenderShmem->walsender_pid != 0);
}

/*
 * Wake up walsender by signaling
 *
 * Sends SIGUSR1 to the registered walsender, if any.
 */
void
WalSenderWakeup(void)
{
	/*
	 * Read the PID exactly once.  walsender may clear walsender_pid
	 * concurrently (see WalSenderShmemCleanup); if the field were read again
	 * for the kill() call it could have become 0 after the check, and
	 * kill(0, ...) signals every process in our process group.
	 */
	pid_t	walsender_pid = *(volatile pid_t *) &WalSenderShmem->walsender_pid;

	if (walsender_pid != 0)
		kill(walsender_pid, SIGUSR1);
}

/*
 * Check for replication timeout. If the timeout has come,
 * we send a timeout-interrupt to walsender.
 *
 * elapsed is the time waited so far, in the same units as
 * XLogReplicationTimeout.  A timeout setting <= 0 disables the check.
 *
 * Returns true if the timeout has come, false otherwise.
 */
bool
CheckReplicationTimeout(int elapsed)
{
	pid_t	walsender_pid;

	if (XLogReplicationTimeout <= 0)
		return false;			/* timeout disabled */

	if (elapsed < XLogReplicationTimeout)
		return false;			/* not expired yet */

	/*
	 * Read the PID exactly once.  walsender may clear walsender_pid
	 * concurrently (see WalSenderShmemCleanup); if the field were read again
	 * for the kill() call it could have become 0 after the check, and
	 * kill(0, SIGINT) would interrupt every process in our process group.
	 */
	walsender_pid = *(volatile pid_t *) &WalSenderShmem->walsender_pid;
	if (walsender_pid != 0)
		kill(walsender_pid, SIGINT);

	return true;
}
