/*-------------------------------------------------------------------------
 * txid.c
 *
 *	Safe handling of transaction IDs.
 *
 *	Copyright (c) 2003-2004, PostgreSQL Global Development Group
 *	Author: Jan Wieck, Afilias USA INC.
 *
 *	64-bit output: Marko Kreen, Skype Technologies
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include <limits.h>

#include "access/xact.h"
#include "access/transam.h"
#include "executor/spi.h"
#include "libpq/pqformat.h"
#include "miscadmin.h"
#include "utils/array.h"
#include "utils/lsyscache.h"
#include "funcapi.h"

/* This module depends on working 64-bit integers; refuse to build without them. */
#ifdef INT64_IS_BUSTED
#error txid needs working int64
#endif

/*
 * Largest positive value of a signed 64-bit integer.  In this file it is the
 * upper bound accepted for xmin/xmax when parsing a snapshot, and the value
 * convert_xid() returns for InvalidTransactionId.
 */
#define MAX_INT64  0x7FFFFFFFFFFFFFFFLL

/* Emit the module magic block only when the macro is available. */
#ifdef PG_MODULE_MAGIC
PG_MODULE_MAGIC;
#endif

/*
 * 64-bit transaction id.  The unsigned variant is used internally; values
 * enter and leave the SQL-callable functions through the INT64 argument and
 * return macros.
 */
typedef uint64 txid;

/*
 * In-memory representation of a txid_snapshot value.
 *
 * The structure is variable-length: it is always allocated as
 * offsetof(txid_snapshot, xip) + nxip * sizeof(txid) bytes, so xip[] really
 * holds nxip entries even though it is declared with one element.
 */
typedef struct
{
	int32		varsz;		/* total size of the value in bytes */
	uint32      nxip;		/* number of entries in xip[] */
	txid xmin;				/* ids below this are reported as in the snapshot */
	txid xmax;				/* ids at or above this are reported as not in it */
	txid xip[1];			/* ids in [xmin, xmax) reported as not in it, ascending */
}	txid_snapshot;

/*
 * Epoch bookkeeping used to widen 32-bit TransactionIds to 64 bits.
 */
struct txid_epoch_state {
	uint64			last_value;	/* xid recorded at the last load/save */
	uint64			epoch;		/* becomes the high 32 bits of a txid */
};

/*
 * In-memory cache of the single row of the txid.epoch table.
 * The struct should be updated only together with the table.
 *
 * A last_value of 0 is what check_epoch() takes to mean "nothing loaded
 * yet", which triggers a load from the table.
 */
static struct txid_epoch_state epoch_state = { 0, 0 };

/*
 * SQL-callable entry points.
 *
 * Every function is registered with PG_FUNCTION_INFO_V1 and prototyped
 * right next to its registration.
 */

/* values derived from the current transaction */
PG_FUNCTION_INFO_V1(txid_current);
Datum		txid_current(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(txid_current_xmin);
Datum		txid_current_xmin(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(txid_current_xmax);
Datum		txid_current_xmax(PG_FUNCTION_ARGS);

/* txid_snapshot type I/O */
PG_FUNCTION_INFO_V1(txid_snapshot_in);
Datum		txid_snapshot_in(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(txid_snapshot_out);
Datum		txid_snapshot_out(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(txid_snapshot_recv);
Datum		txid_snapshot_recv(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(txid_snapshot_send);
Datum		txid_snapshot_send(PG_FUNCTION_ARGS);

/* snapshot membership tests */
PG_FUNCTION_INFO_V1(txid_in_snapshot);
Datum		txid_in_snapshot(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(txid_not_in_snapshot);
Datum		txid_not_in_snapshot(PG_FUNCTION_ARGS);

/* snapshot construction and accessors */
PG_FUNCTION_INFO_V1(txid_current_snapshot);
Datum		txid_current_snapshot(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(txid_snapshot_xmin);
Datum		txid_snapshot_xmin(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(txid_snapshot_xmax);
Datum		txid_snapshot_xmax(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(txid_snapshot_active);
Datum		txid_snapshot_active(PG_FUNCTION_ARGS);

/* text conversions */
PG_FUNCTION_INFO_V1(txid_snapshot_from_text);
Datum		txid_snapshot_from_text(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(txid_snapshot_to_text);
Datum		txid_snapshot_to_text(PG_FUNCTION_ARGS);

/*
 * Widen a 32-bit TransactionId into a 64-bit txid using the cached epoch.
 *
 * InvalidTransactionId is mapped to MAX_INT64; the remaining special ids
 * (those below FirstNormalTransactionId) are returned with their numeric
 * value.  For normal ids the result is (epoch << 32) | xid, where the epoch
 * is the cached one, adjusted by one when xid lies on the other side of the
 * wrap-around point from the cached last_value.
 */
static txid convert_xid(TransactionId xid)
{
	uint64 epoch = epoch_state.epoch;

	/* 0 has a special meaning, so hand back a value that cannot clash */
	if (xid == InvalidTransactionId)
		return MAX_INT64;

	/* other special xids are passed through as-is */
	if (xid < FirstNormalTransactionId)
		return xid;

	/* xid can be on either side of the wrap-around point */
	if (TransactionIdPrecedes(xid, epoch_state.last_value))
	{
		/* precedes last_value yet is numerically larger: previous epoch */
		if (xid > epoch_state.last_value)
			epoch--;
	}
	else if (TransactionIdFollows(xid, epoch_state.last_value)
			 && xid < epoch_state.last_value)
	{
		/* follows last_value yet is numerically smaller: next epoch */
		epoch++;
	}

	return (epoch << 32) | xid;
}

/*
 * Refresh epoch_state from the txid.epoch table.
 *
 * Runs a read-only SPI query that must yield exactly one row with two
 * non-NULL columns (epoch, last_value); anything else raises an ERROR.
 * Always returns 1 when it returns at all.
 */
static int load_epoch(void)
{
	HeapTuple	tuple;
	TupleDesc	tupdesc;
	Datum		datum;
	bool		is_null = false;
	uint64		epoch_from_db;
	uint64		value_from_db;

	if (SPI_connect() < 0)
		elog(ERROR, "cannot connect to SPI");

	if (SPI_execute("select epoch, last_value from txid.epoch", true, 0)
		!= SPI_OK_SELECT)
		elog(ERROR, "load_epoch: select failed?");
	if (SPI_processed != 1)
		elog(ERROR, "load_epoch: there must be exactly 1 row");

	tuple = SPI_tuptable->vals[0];
	tupdesc = SPI_tuptable->tupdesc;

	/* column 1: epoch */
	datum = SPI_getbinval(tuple, tupdesc, 1, &is_null);
	if (is_null)
		elog(ERROR, "load_epoch: epoch is NULL");
	epoch_from_db = DatumGetInt64(datum);

	/* column 2: last_value */
	datum = SPI_getbinval(tuple, tupdesc, 2, &is_null);
	if (is_null)
		elog(ERROR, "load_epoch: last_value is NULL");
	value_from_db = DatumGetInt64(datum);

	SPI_finish();

	/*
	 * If the table holds smaller values than the cache, some updates were
	 * lost.  That is not special-cased at the moment: the table values are
	 * taken unconditionally.
	 */
	epoch_state.epoch = epoch_from_db;
	epoch_state.last_value = value_from_db;

	return 1;
}

/*
 * Store the current top-level xid (and a bumped epoch, if the xid counter
 * wrapped) in the txid.epoch table, then mirror the values in epoch_state.
 *
 * Returns without doing anything when the current xid is numerically below
 * the cached last_value and TransactionIdFollows() does not report it as
 * following that value.  A failing UPDATE is deliberately swallowed: the
 * statement runs in an internal subtransaction that is rolled back on
 * error, and the cached state is then left untouched.
 */
static void save_epoch(void)
{
	int res;
	/*
	 * Query buffer.  The text is about 45 fixed characters plus two numbers
	 * of at most 20 digits each, so it stays well inside 200 bytes.
	 */
	char qbuf[200];
	uint64 new_epoch, new_value;
	TransactionId xid = GetTopTransactionId();
	TransactionId old_value;

	/* store old state, to be restored after the subtransaction ends */
	MemoryContext oldcontext = CurrentMemoryContext;
	ResourceOwner oldowner = CurrentResourceOwner;

	/*
	 * Work on local copies; the cached values are only changed once the
	 * table update has gone through.
	 */
	new_value = xid;
	new_epoch = epoch_state.epoch;
	old_value = (TransactionId)epoch_state.last_value;
	if (xid < old_value) {
		/*
		 * Numerically smaller than the cached value: either the counter
		 * wrapped (bump the epoch) or there is nothing to store.
		 */
		if (TransactionIdFollows(xid, old_value))
			new_epoch++;
		else
			return;
	}
	sprintf(qbuf, "update txid.epoch set epoch = %llu, last_value = %llu",
				(unsigned long long)new_epoch,
				(unsigned long long)new_value);

	/*
	 * The update may fail in case of SERIALIZABLE transaction.
	 * Try to catch the error and hide it.
	 */
	BeginInternalSubTransaction(NULL);
	PG_TRY();
	{
		/* do the update */
		res = SPI_connect();
		if (res < 0)
			elog(ERROR, "cannot connect to SPI");
		res = SPI_execute(qbuf, false, 0);
		SPI_finish();

		ReleaseCurrentSubTransaction();
	}
	PG_CATCH();
	{
		/* we expect rollback to clean up inner SPI call */
		RollbackAndReleaseCurrentSubTransaction();
		FlushErrorState();
		res = -1;  /* remember failure */
	}
	PG_END_TRY();

	/* restore old state */
	MemoryContextSwitchTo(oldcontext);
	CurrentResourceOwner = oldowner;

	/* negative result: caught error or failing SPI_execute; keep the cache */
	if (res < 0)
		return;

	/*
	 * Seems the update was successful, update internal state too.
	 *
	 * There is a chance that the transaction will be rolled back, but then
	 * another backend will do the update, or this one at the next
	 * check.
	 */
	epoch_state.epoch = new_epoch;
	epoch_state.last_value = new_value;
}

/*
 * Keep the cached epoch state reasonably fresh.
 *
 * Loads the state on first use, reloads it once the current xid has moved
 * more than a recheck interval past the cached last_value, and saves a new
 * value if the xid is still that far ahead after the reload.  A non-zero
 * update_prio selects the shorter interval.
 */
static void check_epoch(int update_prio)
{
	TransactionId cur_xid = GetTopTransactionId();
	TransactionId interval;
	TransactionId threshold;
	int			loaded = 1;

	/* should not happen, but just in case */
	if (cur_xid == InvalidTransactionId)
		return;

	/* nothing cached yet (new backend) */
	if (epoch_state.last_value == 0)
		load_epoch();

	/* the interval varies with the pid to try to avoid concurrent access */
	interval = update_prio
		? 50000 + 100 * (MyProcPid & 0x1FF)
		: 300000 + 1000 * (MyProcPid & 0x1FF);

	/* re-read the table once the xid is past the threshold */
	threshold = (TransactionId) epoch_state.last_value + interval;
	if (TransactionIdFollows(cur_xid, threshold))
		loaded = load_epoch();

	/*
	 * Decide whether a save is needed; last_value may have changed in the
	 * reload above, so the threshold is computed again.
	 */
	threshold = (TransactionId) epoch_state.last_value + interval;
	if (loaded == 0 || TransactionIdFollows(cur_xid, threshold))
		save_epoch();
}

/*
 * qsort() comparator for 64-bit unsigned ids: returns -1, 0 or 1 when the
 * first value is smaller than, equal to, or larger than the second.
 */
static int _cmp_txid(const void *aa, const void *bb)
{
	uint64 lhs = *(const uint64 *) aa;
	uint64 rhs = *(const uint64 *) bb;

	return (lhs > rhs) - (lhs < rhs);
}

/*
 * Sort the xip[] entries of a snapshot into ascending order, in place.
 */
static void sort_snapshot(txid_snapshot *snap)
{
	qsort(&snap->xip[0], snap->nxip, sizeof(snap->xip[0]), _cmp_txid);
}

static txid_snapshot *
parse_snapshot(const char *str)
{
	int	a_size;
	txid *xip;

	int			a_used = 0;
	txid		xmin;
	txid		xmax;
	txid		last_val = 0, val;
	txid_snapshot *snap;
	int			size;

	char	   *endp;

	a_size = 1024;
	xip = (txid *) palloc(sizeof(txid) * a_size);

	xmin = (txid) strtoull(str, &endp, 0);
	if (*endp != ':')
		elog(ERROR, "illegal txid_snapshot input format");
	str = endp + 1;

	xmax = (txid) strtoull(str, &endp, 0);
	if (*endp != ':')
		elog(ERROR, "illegal txid_snapshot input format");
	str = endp + 1;

	/* it should look sane */
	if (xmin >= xmax || xmin > MAX_INT64 || xmax > MAX_INT64
			|| xmin == 0 || xmax == 0)
		elog(ERROR, "illegal txid_snapshot input format");

	while (*str != '\0')
	{
		if (a_used >= a_size)
		{
			a_size *= 2;
			xip = (txid *) repalloc(xip, sizeof(txid) * a_size);
		}

		/* read next value */
		if (*str == '\'')
		{
			str++;
			val = (txid) strtoull(str, &endp, 0);
			if (*endp != '\'')
				elog(ERROR, "illegal txid_snapshot input format");
			str = endp + 1;
		}
		else
		{
			val = (txid) strtoull(str, &endp, 0);
			str = endp;
		}

		/* require the input to be in order */
		if (val < xmin || val <= last_val || val >= xmax)
			elog(ERROR, "illegal txid_snapshot input format");
		
		xip[a_used++] = val;
		last_val = val;

		if (*str == ',')
			str++;
		else
		{
			if (*str != '\0')
				elog(ERROR, "illegal txid_snapshot input format");
		}
	}

	size = offsetof(txid_snapshot, xip) + sizeof(txid) * a_used;
	snap = (txid_snapshot *) palloc(size);
	snap->varsz = size;
	snap->xmin = xmin;
	snap->xmax = xmax;
	snap->nxip = a_used;
	if (a_used > 0)
		memcpy(&(snap->xip[0]), xip, sizeof(txid) * a_used);
	pfree(xip);

	return snap;
}
/*
 *		txid_current	- return the current top-level transaction id as txid
 */
Datum
txid_current(PG_FUNCTION_ARGS)
{
	txid		result;

	/* refresh the cached epoch before converting */
	check_epoch(0);

	result = convert_xid(GetTopTransactionId());

	PG_RETURN_INT64(result);
}


/*
 *		txid_current_xmin	- return SerializableSnapshot's xmin as txid
 *
 * Raises an ERROR when SerializableSnapshot is not set.
 */
Datum
txid_current_xmin(PG_FUNCTION_ARGS)
{
	txid		result;

	if (SerializableSnapshot == NULL)
		elog(ERROR, "SerializableSnapshot is NULL in txid_current_xmin()");

	/* refresh the cached epoch before converting */
	check_epoch(0);

	result = convert_xid(SerializableSnapshot->xmin);

	PG_RETURN_INT64(result);
}


/*
 *		txid_current_xmax	- return SerializableSnapshot's xmax as txid
 *
 * Raises an ERROR when SerializableSnapshot is not set.
 */
Datum
txid_current_xmax(PG_FUNCTION_ARGS)
{
	txid		result;

	if (SerializableSnapshot == NULL)
		elog(ERROR, "SerializableSnapshot is NULL in txid_current_xmax()");

	/* refresh the cached epoch before converting */
	check_epoch(0);

	result = convert_xid(SerializableSnapshot->xmax);

	PG_RETURN_INT64(result);
}


/*
 *		txid_snapshot_in	- input function for type txid_snapshot
 *
 * Hands the C-string argument to parse_snapshot() and returns its result.
 */
Datum
txid_snapshot_in(PG_FUNCTION_ARGS)
{
	char	   *input = PG_GETARG_CSTRING(0);

	PG_RETURN_POINTER(parse_snapshot(input));
}

/*
 *		txid_snapshot_out	- output function for type txid_snapshot
 *
 * Produces "xmin:xmax:" followed by the comma-separated xip[] values as a
 * palloc'd C string.
 */
Datum
txid_snapshot_out(PG_FUNCTION_ARGS)
{
	txid_snapshot *snap = (txid_snapshot *) PG_GETARG_VARLENA_P(0);

	/* 60 bytes for the "xmin:xmax:" prefix, 30 for every listed id */
	char	   *out = palloc(60 + snap->nxip * 30);
	char	   *wp;
	int			idx;

	snprintf(out, 60, "%llu:%llu:",
			 (unsigned long long) snap->xmin,
			 (unsigned long long) snap->xmax);
	wp = out + strlen(out);

	for (idx = 0; idx < snap->nxip; idx++)
	{
		/* every entry but the last is followed by a comma */
		const char *sep = (idx < snap->nxip - 1) ? "," : "";

		snprintf(wp, 30, "%llu%s",
				 (unsigned long long) snap->xip[idx], sep);
		wp += strlen(wp);
	}

	PG_RETURN_CSTRING(out);
}

/*
 *		txid_snapshot_from_text	- convert text to txid_snapshot
 *
 * Copies the text payload into a NUL-terminated scratch buffer, since
 * parse_snapshot() works on C strings, and parses that.
 */
Datum
txid_snapshot_from_text(PG_FUNCTION_ARGS)
{
	text	   *input = PG_GETARG_TEXT_P(0);
	int			nbytes = VARSIZE(input) - VARHDRSZ;
	char	   *cstr = palloc(nbytes + 1);
	txid_snapshot *result;

	memcpy(cstr, VARDATA(input), nbytes);
	cstr[nbytes] = 0;

	result = parse_snapshot(cstr);
	pfree(cstr);

	PG_RETURN_POINTER(result);
}

/*
 *		txid_snapshot_to_text	- convert txid_snapshot to text
 *
 * Writes "xmin:xmax:" followed by the comma-separated xip[] values straight
 * into the data area of a palloc'd text value, then sets the text's size to
 * what was actually written.
 */
Datum
txid_snapshot_to_text(PG_FUNCTION_ARGS)
{
	txid_snapshot *snap = (txid_snapshot *) PG_GETARG_VARLENA_P(0);

	/* 60 bytes for the "xmin:xmax:" prefix, 30 for every listed id */
	text	   *result = palloc(VARHDRSZ + 60 + snap->nxip * 30);
	char	   *body = VARDATA(result);
	char	   *wp;
	int			idx;

	snprintf(body, 60, "%llu:%llu:",
			 (unsigned long long) snap->xmin,
			 (unsigned long long) snap->xmax);
	wp = body + strlen(body);

	for (idx = 0; idx < snap->nxip; idx++)
	{
		/* every entry but the last is followed by a comma */
		const char *sep = (idx < snap->nxip - 1) ? "," : "";

		snprintf(wp, 30, "%llu%s",
				 (unsigned long long) snap->xip[idx], sep);
		wp += strlen(wp);
	}

	/* header plus the bytes written; the terminating NUL is not counted */
	VARATT_SIZEP(result) = VARHDRSZ + (wp - body);

	PG_RETURN_TEXT_P(result);
}

/*
 *		txid_snapshot_recv	- binary read
 */
Datum
txid_snapshot_recv(PG_FUNCTION_ARGS)
{
	StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
	txid_snapshot *snap;
	unsigned i, count, size;
	txid val;

	count = pq_getmsgint(buf, 4);
	size = offsetof(txid_snapshot, xip) + sizeof(txid) * count;
	snap = palloc(size);

	snap->varsz = size;
	snap->nxip = count;
	snap->xmin = pq_getmsgint64(buf);
	snap->xmax = pq_getmsgint64(buf);
	val = snap->xmin;
	for (i = 0; i < count; i++) {
		unsigned delta;
		delta = pq_getmsgint(buf, 2);
		if (delta & 0x8000)
			val += ((delta & 0x7FFF) << 16) + pq_getmsgint(buf, 2);
		else
			val += delta;
		snap->xip[i] = val;
	}
	PG_RETURN_POINTER(snap);
}

/*
 *		txid_snapshot_send	- binary storage
 */
Datum
txid_snapshot_send(PG_FUNCTION_ARGS)
{
	int i;
	txid_snapshot *snap = (txid_snapshot *) PG_GETARG_VARLENA_P(0);
	StringInfoData buf;
	txid val;

	pq_begintypsend(&buf);
	pq_sendint(&buf, snap->nxip, 4);
	pq_sendint64(&buf, snap->xmin);
	pq_sendint64(&buf, snap->xmax);
	val = snap->xmin;
	for (i = 0; i < snap->nxip; i++) {
		txid tdelta = snap->xip[i] - val;
		uint32 delta = tdelta;
		if (tdelta >= (1LL << 32))
			elog(ERROR, "overflow in txid_snapshot_send");

		if (delta > 0x7FFF) {
			pq_sendint(&buf, (delta >> 16) | 0x8000, 2);
			pq_sendint(&buf, delta & 0xFFFF, 2);
		} else {
			pq_sendint(&buf, delta, 2);
		}
		val = snap->xip[i];
	}
	PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
}


/*
 * txid_in_snapshot	- is txid visible in snapshot ?
 *
 * True for ids below xmin, false for ids at or above xmax; in between the
 * answer is false exactly when the id is listed in xip[].
 */
Datum
txid_in_snapshot(PG_FUNCTION_ARGS)
{
	txid		probe = PG_GETARG_INT64(0);
	txid_snapshot *snap = (txid_snapshot *) PG_GETARG_VARLENA_P(1);
	int			idx = 0;

	if (probe < snap->xmin)
		PG_RETURN_BOOL(true);

	if (probe >= snap->xmax)
		PG_RETURN_BOOL(false);

	/* inside [xmin, xmax): look for the id among the listed ones */
	while (idx < snap->nxip)
	{
		if (snap->xip[idx] == probe)
			PG_RETURN_BOOL(false);
		idx++;
	}

	PG_RETURN_BOOL(true);
}


/*
 * txid_not_in_snapshot	- is txid invisible in snapshot ?
 *
 * False for ids below xmin, true for ids at or above xmax; in between the
 * answer is true exactly when the id is listed in xip[].
 */
Datum
txid_not_in_snapshot(PG_FUNCTION_ARGS)
{
	txid		probe = PG_GETARG_INT64(0);
	txid_snapshot *snap = (txid_snapshot *) PG_GETARG_VARLENA_P(1);
	int			idx = 0;

	if (probe < snap->xmin)
		PG_RETURN_BOOL(false);

	if (probe >= snap->xmax)
		PG_RETURN_BOOL(true);

	/* inside [xmin, xmax): look for the id among the listed ones */
	while (idx < snap->nxip)
	{
		if (snap->xip[idx] == probe)
			PG_RETURN_BOOL(true);
		idx++;
	}

	PG_RETURN_BOOL(false);
}

/*
 * Build a palloc'd txid_snapshot from SerializableSnapshot.
 *
 * xmin, xmax and every xip entry are widened with convert_xid() after the
 * epoch cache has been checked; the xip[] array is then sorted ascending.
 * Raises an ERROR when SerializableSnapshot is not set.
 */
static txid_snapshot *create_snapshot(void)
{
	txid_snapshot *result;
	unsigned	count;
	unsigned	nbytes;
	unsigned	idx;

	if (SerializableSnapshot == NULL)
		elog(ERROR, "get_current_snapshot: SerializableSnapshot == NULL");

	/* refresh the cached epoch before any conversion */
	check_epoch(1);

	count = SerializableSnapshot->xcnt;
	nbytes = offsetof(txid_snapshot, xip) + sizeof(txid) * count;

	result = palloc(nbytes);
	result->varsz = nbytes;
	result->nxip = count;
	result->xmin = convert_xid(SerializableSnapshot->xmin);
	result->xmax = convert_xid(SerializableSnapshot->xmax);

	idx = 0;
	while (idx < count)
	{
		result->xip[idx] = convert_xid(SerializableSnapshot->xip[idx]);
		idx++;
	}

	/* the list is wanted in guaranteed ascending order */
	sort_snapshot(result);

	return result;
}

/*
 * txid_current_snapshot	-	return current snapshot
 *
 * SQL-callable wrapper around create_snapshot().
 */
Datum
txid_current_snapshot(PG_FUNCTION_ARGS)
{
	PG_RETURN_POINTER(create_snapshot());
}

/*
 * txid_snapshot_xmin	-	return snapshot's xmin
 */
Datum
txid_snapshot_xmin(PG_FUNCTION_ARGS)
{
	txid_snapshot *snap = (txid_snapshot *) PG_GETARG_VARLENA_P(0);

	PG_RETURN_INT64(snap->xmin);
}

/*
 * txid_snapshot_xmin	-	return snapshot's xmax
 */
Datum
txid_snapshot_xmax(PG_FUNCTION_ARGS)
{
	txid_snapshot *snap = (txid_snapshot *) PG_GETARG_VARLENA_P(0);

	PG_RETURN_INT64(snap->xmax);
}

/*
 * State kept between calls of txid_snapshot_active().  It is allocated in
 * one piece together with a private copy of the snapshot, which directly
 * follows this struct in memory.
 */
struct snap_state {
	int pos;				/* index of the next xip[] entry to return */
	txid_snapshot *snap;	/* copy of the snapshot being iterated */
};

/*
 * txid_snapshot_active		- set-returning function that yields the ids
 *							  listed in a snapshot's xip[] array, one per call.
 *
 * Called without arguments it works on a freshly created current snapshot,
 * otherwise on the snapshot passed as first argument.  On the first call
 * the snapshot is copied into the multi-call memory context together with
 * the iteration state.
 */
Datum
txid_snapshot_active(PG_FUNCTION_ARGS)
{
	FuncCallContext *fctx;
	struct snap_state *state;

	if (SRF_IS_FIRSTCALL())
	{
		txid_snapshot *src;
		int			is_local = 0;
		int			nbytes;

		if (PG_NARGS() == 0)
		{
			src = create_snapshot();
			is_local = 1;
		}
		else
			src = (txid_snapshot *) PG_GETARG_VARLENA_P(0);

		fctx = SRF_FIRSTCALL_INIT();

		/* one allocation: the state struct followed by the snapshot copy */
		nbytes = sizeof(*state) + src->varsz;
		state = MemoryContextAlloc(fctx->multi_call_memory_ctx, nbytes);
		state->pos = 0;
		state->snap = (txid_snapshot *) (state + 1);
		memcpy(state->snap, src, src->varsz);
		fctx->user_fctx = state;

		/* a snapshot built here is no longer needed once copied */
		if (is_local)
			pfree(src);
	}

	fctx = SRF_PERCALL_SETUP();
	state = fctx->user_fctx;

	if (state->pos >= state->snap->nxip)
	{
		/* every entry has been handed out */
		SRF_RETURN_DONE(fctx);
	}
	else
	{
		Datum		res = Int64GetDatum(state->snap->xip[state->pos]);

		state->pos++;
		SRF_RETURN_NEXT(fctx, res);
	}
}

