/*-------------------------------------------------------------------------
 *
 * nodeWindow.c
 *    TODO: comment
 *
 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * The Window node evaluates only window aggregate (WinAggref) expressions.
 * Since WinAggref is conceptually derived from the Aggref node, the Window
 * node resembles the Agg node.  Unlike Agg, the Window node considers two
 * row sets, the Partition and the Frame.  Also in contrast to Agg, Window
 * has two sets of key columns, Partition and Order; the Order keys are what
 * the ranking functions compare.
 *
 * Currently the Window node assumes its input is already sorted
 * appropriately, so it performs no sorting of its own.
 *
 * A window function is defined as a function that may or may not have a
 * transition function and that has a *volatile* final function.  The Window
 * node calls a volatile final function once per tuple and returns its result
 * as the result for the current row.  A window aggregate is exactly the same
 * as a group aggregate.  Since the Window node returns multiple rows for a
 * group (cf. a frame in the Window node), it caches the aggregate's result to
 * avoid calling the aggregate's final function more than once.  If there are
 * only window functions and no window aggregates, the node skips the frame
 * rescan that feeds the transition functions and performs only
 * initialization and finalization.  Because a frame rescan is expensive,
 * this optimization is critical for large frames.
 *
 * IDENTIFICATION
 *	  $Id$
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "catalog/pg_aggregate.h"
#include "catalog/pg_proc.h"
#include "catalog/pg_type.h"
#include "executor/executor.h"
#include "executor/nodeWindow.h"
#include "miscadmin.h"
#include "optimizer/clauses.h"
#include "parser/parse_agg.h"
#include "parser/parse_coerce.h"
#include "parser/parse_expr.h"
#include "parser/parse_oper.h"
#include "utils/acl.h"
#include "utils/builtins.h"
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/syscache.h"
#include "utils/tuplestore.h"
#include "utils/datum.h"


/*
 * WindowStatePerAggData -
 *
 * Working state kept by a Window node for each distinct WinAggref it
 * evaluates.  One entry of WindowState->peragg[] has this layout.
 */
typedef struct WindowStatePerAggData
{
	/* Links to WinAggref expr and state nodes this working state is for */
	WinAggrefExprState *winaggstate;
	WinAggref	   *winagg;

	/* number of input arguments for aggregate */
	int			numArguments;

	/* Oids of the transition and final functions */
	Oid			transfn_oid;	/* may be InvalidOid (pure window function) */
	Oid			finalfn_oid;	/* may be InvalidOid */

	/*
	 * fmgr lookup data for the transition and final functions --- only valid
	 * when the corresponding oid is not InvalidOid.  Note in particular that
	 * fn_strict flags are kept here.
	 */
	FmgrInfo	transfn;
	FmgrInfo	finalfn;

	/*
	 * true if the final function is VOLATILE; such a function is called for
	 * every row and its result is never cached in prevValue.
	 */
	bool		finalfn_volatile;		/* volatility of final function */

	/*
	 * initial value from pg_aggregate entry
	 */
	Datum		initValue;
	bool		initValueIsNull;

	/*
	 * We need the len and byval info for the agg's input, result, and
	 * transition data types in order to know how to copy/delete values.
	 *
	 * NOTE(review): only the result and transition type fields are filled in
	 * by ExecInitWindow; inputtypeLen/inputtypeByVal appear to be unused.
	 */
	int16		inputtypeLen,
				resulttypeLen,
				transtypeLen;
	bool		inputtypeByVal,
				resulttypeByVal,
				transtypeByVal;
	/* DISTINCT argument is not supported for window function so far */
	
	/*
	 * PerGroupData is here, included in PerAggData, since we do not 
	 * support hash strategy so far.
	 */
	Datum		transValue;		/* current transition value */
	bool		transValueIsNull;

	bool		noTransValue;	/* true if transValue not set yet */

	/*
	 * Cached final result of a non-volatile final function for the current
	 * frame.  process_frame() stores a copy here the first time the result is
	 * computed and hands out copies of it for the remaining rows.
	 */
	Datum		prevValue;		/* cached result (valid if prevCached) */
	bool		prevValueIsNull;	/* cached result is NULL */
	bool		prevCached;		/* true once prevValue/prevValueIsNull are set */

	/*
	 * Per-frame private state of the final function: loaded into
	 * finalfn.fn_extra before each call and saved back from it afterwards by
	 * win_finalize_aggregate(); reset to NULL when a new frame starts.
	 */
	void	   *frameContext;
} WindowStatePerAggData;

/*
 * ranking process information
 *
 * Private state of the ranking final functions, kept in the final
 * function's fn_extra (see rank_up() and rank_final()).
 */
typedef struct
{
	int64		rank;		/* depending on what type of rank is requested */
	int64		rowcount;	/* total seen row numbers */
	HeapTuple	heaptuple;	/* current rank tuple usually; NULL before the
							 * first call (the struct is zero-allocated) */
} rank_context;

/*
 * ntile process information
 *
 * NOTE(review): this struct is not referenced in the portion of the file
 * reviewed here; the field descriptions below are taken from the names and
 * the original comments and should be confirmed against the ntile code.
 */
typedef struct
{
	int64		ntile;		/* current result */
	int64		nrows;		/* row number of current bucket */
	int64		boundary;	/* presumably the row count at which the bucket
							 * changes -- TODO confirm */
	int64		remainder;	/* presumably rows left over after even division
							 * into buckets -- TODO confirm */
} ntile_context;

/* aggregate (transition/final function) machinery */
static void win_initialize_aggregates(WindowState *winstate);
static void win_advance_transition_function(WindowState *winstate,
								WindowStatePerAgg peraggstate,
								FunctionCallInfoData *fcinfo);
static void win_advance_aggregates(WindowState *winstate);
static void win_finalize_aggregate(WindowState *winstate,
					   WindowStatePerAgg peraggstate,
					   Datum *resultVal,
					   bool *resultIsNull);

/* partition and frame handling */
static void store_partition(WindowState *winstate);
static TupleTableSlot *process_frame(WindowState *winstate);

/* helpers for the window final functions */
static WindowState *getWindowState(FunctionCallInfo fcinfo);
static bool rank_up(FunctionCallInfo fcinfo);

/*
 * frame_release_cached_results -
 *
 * Free the final-function results cached for the frame that is ending and
 * mark every cache entry empty.
 *
 * process_frame() stores the cached copy in winstate->wincontext, which is
 * only reset on rescan, so without an explicit pfree a pass-by-reference
 * result would pile up once per partition.  The routine is idempotent:
 * prevCached is cleared, so a second call finds nothing to free.
 */
static void
frame_release_cached_results(WindowState *winstate)
{
	int			i;

	for (i = 0; i < winstate->numaggs; i++)
	{
		WindowStatePerAgg peraggstate = &winstate->peragg[i];

		/* by-value and NULL results own no separately allocated memory */
		if (peraggstate->prevCached &&
			!peraggstate->prevValueIsNull &&
			!peraggstate->resulttypeByVal)
			pfree(DatumGetPointer(peraggstate->prevValue));

		peraggstate->prevValue = (Datum) 0;
		peraggstate->prevValueIsNull = true;
		peraggstate->prevCached = false;
	}
}

/*
 * frame_start -
 *
 * Initialize frame information.  Currently a partition and a frame mean
 * almost the same thing, but they are conceptually different.
 *
 * Returns true if a new frame was started, false if one was already being
 * processed (in which case nothing is changed).
 */
static bool
frame_start(WindowState *winstate)
{
	int			i;

	if (winstate->frame_processing)
		return false;

	/*
	 * Drop whatever the previous frame cached.  frame_finish() normally has
	 * done this already; repeating it is harmless.
	 */
	frame_release_cached_results(winstate);

	/*
	 * Final functions start every frame without private state.  Whatever
	 * frameContext pointed to is not freed here: it was allocated by the
	 * final function itself and its layout is unknown at this level.
	 */
	for (i = 0; i < winstate->numaggs; i++)
		winstate->peragg[i].frameContext = NULL;

	winstate->frame_processing = true;
	return true;
}

/*
 * frame_finish -
 *
 * Finish the current frame (and partition): release the results cached for
 * it and clear the processing flags so that the next call stores a new
 * partition and starts a new frame.
 *
 * ExecReScanWindow() calls this before it resets wincontext, so the cached
 * values are freed while their memory is still valid and no dangling
 * pointer survives the reset.
 */
static void
frame_finish(WindowState *winstate)
{
	frame_release_cached_results(winstate);

	winstate->partition_processing = false;
	winstate->frame_processing = false;
}


/*
 * win_initialize_aggregates -
 *
 * Put every per-function working state back to its starting point for a new
 * frame: discard the old transition value and install the initial value
 * taken from pg_aggregate (or "no value yet" if there is none).
 */
static void
win_initialize_aggregates(WindowState *winstate)
{
	int			nfuncs = winstate->numaggs;
	int			idx = 0;

	while (idx < nfuncs)
	{
		WindowStatePerAgg state = &winstate->peragg[idx++];
		bool		noInitValue = state->initValueIsNull;

		/*
		 * A pass-by-reference transition value left over from the previous
		 * frame has to be released, or it would leak.  Both the null flag
		 * and the pointer itself are checked.
		 */
		if (!(state->transtypeByVal ||
			  state->transValueIsNull ||
			  DatumGetPointer(state->transValue) == NULL))
			pfree(DatumGetPointer(state->transValue));

		if (noInitValue)
		{
			/* nothing to copy; just carry over the (null) datum */
			state->transValue = state->initValue;
		}
		else
		{
			/*
			 * The transition value gets pfree'd later on, so it must be a
			 * private copy living in wincontext.
			 */
			MemoryContext saved = MemoryContextSwitchTo(winstate->wincontext);

			state->transValue = datumCopy(state->initValue,
										  state->transtypeByVal,
										  state->transtypeLen);
			MemoryContextSwitchTo(saved);
		}

		/*
		 * Without an initial value, the first non-NULL input is to become
		 * the transition value (useful for max() and min()); noTransValue
		 * records that this is still pending.
		 */
		state->transValueIsNull = noInitValue;
		state->noTransValue = noInitValue;
	}
}

/*
 * win_advance_transition_function -
 *
 * Feed one input row to a transition function.  The caller has already
 * placed the evaluated arguments in fcinfo->arg[1..numArguments]; slot 0 is
 * filled here with the current transition value.  Modelled on
 * advance_transition_function() in nodeAgg.c.
 */
static void
win_advance_transition_function(WindowState *winstate,
								WindowStatePerAgg peraggstate,
								FunctionCallInfoData *fcinfo)
{
	int			nargs = peraggstate->numArguments;
	MemoryContext callerContext;
	Datum		result;

	if (peraggstate->transfn.fn_strict)
	{
		int			argno = 1;

		/* A strict transfn ignores any row that has a NULL input. */
		while (argno <= nargs)
		{
			if (fcinfo->argnull[argno])
				return;
			argno++;
		}

		if (peraggstate->noTransValue)
		{
			/*
			 * First non-NULL input and no initial value: the input itself
			 * becomes the transition value.  (Input type and transition type
			 * were checked for binary compatibility at node startup.)  The
			 * copy must live in wincontext; there is no old value to free
			 * because it was NULL.
			 */
			callerContext = MemoryContextSwitchTo(winstate->wincontext);
			peraggstate->transValue = datumCopy(fcinfo->arg[1],
												peraggstate->transtypeByVal,
												peraggstate->transtypeLen);
			MemoryContextSwitchTo(callerContext);

			peraggstate->transValueIsNull = false;
			peraggstate->noTransValue = false;
			return;
		}

		/*
		 * A strict transfn that returned NULL earlier must not be called
		 * again; the NULL is carried through to the end.
		 */
		if (peraggstate->transValueIsNull)
			return;
	}

	/* The transition function itself runs in per-input-tuple memory. */
	callerContext =
		MemoryContextSwitchTo(winstate->tmpcontext->ecxt_per_tuple_memory);

	InitFunctionCallInfoData(*fcinfo, &(peraggstate->transfn),
							 nargs + 1,
							 (void *) winstate, NULL);
	fcinfo->arg[0] = peraggstate->transValue;
	fcinfo->argnull[0] = peraggstate->transValueIsNull;

	result = FunctionCallInvoke(fcinfo);

	/*
	 * For a pass-by-reference type, keep a copy of the new value in
	 * wincontext and release the previous one -- unless the function simply
	 * handed back its first argument, in which case nothing changes.
	 */
	if (!peraggstate->transtypeByVal &&
		DatumGetPointer(result) != DatumGetPointer(peraggstate->transValue))
	{
		if (!fcinfo->isnull)
		{
			MemoryContextSwitchTo(winstate->wincontext);
			result = datumCopy(result,
							   peraggstate->transtypeByVal,
							   peraggstate->transtypeLen);
		}
		if (!peraggstate->transValueIsNull)
			pfree(DatumGetPointer(peraggstate->transValue));
	}

	peraggstate->transValueIsNull = fcinfo->isnull;
	peraggstate->transValue = result;

	MemoryContextSwitchTo(callerContext);
}

/*
 * win_advance_aggregates -
 *
 * Advance every function that has a transition function by one input row.
 * The row is the one currently referenced by winstate->tmpcontext.
 */
static void
win_advance_aggregates(WindowState *winstate)
{
	ExprContext *inputcxt = winstate->tmpcontext;
	int			nfuncs = winstate->numaggs;
	int			idx;

	for (idx = 0; idx < nfuncs; idx++)
	{
		WindowStatePerAgg state = &winstate->peragg[idx];
		FunctionCallInfoData callinfo;
		MemoryContext saved;
		ListCell   *cell;
		int			argpos;

		/* pure window functions have no transition step */
		if (!OidIsValid(state->transfn_oid))
			continue;

		/*
		 * Evaluate all arguments in per-tuple memory, switching context only
		 * once.  Argument slot 0 is reserved for the transition value, so
		 * the inputs occupy slots 1..n.
		 */
		saved = MemoryContextSwitchTo(inputcxt->ecxt_per_tuple_memory);

		argpos = 0;
		foreach(cell, state->winaggstate->args)
		{
			ExprState  *argexpr = (ExprState *) lfirst(cell);

			argpos++;
			callinfo.arg[argpos] = ExecEvalExpr(argexpr, inputcxt,
												&callinfo.argnull[argpos],
												NULL);
		}

		MemoryContextSwitchTo(saved);

		win_advance_transition_function(winstate, state, &callinfo);
	}
}

/*
 * win_finalize_aggregate -
 *
 * Produce the result of one function for the current row.
 *
 * The final function (if any) runs in the node's output per-tuple memory
 * context and the result is delivered there too, so the caller's
 * CurrentMemoryContext is irrelevant.  The result is returned through
 * *resultVal / *resultIsNull.
 */
static void
win_finalize_aggregate(WindowState *winstate,
					   WindowStatePerAgg peraggstate,
					   Datum *resultVal, bool *resultIsNull)
{
	ExprContext *outcxt = winstate->ss.ps.ps_ExprContext;
	MemoryContext callerContext;

	callerContext = MemoryContextSwitchTo(outcxt->ecxt_per_tuple_memory);

	if (!OidIsValid(peraggstate->finalfn_oid))
	{
		/* no final function: the transition value is the result */
		*resultVal = peraggstate->transValue;
		*resultIsNull = peraggstate->transValueIsNull;
	}
	else
	{
		FunctionCallInfoData callinfo;

		InitFunctionCallInfoData(callinfo, &(peraggstate->finalfn), 1,
								 (void *) winstate, NULL);
		callinfo.arg[0] = peraggstate->transValue;
		callinfo.argnull[0] = peraggstate->transValueIsNull;

		if (!(callinfo.flinfo->fn_strict && peraggstate->transValueIsNull))
		{
			/*
			 * Window final functions keep their private per-frame state in
			 * fn_extra; lend them the saved pointer for the call and save it
			 * back afterwards.
			 */
			callinfo.flinfo->fn_extra = peraggstate->frameContext;
			*resultVal = FunctionCallInvoke(&callinfo);
			*resultIsNull = callinfo.isnull;
			peraggstate->frameContext = callinfo.flinfo->fn_extra;
		}
		else
		{
			/* a strict function is never called with a NULL input */
			*resultVal = (Datum) 0;
			*resultIsNull = true;
		}
	}

	/*
	 * A pass-by-reference result that does not already live in the output
	 * context is copied into it.
	 */
	if (!peraggstate->resulttypeByVal && !*resultIsNull)
	{
		if (!MemoryContextContains(CurrentMemoryContext,
								   DatumGetPointer(*resultVal)))
			*resultVal = datumCopy(*resultVal,
								   peraggstate->resulttypeByVal,
								   peraggstate->resulttypeLen);
	}

	MemoryContextSwitchTo(callerContext);
}

/*
 * process_frame -
 *
 * A frame is a set of rows and the unit of aggregate processing attached to
 * the current row.  When a new frame is created the aggregates have to be
 * computed again, which requires a scan through the frame; that scan is
 * skipped when there are only window functions and no aggregates.
 *
 * Fetches the next CURRENT ROW from the stored partition (loading the next
 * partition when the current one is exhausted), computes or restores the
 * result of every window function/aggregate for it, and returns the
 * projected result slot.  Returns NULL when there are no more rows.
 */
static TupleTableSlot *
process_frame(WindowState *winstate)
{
	ExprContext *econtext;
	ExprContext *tmpcontext;
	ProjectionInfo *projInfo;
	bool		forward;
	Tuplestorestate *ts_partition;
	TupleTableSlot *scanslot;
	TupleTableSlot *currentslot;
	int			numaggs = winstate->numaggs;
	int			i;

	currentslot = winstate->currentslot;
	tmpcontext = winstate->tmpcontext;
	scanslot = winstate->ss.ss_ScanTupleSlot;
	econtext = winstate->ss.ps.ps_ExprContext;
	forward = ScanDirectionIsForward(winstate->ss.ps.state->es_direction);
	ts_partition = (Tuplestorestate *) winstate->ts_partition;

	/* No partition has ever been stored (empty input): nothing to return. */
	if (ts_partition == NULL)
		return NULL;

	/*
	 * The row returned by the previous call has been consumed by now, so the
	 * per-output-tuple memory holding its function results and projection
	 * can be recycled.  Without this it would grow for the whole query.
	 */
	ResetExprContext(econtext);

	/* if first time through, initialize currentslot by cloning input slot */
	if (currentslot->tts_tupleDescriptor == NULL)
	{
		ExecSetSlotDescriptor(currentslot, scanslot->tts_tupleDescriptor);
		ExecStoreAllNullTuple(currentslot);
	}

	/*
	 * fetch CURRENT ROW
	 */
	tuplestore_restorepos(ts_partition);

	if (!tuplestore_gettupleslot(ts_partition, forward, currentslot))
	{
		/* current partition is exhausted */
		frame_finish(winstate);
		if (winstate->win_done)
			return NULL;

		store_partition(winstate);

		/*
		 * store_partition() ends the old tuplestore and creates a new one,
		 * so our local pointer is stale; pick up the replacement before
		 * touching the store again.
		 */
		ts_partition = (Tuplestorestate *) winstate->ts_partition;

		if (ts_partition == NULL ||
			!tuplestore_gettupleslot(ts_partition, forward, currentslot))
		{
			frame_finish(winstate);
			return NULL;
		}
	}
	/* remember where the next CURRENT ROW is to be read from */
	tuplestore_markpos(ts_partition);

	/*
	 * By scanning CURRENT ROW, frame may be changed. If frame is new, 
	 * aggregate is restarted. Otherwise, compute final result using
	 * trans value output and cached aggregate result.
	 */
	if (frame_start(winstate))
	{
		/*
		 * a new frame created. Start aggregate from init.
		 */
		win_initialize_aggregates(winstate);

		/*
		 * If none of the functions has a transition function, there is
		 * nothing to accumulate and the scan is skipped.
		 */
		if (winstate->need_aggregate)
		{
			tuplestore_rescan(ts_partition);
			for (;;)
			{
				if (!tuplestore_gettupleslot(ts_partition, forward, scanslot))
					break;

				/* set up for advance_aggregates call */
				tmpcontext->ecxt_outertuple = scanslot;
				win_advance_aggregates(winstate);

				/*
				 * Argument values and transition-function scratch data are
				 * dead now (transition values are kept in wincontext), so
				 * reset the per-input-tuple context after each row.
				 */
				ResetExprContext(tmpcontext);
			}
		}
	}

	for (i = 0; i < numaggs; i++)
	{
		Datum	   *aggvalue = &econtext->ecxt_winaggvalues[i];
		bool	   *aggnull = &econtext->ecxt_winaggnulls[i];
		WindowStatePerAgg peraggstate = &winstate->peragg[i];

		if (!peraggstate->prevCached ||
			peraggstate->finalfn_volatile)
		{
			/*
			 * case 1: non-volatile final function (normal aggregate).
			 * No result is cached yet for this frame: call the final
			 * function once and store the result.
			 *
			 * case 2: VOLATILE final function (window function).
			 * Called for every row, never cached.
			 */
			win_finalize_aggregate(winstate, peraggstate,
								   aggvalue, aggnull);

			if (!peraggstate->finalfn_volatile)
			{
				if (!*aggnull)
				{
					MemoryContext oldContext;

					/* the cache must outlive the per-tuple context */
					oldContext = MemoryContextSwitchTo(winstate->wincontext);
					peraggstate->prevValue = datumCopy(*aggvalue,
													   peraggstate->resulttypeByVal,
													   peraggstate->resulttypeLen);
					MemoryContextSwitchTo(oldContext);
				}
				peraggstate->prevValueIsNull = *aggnull;
				peraggstate->prevCached = true;
			}
		}
		else
		{
			/* restore the cache */
			if (peraggstate->prevValueIsNull)
			{
				*aggvalue = (Datum) 0;
				*aggnull = true;
			}
			else
			{
				MemoryContext oldContext;

				/*
				 * Hand out a copy living in per-output-tuple memory, like a
				 * freshly finalized result, rather than in whatever context
				 * the caller happens to be in (which would leak per row).
				 */
				oldContext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory);
				*aggvalue = datumCopy(peraggstate->prevValue,
									  peraggstate->resulttypeByVal,
									  peraggstate->resulttypeLen);
				MemoryContextSwitchTo(oldContext);
				*aggnull = false;
			}
		}
	}

	projInfo = winstate->ss.ps.ps_ProjInfo;

	/* Vars in Window all refer to OUTER, which equals CURRENT ROW */
	econtext->ecxt_outertuple = currentslot;

	return ExecProject(projInfo, NULL);
}

/*
 * store_partition -
 *
 * Read one whole partition from the outer plan into a fresh tuplestore.
 * Because the frame moves and the current row is fetched again and again,
 * all rows of the partition must be available for repeated access.  The
 * approach is similar to the grouping logic in nodeAgg.c.
 *
 * The first row of the *next* partition, if one is seen, is kept in
 * winstate->prt_firstTuple.  When the outer plan runs dry, win_done is set.
 * If the outer plan has no rows at all, the function returns without
 * creating a tuplestore and without setting partition_processing.
 */
static void 
store_partition(WindowState *winstate)
{
	Window	   *plan = (Window *) winstate->ss.ps.plan;
	PlanState  *child = outerPlanState(winstate);
	ExprContext *inputcxt = winstate->tmpcontext;
	TupleTableSlot *headslot = winstate->ss.ss_ScanTupleSlot;
	Tuplestorestate *store = (Tuplestorestate *) winstate->ts_partition;
	TupleTableSlot *fetched;

	if (winstate->prt_firstTuple == NULL)
	{
		/* nothing carried over: pull the partition's first row now */
		fetched = ExecProcNode(child);
		if (TupIsNull(fetched))
		{
			winstate->win_done = true;
			return;
		}
		winstate->prt_firstTuple = ExecCopySlotTuple(fetched);
	}

	if (winstate->prt_firstTuple != NULL)
	{
		/* the head row serves as the reference for partition-key comparison */
		ExecStoreTuple(winstate->prt_firstTuple,
					   headslot,
					   InvalidBuffer,
					   true);
		winstate->prt_firstTuple = NULL;

		inputcxt->ecxt_outertuple = headslot;

		/*
		 * Replace the previous partition's store.  (Original note: this
		 * needs more thought, as the code forgets about the final process.)
		 */
		if (store != NULL)
			tuplestore_end(store);

		store = tuplestore_begin_heap(true, false, work_mem);
		tuplestore_set_eflags(store,
							  EXEC_FLAG_REWIND |
							  EXEC_FLAG_BACKWARD |
							  EXEC_FLAG_MARK);
		winstate->ts_partition = (void *) store;

		for (;;)
		{
			/* save the row at hand, then look at the next one */
			tuplestore_puttupleslot(store, inputcxt->ecxt_outertuple);

			fetched = ExecProcNode(child);
			if (TupIsNull(fetched))
			{
				winstate->win_done = true;
				break;
			}

			inputcxt->ecxt_outertuple = fetched;

			if (execTuplesMatch(headslot,
								fetched,
								plan->prtNumCols, plan->prtColIdx,
								winstate->prtEqfunctions,
								inputcxt->ecxt_per_tuple_memory))
				continue;

			/* partition key changed: this row opens the next partition */
			winstate->prt_firstTuple = ExecCopySlotTuple(fetched);
			break;
		}
	}

	winstate->partition_processing = true;
}

/* -----------------
 * ExecWindow 
 *
 *	Window node execution proceeds as:
 *		1. store partition
 *		2. fetch current row
 *		3. create a frame if the previous frame has finished.
 *		4. aggregate (trans funcs) if any or if new frame comes.
 *		5. finalize / recompute result using trans value and context
 *		6. go to 2. if current partition hasn't finished, to 1. otherwise.
 *
 *	Returns the next result row, or NULL when the input is exhausted.
 * -----------------
 */
TupleTableSlot *
ExecWindow(WindowState *winstate)
{
	if (!winstate->partition_processing)
	{
		/* input exhausted and the last partition fully returned */
		if (winstate->win_done)
			return NULL;

		store_partition(winstate);

		/*
		 * store_partition() leaves partition_processing unset when the outer
		 * plan produced no row at all.  In that case no (new) tuplestore
		 * exists, so process_frame() must not be entered: it would
		 * dereference a NULL store on a first scan, or read the previous
		 * scan's stale store after a rescan.
		 */
		if (!winstate->partition_processing)
			return NULL;
	}

	return process_frame(winstate);
}

/*
 * GetAggInitVal -
 *
 * Convert the textual initial value stored in pg_aggregate into a Datum of
 * the transition type by running the type's input function on it.
 */
static Datum
GetAggInitVal(Datum textInitVal, Oid transtype)
{
	Oid			inputFn;
	Oid			ioParam;
	char	   *cstr;
	Datum		result;

	getTypeInputInfo(transtype, &inputFn, &ioParam);

	cstr = TextDatumGetCString(textInitVal);
	result = OidInputFunctionCall(inputFn, cstr, ioParam, -1);
	pfree(cstr);

	return result;
}

/* -----------------
 * ExecInitWindow 
 *
 *	Creates and initializes the run-time state (WindowState) for a Window
 *	plan node, including the state of its outer subplan, and returns it.
 *
 *	Window node uses an extra TupleTableSlot for current row.
 *	Window aggregate function is as defined in the grouping aggregate,
 *	the initialization of those functions is almost same, contrast to
 *	that the other part of functions, window functions, are formed as 
 *	aggregate function (i.e. pg_aggregate holds their information) but 
 *	have volatile final function. Also, the trans function may be null
 *	with these functions.
 * -----------------
 */
WindowState *
ExecInitWindow(Window *node, EState *estate, int eflags)
{
	WindowState *winstate;
	Plan	   *outerPlan;
	ExprContext *econtext;
	WindowStatePerAgg peragg;
	int			numaggs = 0, aggno;
	bool		need_aggregate;
	ListCell   *l;

	winstate = makeNode(WindowState);
	winstate->ss.ps.plan = (Plan *) node;
	winstate->ss.ps.state = estate;
	/* remember which of REWIND/BACKWARD/MARK the caller asked for */
	winstate->eflags = (eflags & 
		(EXEC_FLAG_REWIND | EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK));

	/*
	 * Create expression contexts.	We need two, one for per-input-tuple
	 * processing and one for per-output-tuple processing.	We cheat a little
	 * by using ExecAssignExprContext() to build both.
	 */
	ExecAssignExprContext(estate, &winstate->ss.ps);
	winstate->tmpcontext = winstate->ss.ps.ps_ExprContext;
	ExecAssignExprContext(estate, &winstate->ss.ps);

	/*
	 * wincontext holds transition values and cached final results; it is
	 * reset by ExecReScanWindow and deleted by ExecEndWindow.
	 */
	winstate->wincontext =
		AllocSetContextCreate(CurrentMemoryContext,
							  "WinContext",
							  ALLOCSET_DEFAULT_MINSIZE,
							  ALLOCSET_DEFAULT_INITSIZE,
							  ALLOCSET_DEFAULT_MAXSIZE);

/* scan slot, result slot and the extra current-row slot set up below */
#define WINDOW_NSLOTS 3

	/*
	 * tuple table initialization
	 */
	ExecInitScanTupleSlot(estate, &winstate->ss);
	ExecInitResultTupleSlot(estate, &winstate->ss.ps);
	winstate->currentslot = ExecInitExtraTupleSlot(estate);

	/*
	 * initialize child expressions; a Window node has no qual.
	 */
	winstate->ss.ps.targetlist = (List *)
		ExecInitExpr((Expr *) node->plan.targetlist,
					 (PlanState *) winstate);
	winstate->ss.ps.qual = NIL;

	/*
	 * initialize child nodes
	 *
	 * We shield the child node from the need to support REWIND, BACKWARD, or
	 * MARK/RESTORE.
	 */
	eflags &= ~(EXEC_FLAG_REWIND | EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK);
	outerPlan = outerPlan(node);
	outerPlanState(winstate) = ExecInitNode(outerPlan, estate, eflags);

	/*
	 * initialize scan tuple type from the outer plan.
	 */
	ExecAssignScanTypeFromOuterPlan(&winstate->ss);

	/*
	 * initialize result tuple type and projection info.
	 */
	ExecAssignResultTypeFromTL(&winstate->ss.ps);
	ExecAssignProjectionInfo(&winstate->ss.ps, NULL);

	/*
	 * NOTE(review): numaggs and the aggs list are read here but not set
	 * in this function; presumably they were filled in while ExecInitExpr
	 * processed the WinAggref nodes of the targetlist -- TODO confirm.
	 */
	numaggs = winstate->numaggs;
	/* equality functions for comparing partition keys and order keys */
	if (node->prtNumCols > 0)
		winstate->prtEqfunctions = execTuplesMatchPrepare(node->prtNumCols, 
														  node->prtOperators);
	if (node->ordNumCols > 0)
		winstate->ordEqfunctions = execTuplesMatchPrepare(node->ordNumCols,
														  node->ordOperators);

	/*
	 * Set up the per-row result arrays in the output expression context and
	 * the zero-filled per-function working state array.
	 */
	econtext = winstate->ss.ps.ps_ExprContext;
	econtext->ecxt_winaggvalues = (Datum *) palloc0(sizeof(Datum) * numaggs);
	econtext->ecxt_winaggnulls = (bool *) palloc0(sizeof(bool) * numaggs);
	
	peragg = (WindowStatePerAgg) palloc0(sizeof(WindowStatePerAggData) * numaggs);
	winstate->peragg = peragg;
	
	/*
	 * Perform lookups of aggregate function info, and initialize the
	 * unchanging fields of the per-function data.  aggno counts the distinct
	 * entries assigned so far (minus one).
	 */
	aggno = -1;
	need_aggregate = false;
	foreach(l, winstate->aggs)
	{
		WinAggrefExprState *winaggstate = (WinAggrefExprState *) lfirst(l);
		WinAggref		   *winagg = (WinAggref *) winaggstate->xprstate.expr;
		WindowStatePerAgg peraggstate;
		Oid			inputTypes[FUNC_MAX_ARGS];
		int			numArguments;
		HeapTuple	aggTuple;
		Form_pg_aggregate aggform;
		Oid			aggtranstype;
		AclResult	aclresult;
		Oid			transfn_oid;
		Oid			finalfn_oid;
		Expr	   *transfnexpr;
		Expr	   *finalfnexpr;
		Datum		textInitVal;
		int			i;
		ListCell   *lc;
		
		Assert(winagg->agglevelsup == 0);
		
		/* Look for a previous duplicate aggregate */
		for (i = 0; i <= aggno; i++)
		{
			if (equal(winagg, peragg[i].winagg) &&
				!contain_volatile_functions((Node *) winagg))
				break;
		}
		if (i <= aggno)
		{
			/* Found a match to an existing entry, so just mark it */
			winaggstate->aggno = i;
			continue;
		}

		/* Nope, so assign a new PerAgg record */
		peraggstate = &peragg[++aggno];

		/* Mark WinAggref state node with assigned index in the result array */
		winaggstate->aggno = aggno;

		/* Fill in the peraggstate data */
		peraggstate->winaggstate = winaggstate;
		peraggstate->winagg = winagg;
		numArguments = list_length(winagg->args);
		peraggstate->numArguments = numArguments;

		/*
		 * Get actual datatypes of the inputs.	These could be different from
		 * the agg's declared input types, when the agg accepts ANY or a
		 * polymorphic type.
		 */
		i = 0;
		foreach(lc, winagg->args)
		{
			inputTypes[i++] = exprType((Node *) lfirst(lc));
		}

		/* fetch the pg_aggregate row; it stays pinned until the loop's end */
		aggTuple = SearchSysCache(AGGFNOID,
								  ObjectIdGetDatum(winagg->aggfnoid),
								  0, 0, 0);
		if (!HeapTupleIsValid(aggTuple))
			elog(ERROR, "cache lookup failed for window aggregate %u",
				 winagg->aggfnoid);
		aggform = (Form_pg_aggregate) GETSTRUCT(aggTuple);

		/* Check permission to call aggregate function */
		aclresult = pg_proc_aclcheck(winagg->aggfnoid, GetUserId(),
									 ACL_EXECUTE);
		if (aclresult != ACLCHECK_OK)
			aclcheck_error(aclresult, ACL_KIND_PROC,
						   get_func_name(winagg->aggfnoid));

		peraggstate->transfn_oid = transfn_oid = aggform->aggtransfn;
		peraggstate->finalfn_oid = finalfn_oid = aggform->aggfinalfn;

		/*
		 * If final func is volatile, it is a window function, not aggregate.
		 * (finalfn_volatile is otherwise left false by palloc0 above.)
		 */
		if (OidIsValid(finalfn_oid) && func_volatile(finalfn_oid) == PROVOLATILE_VOLATILE)
			peraggstate->finalfn_volatile = true;

		/* Check that aggregate owner has permission to call component fns */
		{
			HeapTuple	procTuple;
			Oid			aggOwner;

			procTuple = SearchSysCache(PROCOID,
									   ObjectIdGetDatum(winagg->aggfnoid),
									   0, 0, 0);
			if (!HeapTupleIsValid(procTuple))
				elog(ERROR, "cache lookup failed for function %u",
					 winagg->aggfnoid);
			aggOwner = ((Form_pg_proc) GETSTRUCT(procTuple))->proowner;
			ReleaseSysCache(procTuple);

			if (OidIsValid(transfn_oid))
			{
				aclresult = pg_proc_aclcheck(transfn_oid, aggOwner,
											 ACL_EXECUTE);
				if (aclresult != ACLCHECK_OK)
					aclcheck_error(aclresult, ACL_KIND_PROC,
								   get_func_name(transfn_oid));
			}

			if (OidIsValid(finalfn_oid))
			{
				aclresult = pg_proc_aclcheck(finalfn_oid, aggOwner,
											 ACL_EXECUTE);
				if (aclresult != ACLCHECK_OK)
					aclcheck_error(aclresult, ACL_KIND_PROC,
								   get_func_name(finalfn_oid));
			}
		}

		/* resolve actual type of transition state, if polymorphic */
		aggtranstype = aggform->aggtranstype;
		if (IsPolymorphicType(aggtranstype))
		{
			/* have to fetch the agg's declared input types... */
			Oid		   *declaredArgTypes;
			int			agg_nargs;

			(void) get_func_signature(winagg->aggfnoid,
									  &declaredArgTypes, &agg_nargs);
			Assert(agg_nargs == numArguments);
			aggtranstype = enforce_generic_type_consistency(inputTypes,
															declaredArgTypes,
															agg_nargs,
															aggtranstype,
															false);
			pfree(declaredArgTypes);
		}

		/* build expression trees using actual argument & result types */
		build_aggregate_fnexprs(inputTypes,
								numArguments,
								aggtranstype,
								winagg->aggtype,
								transfn_oid,
								finalfn_oid,
								&transfnexpr,
								&finalfnexpr);

		if (OidIsValid(transfn_oid))
		{
			fmgr_info(transfn_oid, &peraggstate->transfn);
			peraggstate->transfn.fn_expr = (Node *) transfnexpr;
			/*
			 * If at least one of the window functions is an aggregate (has
			 * a transition function), the frame must be scanned to feed it.
			 * Otherwise that scan can be skipped.
			 */
			need_aggregate = true;
		}

		if (OidIsValid(finalfn_oid))
		{
			fmgr_info(finalfn_oid, &peraggstate->finalfn);
			peraggstate->finalfn.fn_expr = (Node *) finalfnexpr;
		}

		/* length/by-value info needed to copy and free result and trans values */
		get_typlenbyval(winagg->aggtype,
						&peraggstate->resulttypeLen,
						&peraggstate->resulttypeByVal);
		get_typlenbyval(aggtranstype,
						&peraggstate->transtypeLen,
						&peraggstate->transtypeByVal);

		/*
		 * initval is potentially null, so don't try to access it as a struct
		 * field. Must do it the hard way with SysCacheGetAttr.
		 */
		textInitVal = SysCacheGetAttr(AGGFNOID, aggTuple,
									  Anum_pg_aggregate_agginitval,
									  &peraggstate->initValueIsNull);

		if (peraggstate->initValueIsNull)
			peraggstate->initValue = (Datum) 0;
		else
			peraggstate->initValue = GetAggInitVal(textInitVal,
												   aggtranstype);

		/*
		 * If the transfn is strict and the initval is NULL, make sure input
		 * type and transtype are the same (or at least binary-compatible), so
		 * that it's OK to use the first input value as the initial
		 * transValue.	This should have been checked at agg definition time,
		 * but just in case...
		 */
		if (peraggstate->transfn.fn_strict && peraggstate->initValueIsNull)
		{
			if (numArguments < 1 ||
				!IsBinaryCoercible(inputTypes[0], aggtranstype))
				ereport(ERROR,
						(errcode(ERRCODE_INVALID_FUNCTION_DEFINITION),
						 errmsg("aggregate %u needs to have compatible input type and transition type",
								winagg->aggfnoid)));
		}

		ReleaseSysCache(aggTuple);
	}

	/* Update numaggs to match number of unique aggregates found */
	winstate->numaggs = aggno + 1;
	winstate->need_aggregate = need_aggregate;

	return winstate;
}

/* -----------------
 * ExecCountSlotsWindow 
 *
 *	Number of tuple table slots needed by this node and its subplans:
 *	the node's own WINDOW_NSLOTS plus whatever the children require.
 * -----------------
 */
int
ExecCountSlotsWindow(Window *node)
{
	int			nslots = WINDOW_NSLOTS;

	nslots += ExecCountSlotsNode(outerPlan(node));
	nslots += ExecCountSlotsNode(innerPlan(node));

	return nslots;
}
/* -----------------
 * ExecEndWindow 
 *
 *	Shut down the Window node: release both expression contexts, the
 *	scan slot's contents, the node's private memory context and the
 *	partition tuplestore, then shut down the outer subplan.
 * -----------------
 */
void 
ExecEndWindow(WindowState *node)
{
	/*
	 * Two expression contexts were created at startup.  Free the output one
	 * first, then install the per-input-tuple one in its place so the same
	 * routine can free that as well.
	 */
	ExecFreeExprContext(&node->ss.ps);
	node->ss.ps.ps_ExprContext = node->tmpcontext;
	ExecFreeExprContext(&node->ss.ps);

	/* clean up tuple table */
	ExecClearTuple(node->ss.ss_ScanTupleSlot);

	/* transition values and cached results go away with this context */
	MemoryContextDelete(node->wincontext);

	if (node->ts_partition != NULL)
	{
		tuplestore_end((Tuplestorestate *) node->ts_partition);
		node->ts_partition = NULL;
	}

	ExecEndNode(outerPlanState(node));
}

/* -----------------
 * ExecReScanWindow 
 *
 *	Prepare the node to be scanned again from the start: forget the
 *	pending partition head and the per-row results, finish the current
 *	frame, drop all transition values, reset the node's private memory
 *	context and rescan the outer subplan.
 * -----------------
 */
void 
ExecReScanWindow(WindowState *node, ExprContext *exprCtxt)
{
	ExprContext *outcxt = node->ss.ps.ps_ExprContext;
	int			idx;

	node->win_done = false;

	/* a saved first row of the next partition is no longer wanted */
	if (node->prt_firstTuple != NULL)
	{
		heap_freetuple(node->prt_firstTuple);
		node->prt_firstTuple = NULL;
	}

	/* forget the results of the last row */
	MemSet(outcxt->ecxt_winaggvalues, 0, sizeof(Datum) * node->numaggs);
	MemSet(outcxt->ecxt_winaggnulls, 0, sizeof(bool) * node->numaggs);

	frame_finish(node);

	/* release pass-by-reference transition values and mark them unset */
	idx = 0;
	while (idx < node->numaggs)
	{
		WindowStatePerAgg state = &node->peragg[idx++];

		if (state->transtypeByVal ||
			state->transValueIsNull ||
			DatumGetPointer(state->transValue) == NULL)
			continue;

		pfree(DatumGetPointer(state->transValue));
		state->transValueIsNull = true;
		state->noTransValue = true;
	}

	MemoryContextResetAndDeleteChildren(node->wincontext);

	ExecReScan(outerPlanState(node), exprCtxt);
}

/*
 * below are finals and related for window function.
 */

/*
 * getWindowState -
 *
 * Return the WindowState a window final function is being called from,
 * taken from fcinfo->context.  Raises an error if the function is not being
 * called by a Window node.
 */
static WindowState *
getWindowState(FunctionCallInfo fcinfo)
{
	/*
	 * The context is NULL when the function is invoked outside of any node
	 * that supplies one (e.g. called directly).  IsA() reads the node tag
	 * through the pointer, so NULL must be rejected first; otherwise the
	 * backend would crash instead of reporting the error.
	 */
	if (fcinfo->context == NULL || !IsA(fcinfo->context, WindowState))
		elog(ERROR, "Window context is needed for this function");

	return (WindowState *) fcinfo->context;
}

/*
 * allocate_if_new -
 *
 * If the called function has no private state yet (fn_extra is NULL),
 * allocate `size' zero-filled bytes for it in the function's fn_mcxt and
 * hang them on fn_extra.  Does nothing otherwise.
 */
#define allocate_if_new(fcinfo, size) \
	do { \
		if ((fcinfo)->flinfo->fn_extra == NULL) \
		{ \
			MemoryContext aif_saved_cxt_ = \
				MemoryContextSwitchTo((fcinfo)->flinfo->fn_mcxt); \
			(fcinfo)->flinfo->fn_extra = palloc0(size); \
			MemoryContextSwitchTo(aif_saved_cxt_); \
		} \
	} while (0)

/*
 * rank_up
 *	  Shared bookkeeping for the ranking window functions.
 *
 * A rank_context is kept in flinfo->fn_extra.  It remembers a copy of a
 * reference row (heaptuple), a running row count and the current rank.
 *
 * On the first call the current row is copied as the reference row and both
 * rank and rowcount are set to 1.  On every later call the current row is
 * compared with the reference row on the window's ORDER BY columns; if they
 * differ the current row becomes the new reference row.  rowcount is
 * incremented on every call after the first.
 *
 * Returns true when the ORDER BY columns of the current row differ from
 * those of the reference row, false otherwise (including the first call).
 * The caller decides how context->rank is advanced.
 *
 * An error is raised on the second and later calls if the window has no
 * ORDER BY columns.
 */
static bool
rank_up(FunctionCallInfo fcinfo)
{
	WindowState		   *winstate;
	Window			   *node;
	TupleTableSlot	   *slot;
	TupleTableSlot	   *currentslot;
	MemoryContext		oldContext;
	rank_context	   *context;
	bool				up = false; /* should rank up? */

	allocate_if_new(fcinfo, sizeof(rank_context));

	context = (rank_context *) fcinfo->flinfo->fn_extra;
	winstate = getWindowState(fcinfo);
	node = (Window *) winstate->ss.ps.plan;
	currentslot = winstate->currentslot;

	if (context->heaptuple == NULL)
	{
		/* first call: remember the current row, start counting at 1 */
		oldContext = MemoryContextSwitchTo(fcinfo->flinfo->fn_mcxt);
		context->heaptuple = ExecCopySlotTuple(currentslot);
		context->rank = context->rowcount = 1;
		MemoryContextSwitchTo(oldContext);
	}
	else
	{
		if (!node->ordNumCols)
			elog(ERROR, "this function requires ORDER BY clause in the window");

		/*
		 * Borrow the scan slot to compare the remembered row with the
		 * current one.  shouldFree is false: the tuple stays owned by the
		 * rank context.
		 */
		slot = winstate->ss.ss_ScanTupleSlot;
		ExecStoreTuple(context->heaptuple, slot, InvalidBuffer, false);

		if (!execTuplesMatch(slot, currentslot,
							 node->ordNumCols, node->ordColIdx,
							 winstate->ordEqfunctions,
							 winstate->tmpcontext->ecxt_per_tuple_memory))
		{
			HeapTuple	stale = context->heaptuple;

			up = true;
			oldContext = MemoryContextSwitchTo(fcinfo->flinfo->fn_mcxt);
			context->heaptuple = ExecCopySlotTuple(currentslot);
			MemoryContextSwitchTo(oldContext);

			/*
			 * The previous reference row is no longer needed.  Detach it
			 * from the slot first so the slot does not keep a dangling
			 * pointer, then free it; otherwise one tuple copy would pile up
			 * in fn_mcxt for every key change.
			 */
			ExecClearTuple(slot);
			heap_freetuple(stale);
		}
		context->rowcount += 1;
	}

	return up;
}

/*
 * row_number
 * Returns 1 on the first call and one more on each following call.  The
 * counter is an int64 kept in flinfo->fn_extra, created zeroed on first use.
 */
Datum
row_number_final(PG_FUNCTION_ARGS)
{
	int64	   *rownum;

	allocate_if_new(fcinfo, sizeof(int64));
	rownum = (int64 *) fcinfo->flinfo->fn_extra;

	*rownum += 1;

	PG_RETURN_INT64(*rownum);
}

/*
 * rank
 * The rank moves up when the ORDER BY key of the current row differs from
 * the remembered one (see rank_up); the new rank is then the current row
 * number, so ties leave gaps in the sequence.
 */
Datum
rank_final(PG_FUNCTION_ARGS)
{
	rank_context	   *context;
	bool				up;

	/* rank_up() creates the context on first use, so call it first */
	up = rank_up(fcinfo);
	context = (rank_context *) fcinfo->flinfo->fn_extra;
	if (up)
	{
		context->rank = context->rowcount;
	}

	/*
	 * Return through PG_RETURN_INT64, like dense_rank_final does.
	 * Int64GetDatumFast may yield a pointer to context->rank itself, and
	 * that field is overwritten by later calls, so a result the caller
	 * holds on to could change underneath it.
	 */
	PG_RETURN_INT64(context->rank);
}

/*
 * dense_rank
 * The rank moves up when the ORDER BY key of the current row differs from
 * the remembered one (see rank_up); the new rank is the old rank plus one,
 * so the sequence has no gaps.
 */
Datum
dense_rank_final(PG_FUNCTION_ARGS)
{
	rank_context	   *rankstate;
	bool				keychanged;

	/* rank_up() creates the context on first use, so call it first */
	keychanged = rank_up(fcinfo);
	rankstate = (rank_context *) fcinfo->flinfo->fn_extra;

	if (keychanged)
		rankstate->rank++;

	PG_RETURN_INT64(rankstate->rank);
}

/*
 * percent_rank
 * Returns a fraction between 0 and 1 inclusive, computed as
 * (RK - 1) / (NR - 1), where RK is the rank of the current row (with gaps,
 * as in rank) and NR is the total number of rows, taken from argument 0.
 * When NR is 1 the formula would be 0/0; the result is defined as 0 then.
 */
Datum
percent_rank_final(PG_FUNCTION_ARGS)
{
	rank_context	   *context;
	bool				up;
	int64				total_rows;

	/* rank_up() creates the context on first use, so call it first */
	up = rank_up(fcinfo);
	context = (rank_context *) fcinfo->flinfo->fn_extra;
	if (up)
	{
		context->rank = context->rowcount;
	}

	total_rows = PG_GETARG_INT64(0);

	/* a lone row has relative rank 0; this also avoids dividing by zero */
	if (total_rows <= 1)
		PG_RETURN_FLOAT8(0.0);

	PG_RETURN_FLOAT8((float8) (context->rank - 1) / (float8) (total_rows - 1));
}

/*
 * cume_dist
 * return fraction betweeen 0 and 1 inclusive, which 
 * is described as NP / NR, where NP is the number of row preceeding and
 * NR is the number of total row.
 */
Datum
cume_dist_final(PG_FUNCTION_ARGS)
{
	rank_context	   *context;
	bool				up;
	int64				total_rows;

	up = rank_up(fcinfo);
	context = (rank_context *) fcinfo->flinfo->fn_extra;

	total_rows = PG_GETARG_INT64(0);

	PG_RETURN_FLOAT8((float8) context->rowcount / (float8) total_rows);
}

/*
 * ntile(integer) divide each row by bucket.
 * Since if total row count is not divisible by the argument integer
 * it adds one row to the leading buckets, number of difference between
 * maximum bucket's row and minimum bucket's row must be one or zero.
 */
Datum 
ntile_trans(PG_FUNCTION_ARGS)
{
	ArrayType  *data;
	int64	   *elements;
	int64		nbuckets;

	data = PG_GETARG_ARRAYTYPE_P(0);
	elements = (int64 *) ARR_DATA_PTR(data);
	elements[0] += 1;
	if (elements[1] == 0)
	{
		nbuckets = PG_GETARG_INT64(1);
		if (nbuckets <= 0)
			elog(ERROR, "negative or zero bucket size detected");
		elements[1] = nbuckets;
	}

	PG_RETURN_ARRAYTYPE_P(data);
}

Datum
ntile_final(PG_FUNCTION_ARGS)
{
	ntile_context *context;

	allocate_if_new(fcinfo, sizeof(ntile_context));

	context = (ntile_context *) fcinfo->flinfo->fn_extra;

	if (context->ntile == 0)
	{
		/* first call */
		ArrayType  *data;
		int64	   *elements;
		int64		total;
		int64		nbuckets;

		data = PG_GETARG_ARRAYTYPE_P(0);
		elements = (int64 *) ARR_DATA_PTR(data);
		total = elements[0];
		nbuckets = elements[1];

		context->ntile = 1;
		context->nrows = 0;
		context->boundary = total / nbuckets;
		if (context->boundary <= 0)
			context->boundary = 1;
		else
		{
			/*
			 * If the total number is not divisible, add 1 row to 
			 * leading buckets.
			 */
			context->remainder = total % nbuckets;
			if (context->remainder != 0)
				context->boundary += 1;
		}
	}

	context->nrows += 1;
	if (context->boundary < context->nrows)
	{
		if (context->remainder != 0 && context->ntile == context->remainder)
		{
			context->remainder = 0;
			context->boundary -= 1;
		}
		context->ntile += 1;
		context->nrows = 1;
	}
	
	PG_RETURN_INT64(context->ntile);
}
