From f2d3149b370a27d812ad7f830fdd1e1e9a5088d3 Mon Sep 17 00:00:00 2001
From: amit <amitlangote09@gmail.com>
Date: Fri, 18 Mar 2016 19:48:49 +0900
Subject: [PATCH 6/6] Introduce tuple routing for partitioned tables.

A tuple inserted into a partitioned table using insert or copy is now
routed to the appropriate leaf partition, recursively.

State required for the same is built in advance by BeginCopy() and
ExecInitModifyTable().  It consists of N ResultRelInfos and pointers
to TupleConversionMap structs, where N is the number of leaf partitions
in the multi-level partition tree. One needs only to get a pointer to the
root PartitionDescNode node for the tree. A tuple conversion map is only
allocated if the tuple descriptor of a leaf partition differs from that
of the root table.  Partition check expression is not initialized for
leaf partition result relations, as they are redundant in this case.
Whether or not to build one is now determined by new parameter
load_partition_check of InitResultRelInfo().

CopyFrom() or ExecInsert() passes the root node (set up initially, as
just mentioned) along with the tuple slot to get_partition_for_tuple()
which traverses the tree to determine the appropriate leaf partition for
the tuple.  It returns the index (0..N) of the leaf partition.  Caller
then switches the result relation (ResultRelInfo) appropriately and
performs the tuple conversion, if required, before following up with
the row triggers, constraints, actual heap and index insertion.

* Notes about foreign partitions:
Foreign partitions require special consideration as a suitable insert plan
needs to be created in advance.  So, make_modifytable() invokes
PlanForeignModify() for every leaf partition that is a foreign table.
Resulting plans are stored in ModifyTable.fdwPrivLists.  Subsequently,
ExecInitModifyTable() initializes ForeignModifys for each FDW partition
using the information in fdwPrivLists.  Similarly, BeginCopy() sets up
CopyState.partition_fdw_priv_lists (process in this case is more dramatic
than in ModifyTable's) and later CopyFrom() initializes ForeignModifys.

* Following requires further consideration:
Both CopyFrom() and ExecInsert() switch to the partition before invoking
BR triggers and back after AR trigger are processed. That ignores parent's
triggers completely.  Note that those triggers are not required to be present
in each partition.  OTOH, constraints are fine (both check and not null),
since parent's constraints are present in partitions.
---
 src/backend/catalog/partition.c         |  352 +++++++++++++++++++++++++++++++
 src/backend/commands/copy.c             |  198 +++++++++++++++++-
 src/backend/commands/tablecmds.c        |    1 +
 src/backend/executor/execMain.c         |   46 ++++-
 src/backend/executor/nodeModifyTable.c  |  116 ++++++++++
 src/backend/optimizer/plan/createplan.c |   59 +++++
 src/backend/optimizer/util/plancat.c    |   13 ++
 src/backend/parser/analyze.c            |    8 +
 src/include/catalog/partition.h         |   25 +++
 src/include/executor/executor.h         |    6 +
 src/include/nodes/execnodes.h           |   10 +
 src/include/optimizer/plancat.h         |    1 +
 12 files changed, 828 insertions(+), 7 deletions(-)

diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c
index 3404823..69fc362 100644
--- a/src/backend/catalog/partition.c
+++ b/src/backend/catalog/partition.c
@@ -96,6 +96,21 @@ static List *generate_partition_check_qual(Relation rel);
 static PartitionDescNode GetPartitionDescNodeRecurse(Relation rel, int offset);
 static int get_leaf_partition_count(PartitionDescNode pdnode);
 
+static PartitionKeyExecInfo *BuildPartitionKeyExecInfo(Relation rel);
+static void FormPartitionKeyDatum(PartitionKeyExecInfo *pkinfo,
+							TupleTableSlot *slot,
+							EState *estate,
+							Datum *values,
+							bool *isnull);
+static int list_partition_for_tuple(PartitionKey key, PartitionDesc pdesc,
+							Datum value, bool isnull);
+static int range_partition_for_tuple(PartitionKey key, PartitionDesc pdesc,
+							Datum *values);
+static int range_partition_bsearch(PartitionKey key, PartitionDesc pdesc,
+						Datum *values);
+static bool rightof(PartitionKey key, Datum *values, PartitionRangeBound *bound);
+static bool leftof(PartitionKey key, Datum *values, PartitionRangeBound *bound);
+
 /*
  * StorePartitionKey
  *		Store the partition keys of rel into pg_partitioned catalog
@@ -978,6 +993,34 @@ get_partition_parent(Oid relid)
 }
 
 /*
+ * get_leaf_partitions
+ *		Returns a list of all leaf-level partitions of relation with OID
+ *		'relid'.
+ */
+List *
+get_leaf_partitions(Oid relid, int lockmode)
+{
+	List	   *partitions,
+			   *result = NIL;
+	ListCell   *lc;
+
+	partitions = get_partitions(relid, lockmode);
+
+	foreach(lc, partitions)
+	{
+		Oid		myoid = lfirst_oid(lc);
+
+		if (relid_is_partitioned(myoid))
+			result = list_concat(result,
+								 get_leaf_partitions(myoid, lockmode));
+
+		result = lappend_oid(result, myoid);
+	}
+
+	return result;
+}
+
+/*
  * RelationDropPartitions
  *		Find and drop all partitions of relation with OID 'relid'
  */
@@ -1699,6 +1742,7 @@ GetPartitionDescNodeRecurse(Relation rel, int offset)
 
 	/* First build our own node */
 	root = (PartitionDescNode) palloc0(sizeof(PartitionDescNodeData));
+	root->pkinfo = BuildPartitionKeyExecInfo(rel);
 	root->pdesc = pdesc;
 	root->relid = RelationGetRelid(rel);
 	root->offset = offset;
@@ -1816,3 +1860,311 @@ get_leaf_partition_oids(PartitionDescNode pdnode)
 
 	return result;
 }
+
+/* ----------------
+ *	BuildPartitionKeyExecInfo
+ *		Construct a list of PartitionKeyExecInfo records for an open
+ *		relation
+ *
+ * PartitionKeyExecInfo stores the information about the partition key
+ * that's needed when inserting tuples into a partitioned table; especially,
+ * partition key expression state if there are any expression columns in
+ * the partition key.  Normally we build a PartitionKeyExecInfo for a
+ * partitioned table just once per command, and then use it for (potentially)
+ * many tuples.
+ * ----------------
+ */
+static PartitionKeyExecInfo *
+BuildPartitionKeyExecInfo(Relation rel)
+{
+	PartitionKeyExecInfo   *pkinfo;
+
+	pkinfo = (PartitionKeyExecInfo *) palloc0(sizeof(PartitionKeyExecInfo));
+	pkinfo->pi_Key = CopyPartitionKey(rel->rd_partkey);
+	pkinfo->pi_ExpressionState = NIL;
+
+	return pkinfo;
+}
+
+/*
+ * get_partition_for_tuple
+ *		Recursively finds the "leaf" partition for tuple (slot)
+ *
+ * Returns -1 if no partition is found and sets *failed_at to the OID of
+ * the partitioned table whose partition was not found.
+ */
+int
+get_partition_for_tuple(PartitionDescNode pdnode,
+						TupleTableSlot *slot,
+						EState *estate,
+						Oid *failed_at)
+{
+	PartitionKeyExecInfo   *pkinfo = pdnode->pkinfo;
+	PartitionDescNode		node;
+	Datum	values[PARTITION_MAX_KEYS];
+	bool	isnull[PARTITION_MAX_KEYS];
+	int		i;
+	int		index;
+
+	/* Guard against stack overflow due to overly deep partition tree */
+	check_stack_depth();
+
+	if (pdnode->pdesc->nparts == 0)
+	{
+		*failed_at = pdnode->relid;
+		return -1;
+	}
+
+	/* Extract partition key from tuple */
+	Assert(GetPerTupleExprContext(estate)->ecxt_scantuple == slot);
+	FormPartitionKeyDatum(pkinfo, slot, estate, values, isnull);
+
+	/* Disallow nulls, if range partition key */
+	for (i = 0; i < pkinfo->pi_Key->partnatts; i++)
+		if (isnull[i] && pkinfo->pi_Key->strategy == PARTITION_STRAT_RANGE)
+			ereport(ERROR,
+					(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
+					 errmsg("range partition key contains null")));
+
+	switch (pkinfo->pi_Key->strategy)
+	{
+		case PARTITION_STRAT_LIST:
+			index = list_partition_for_tuple(pkinfo->pi_Key, pdnode->pdesc,
+											 values[0], isnull[0]);
+			break;
+
+		case PARTITION_STRAT_RANGE:
+			index = range_partition_for_tuple(pkinfo->pi_Key, pdnode->pdesc,
+											  values);
+			break;
+	}
+
+	/* Don't recurse, if no partition exists for the key at this level... */
+	if (index < 0)
+	{
+		*failed_at = pdnode->relid;
+		return index;
+	}
+
+	/* ... or, if the index'th partition is a leaf partition. */
+	if (!relid_is_partitioned(pdnode->pdesc->oids[index]))
+	{
+		PartitionDescNode	prev;
+
+		/* Patiently determine own leaf partition index */
+		prev = node = pdnode->downlink;
+		if (node && node->index < index)
+		{
+			while (node)
+			{
+				if (node->index > index)
+				{
+					node = prev;
+					break;
+				}
+
+				prev = node;
+				node = node->next;
+			}
+
+			if (!node)
+				node = prev;
+
+			/*
+			 * Account for siblings on left that are partition trees and those
+			 * that are leaf partitions themsleves.
+			 */
+			return node->offset + node->num_leaf_partitions +
+										(index - node->index - 1);
+		}
+		else
+			/* Only leaf partitions in pdnode->offset..index */
+			return pdnode->offset + index;
+
+	}
+
+	/* Recurse, by locating the index'th partition's node */
+	node = pdnode->downlink;
+	while (node->next != NULL && node->index != index)
+		node = node->next;
+	Assert (node != NULL);
+
+	return get_partition_for_tuple(node, slot, estate, failed_at);
+}
+
+/*
+ * FormPartitionKeyDatum
+ *		Construct values[] and isnull[] arrays for partition key columns
+ */
+static void
+FormPartitionKeyDatum(PartitionKeyExecInfo *pkinfo,
+					  TupleTableSlot *slot,
+					  EState *estate,
+					  Datum *values,
+					  bool *isnull)
+{
+	ListCell   *partexpr_item;
+	int			i;
+
+	if (pkinfo->pi_Key->partexprs != NIL && pkinfo->pi_ExpressionState == NIL)
+	{
+		/* First time through, set up expression evaluation state */
+		pkinfo->pi_ExpressionState = (List *)
+			ExecPrepareExpr((Expr *) pkinfo->pi_Key->partexprs,
+							estate);
+		/* Check caller has set up context correctly */
+		Assert(GetPerTupleExprContext(estate)->ecxt_scantuple == slot);
+	}
+
+	partexpr_item = list_head(pkinfo->pi_ExpressionState);
+	for (i = 0; i < pkinfo->pi_Key->partnatts; i++)
+	{
+		AttrNumber	keycol = pkinfo->pi_Key->partattrs[i];
+		Datum		pkDatum;
+		bool		isNull;
+
+		if (keycol != 0)
+		{
+			/* Plain column; get the value directly from the heap tuple */
+			pkDatum = slot_getattr(slot, keycol, &isNull);
+		}
+		else
+		{
+			/* Expression; need to evaluate it */
+			if (partexpr_item == NULL)
+				elog(ERROR, "wrong number of partition key expressions");
+			pkDatum = ExecEvalExprSwitchContext((ExprState *) lfirst(partexpr_item),
+											   GetPerTupleExprContext(estate),
+											   &isNull,
+											   NULL);
+			partexpr_item = lnext(partexpr_item);
+		}
+		values[i] = pkDatum;
+		isnull[i] = isNull;
+	}
+
+	if (partexpr_item != NULL)
+		elog(ERROR, "wrong number of partition key expressions");
+}
+
+/*
+ * list_partition_for_tuple
+ *		Find the list partition for a tuple
+ *
+ * Returns -1 if none found.
+ */
+static int
+list_partition_for_tuple(PartitionKey key, PartitionDesc pdesc,
+						 Datum value, bool isnull)
+{
+	int			i;
+
+	Assert(pdesc->nparts > 0);
+
+	for (i = 0; i < pdesc->nparts; i++)
+	{
+		int		j;
+
+		if (isnull)
+		{
+			if (pdesc->lists[i]->contains_null)
+				return i;
+
+			continue;
+		}
+
+		for (j = 0; j < pdesc->lists[i]->nvalues; j++)
+		{
+			int32	cmpval;
+
+			cmpval = DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[0],
+								   key->tcinfo->typcoll[0],
+								   pdesc->lists[i]->values[j],
+								   value));
+			if (!cmpval)
+				return i;
+		}
+	}
+
+	return -1;
+}
+
+/*
+ * range_partition_for_tuple
+ *		Search the range partition for a range key ('values')
+ *
+ * Returns -1 if none found.
+ */
+static int
+range_partition_for_tuple(PartitionKey key, PartitionDesc pdesc, Datum *values)
+{
+	Assert(pdesc->nparts > 0);
+
+	return range_partition_bsearch(key, pdesc, values);
+}
+
+/*
+ * range_partition_bsearch
+ *		Workhorse of range_partition_for_tuple
+ */
+static int
+range_partition_bsearch(PartitionKey key, PartitionDesc pdesc,
+						Datum *values)
+{
+	int		low, high;
+
+	/* Good ol' bsearch */
+	low = 0;
+	high = pdesc->nparts - 1;
+	while (low <= high)
+	{
+		int		idx = (low + high) / 2;
+
+		if (pdesc->rangeuppers[idx]->infinite)
+		{
+			if (rightof(key, values, pdesc->rangelowers[idx]))
+				return idx;
+
+			break;
+		}
+		else if (leftof(key, values, pdesc->rangeuppers[idx]))
+		{
+			if (pdesc->rangelowers[idx]->infinite)
+				return idx;
+
+			if (rightof(key, values, pdesc->rangelowers[idx]))
+				return idx;
+
+			high = idx - 1;
+			continue;
+		}
+
+		low = idx + 1;
+	}
+
+	return -1;
+}
+
+/* Does range key lie to the right of partition bound */
+static bool
+rightof(PartitionKey key, Datum *values, PartitionRangeBound *bound)
+{
+	int32	cmpval = compare_range_keys(key, values, bound->val);
+
+	if (!cmpval)
+		return bound->lower ? bound->inclusive : !bound->inclusive;
+
+	return cmpval > 0;
+}
+
+/* Does range key lie to the left of partition bound */
+static bool
+leftof(PartitionKey key, Datum *values, PartitionRangeBound *bound)
+{
+	int32	cmpval = compare_range_keys(key, values, bound->val);
+
+	if (!cmpval)
+		return !bound->lower ? bound->inclusive : !bound->inclusive;
+
+	return cmpval < 0;
+}
diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c
index 76803a5..0e2c61a 100644
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@@ -30,6 +30,7 @@
 #include "commands/defrem.h"
 #include "commands/trigger.h"
 #include "executor/executor.h"
+#include "foreign/fdwapi.h"
 #include "libpq/libpq.h"
 #include "libpq/pqformat.h"
 #include "mb/pg_wchar.h"
@@ -161,6 +162,11 @@ typedef struct CopyStateData
 	ExprState **defexprs;		/* array of default att expressions */
 	bool		volatile_defexprs;		/* is any of defexprs volatile? */
 	List	   *range_table;
+	PartitionDescNode		pdnode;	/* partition descriptor node tree */
+	ResultRelInfo		   *partitions;
+	TupleConversionMap	  **partition_tupconv_maps;
+	List				   *partition_fdw_priv_lists;
+	int						num_partitions;
 
 	/*
 	 * These variables are used to reduce overhead in textual COPY FROM.
@@ -1364,6 +1370,87 @@ BeginCopy(bool is_from,
 					(errcode(ERRCODE_UNDEFINED_COLUMN),
 					 errmsg("table \"%s\" does not have OIDs",
 							RelationGetRelationName(cstate->rel))));
+
+		/*
+		 * Initialize state for CopyFrom tuple routing.  Watch out for
+		 * any foreign partitions.
+		 */
+		if (is_from && rel->rd_rel->relkind == RELKIND_PARTITIONED_REL)
+		{
+			List		   *leaf_part_oids;
+			ListCell	   *cell;
+			int				i;
+			int				num_leaf_parts;
+			ResultRelInfo  *leaf_rel_rri;
+			PlannerInfo *root = makeNode(PlannerInfo);	/* mostly dummy */
+			Query		*parse = makeNode(Query);		/* ditto */
+			ModifyTable *plan = makeNode(ModifyTable);	/* ditto */
+			RangeTblEntry *fdw_rte = makeNode(RangeTblEntry);	/* ditto */
+			List		*fdw_private_lists = NIL;
+
+			cstate->pdnode = RelationGetPartitionDescNode(rel);
+			leaf_part_oids = get_leaf_partition_oids(cstate->pdnode);
+			num_leaf_parts = list_length(leaf_part_oids);
+
+			cstate->num_partitions = num_leaf_parts;
+			cstate->partitions = (ResultRelInfo *)
+								palloc0(num_leaf_parts * sizeof(ResultRelInfo));
+			cstate->partition_tupconv_maps = (TupleConversionMap **)
+						palloc0(num_leaf_parts * sizeof(TupleConversionMap *));
+
+			/* For use below, iff a partition found to be a foreign table */
+			plan->operation = CMD_INSERT;
+			plan->plans = list_make1(makeNode(Result));
+			fdw_rte->rtekind = RTE_RELATION;
+			fdw_rte->relkind = RELKIND_FOREIGN_TABLE;
+			parse->rtable = list_make1(fdw_rte);
+			root->parse = parse;
+
+			leaf_rel_rri = cstate->partitions;
+			i = 0;
+			foreach(cell, leaf_part_oids)
+			{
+				Relation	leaf_rel;
+
+				leaf_rel = heap_open(lfirst_oid(cell), RowExclusiveLock);
+				InitResultRelInfo(leaf_rel_rri,
+								  leaf_rel,
+								  1,		/* dummy */
+								  false,	/* no need for partition check */
+								  0);
+
+				/* Open partition indices */
+				ExecOpenIndices(leaf_rel_rri, false);
+
+				/* Special dance for foreign tables */
+				if (leaf_rel_rri->ri_FdwRoutine)
+				{
+					List		  *fdw_private;
+
+					fdw_rte->relid = RelationGetRelid(leaf_rel);
+					fdw_private = leaf_rel_rri->ri_FdwRoutine->PlanForeignModify(root,
+																		  plan,
+																		  1,
+																		  0);
+					fdw_private_lists = lappend(fdw_private_lists, fdw_private);
+				}
+
+				if (!equalTupleDescs(tupDesc, RelationGetDescr(leaf_rel)))
+					cstate->partition_tupconv_maps[i] =
+								convert_tuples_by_name(tupDesc,
+									RelationGetDescr(leaf_rel),
+									gettext_noop("could not convert row type"));
+
+				leaf_rel_rri++;
+				i++;
+			}
+
+			cstate->partition_fdw_priv_lists = fdw_private_lists;
+			pfree(fdw_rte);
+			pfree(plan);
+			pfree(parse);
+			pfree(root);
+		}
 	}
 	else
 	{
@@ -1659,6 +1746,8 @@ ClosePipeToProgram(CopyState cstate)
 static void
 EndCopy(CopyState cstate)
 {
+	int		i;
+
 	if (cstate->is_program)
 	{
 		ClosePipeToProgram(cstate);
@@ -1672,6 +1761,23 @@ EndCopy(CopyState cstate)
 							cstate->filename)));
 	}
 
+	/* Close all partitions and indices thereof */
+	for (i = 0; i < cstate->num_partitions; i++)
+	{
+		ResultRelInfo *resultRelInfo = cstate->partitions + i;
+
+		ExecCloseIndices(resultRelInfo);
+		heap_close(resultRelInfo->ri_RelationDesc, NoLock);
+
+		/* XXX - EState not handy here to pass to EndForeignModify() */
+		if (resultRelInfo->ri_FdwRoutine &&
+			resultRelInfo->ri_FdwRoutine->EndForeignModify != NULL)
+			resultRelInfo->ri_FdwRoutine->EndForeignModify(NULL, resultRelInfo);
+
+		if (cstate->partition_tupconv_maps[i])
+			pfree(cstate->partition_tupconv_maps[i]);
+	}
+
 	MemoryContextDelete(cstate->copycontext);
 	pfree(cstate);
 }
@@ -2216,6 +2322,7 @@ CopyFrom(CopyState cstate)
 	Datum	   *values;
 	bool	   *nulls;
 	ResultRelInfo *resultRelInfo;
+	ResultRelInfo *saved_resultRelInfo = NULL;
 	EState	   *estate = CreateExecutorState(); /* for ExecConstraints() */
 	ExprContext *econtext;
 	TupleTableSlot *myslot;
@@ -2236,7 +2343,8 @@ CopyFrom(CopyState cstate)
 
 	Assert(cstate->rel);
 
-	if (cstate->rel->rd_rel->relkind != RELKIND_RELATION)
+	if (cstate->rel->rd_rel->relkind != RELKIND_RELATION &&
+		cstate->rel->rd_rel->relkind != RELKIND_PARTITIONED_REL)
 	{
 		if (cstate->rel->rd_rel->relkind == RELKIND_VIEW)
 			ereport(ERROR,
@@ -2344,6 +2452,7 @@ CopyFrom(CopyState cstate)
 	InitResultRelInfo(resultRelInfo,
 					  cstate->rel,
 					  1,		/* dummy rangetable index */
+					  true,		/* do load partition check expression */
 					  0);
 
 	ExecOpenIndices(resultRelInfo, false);
@@ -2371,6 +2480,7 @@ CopyFrom(CopyState cstate)
 	if ((resultRelInfo->ri_TrigDesc != NULL &&
 		 (resultRelInfo->ri_TrigDesc->trig_insert_before_row ||
 		  resultRelInfo->ri_TrigDesc->trig_insert_instead_row)) ||
+		cstate->pdnode != NULL ||
 		cstate->volatile_defexprs)
 	{
 		useHeapMultiInsert = false;
@@ -2392,10 +2502,46 @@ CopyFrom(CopyState cstate)
 	 */
 	ExecBSInsertTriggers(estate, resultRelInfo);
 
+	/* Initialize FDW partition insert plans */
+	if (cstate->pdnode)
+	{
+		int			i,
+					j;
+		List	   *fdw_private_lists = cstate->partition_fdw_priv_lists;
+		ModifyTableState   *mtstate = makeNode(ModifyTableState);
+		ResultRelInfo	   *leaf_part_rri;
+
+		/* Mostly dummy containing enough state for BeginForeignModify */
+		mtstate->ps.state = estate;
+		mtstate->operation = CMD_INSERT;
+
+		j = 0;
+		leaf_part_rri = cstate->partitions;
+		for (i = 0; i < cstate->num_partitions; i++)
+		{
+			if (leaf_part_rri->ri_FdwRoutine)
+			{
+				List *fdw_private;
+
+				Assert(fdw_private_lists);
+				fdw_private = list_nth(fdw_private_lists, j++);
+				leaf_part_rri->ri_FdwRoutine->BeginForeignModify(mtstate,
+															leaf_part_rri,
+															fdw_private,
+															0, 0);
+			}
+			leaf_part_rri++;
+		}
+	}
+
 	values = (Datum *) palloc(tupDesc->natts * sizeof(Datum));
 	nulls = (bool *) palloc(tupDesc->natts * sizeof(bool));
 
-	bistate = GetBulkInsertState();
+	if (useHeapMultiInsert)
+		bistate = GetBulkInsertState();
+	else
+		bistate = NULL;
+
 	econtext = GetPerTupleExprContext(estate);
 
 	/* Set up callback to identify error line number */
@@ -2447,6 +2593,31 @@ CopyFrom(CopyState cstate)
 		slot = myslot;
 		ExecStoreTuple(tuple, slot, InvalidBuffer, false);
 
+		/* Determine the partition */
+		saved_resultRelInfo = resultRelInfo;
+		if (cstate->pdnode)
+		{
+			int		i_leaf_partition;
+			TupleConversionMap *map;
+
+			econtext->ecxt_scantuple = slot;
+			i_leaf_partition = ExecFindPartition(resultRelInfo,
+												 cstate->pdnode,
+												 slot,
+												 estate);
+			Assert(i_leaf_partition >= 0 &&
+				   i_leaf_partition < cstate->num_partitions);
+
+			resultRelInfo = cstate->partitions + i_leaf_partition;
+			estate->es_result_relation_info = resultRelInfo;
+
+			map = cstate->partition_tupconv_maps[i_leaf_partition];
+			if (map)
+				tuple = do_convert_tuple(tuple, map);
+
+			tuple->t_tableOid = RelationGetRelid(resultRelInfo->ri_RelationDesc);
+		}
+
 		skip_tuple = false;
 
 		/* BEFORE ROW INSERT Triggers */
@@ -2467,7 +2638,16 @@ CopyFrom(CopyState cstate)
 			if (cstate->rel->rd_att->constr || resultRelInfo->ri_PartitionCheck)
 				ExecConstraints(resultRelInfo, slot, estate);
 
-			if (useHeapMultiInsert)
+			if (resultRelInfo->ri_FdwRoutine)
+			{
+				resultRelInfo->ri_FdwRoutine->ExecForeignInsert(estate,
+																resultRelInfo,
+																slot,
+																NULL);
+				/* AFTER ROW INSERT Triggers */
+				ExecARInsertTriggers(estate, resultRelInfo, tuple, NIL);
+			}
+			else if (useHeapMultiInsert)
 			{
 				/* Add this tuple to the tuple buffer */
 				if (nBufferedTuples == 0)
@@ -2497,7 +2677,8 @@ CopyFrom(CopyState cstate)
 				List	   *recheckIndexes = NIL;
 
 				/* OK, store the tuple and create index entries for it */
-				heap_insert(cstate->rel, tuple, mycid, hi_options, bistate);
+				heap_insert(resultRelInfo->ri_RelationDesc,
+							tuple, mycid, hi_options, bistate);
 
 				if (resultRelInfo->ri_NumIndices > 0)
 					recheckIndexes = ExecInsertIndexTuples(slot, &(tuple->t_self),
@@ -2517,6 +2698,12 @@ CopyFrom(CopyState cstate)
 			 * tuples inserted by an INSERT command.
 			 */
 			processed++;
+
+			if (saved_resultRelInfo)
+			{
+				resultRelInfo = saved_resultRelInfo;
+				estate->es_result_relation_info = resultRelInfo;
+			}
 		}
 	}
 
@@ -2530,7 +2717,8 @@ CopyFrom(CopyState cstate)
 	/* Done, clean up */
 	error_context_stack = errcallback.previous;
 
-	FreeBulkInsertState(bistate);
+	if (bistate)
+		FreeBulkInsertState(bistate);
 
 	MemoryContextSwitchTo(oldcontext);
 
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index 5b83b8d..5911659 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -1237,6 +1237,7 @@ ExecuteTruncate(TruncateStmt *stmt)
 		InitResultRelInfo(resultRelInfo,
 						  rel,
 						  0,	/* dummy rangetable index */
+						  false,
 						  0);
 		resultRelInfo++;
 	}
diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c
index bbd2187..0d86e14 100644
--- a/src/backend/executor/execMain.c
+++ b/src/backend/executor/execMain.c
@@ -826,6 +826,7 @@ InitPlan(QueryDesc *queryDesc, int eflags)
 			InitResultRelInfo(resultRelInfo,
 							  resultRelation,
 							  resultRelationIndex,
+							  true,
 							  estate->es_instrument);
 			resultRelInfo++;
 		}
@@ -1215,6 +1216,7 @@ void
 InitResultRelInfo(ResultRelInfo *resultRelInfo,
 				  Relation resultRelationDesc,
 				  Index resultRelationIndex,
+				  bool load_partition_check,
 				  int instrument_options)
 {
 	MemSet(resultRelInfo, 0, sizeof(ResultRelInfo));
@@ -1252,8 +1254,9 @@ InitResultRelInfo(ResultRelInfo *resultRelInfo,
 	resultRelInfo->ri_ConstraintExprs = NULL;
 	resultRelInfo->ri_junkFilter = NULL;
 	resultRelInfo->ri_projectReturning = NULL;
-	resultRelInfo->ri_PartitionCheck = (Expr *)
-						RelationGetPartitionCheckQual(resultRelationDesc);
+	if (load_partition_check)
+		resultRelInfo->ri_PartitionCheck = (Expr *)
+							RelationGetPartitionCheckQual(resultRelationDesc);
 }
 
 /*
@@ -1316,6 +1319,7 @@ ExecGetTriggerResultRel(EState *estate, Oid relid)
 	InitResultRelInfo(rInfo,
 					  rel,
 					  0,		/* dummy rangetable index */
+					  true,
 					  estate->es_instrument);
 	estate->es_trig_target_relations =
 		lappend(estate->es_trig_target_relations, rInfo);
@@ -2996,3 +3000,41 @@ EvalPlanQualEnd(EPQState *epqstate)
 	epqstate->planstate = NULL;
 	epqstate->origslot = NULL;
 }
+
+int
+ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDescNode pdnode,
+				  TupleTableSlot *slot, EState *estate)
+{
+	int		i_leaf_partition;
+	Oid		failed_at;
+
+	i_leaf_partition = get_partition_for_tuple(pdnode, slot, estate,
+											   &failed_at);
+
+	if (i_leaf_partition < 0)
+	{
+		Relation	rel = resultRelInfo->ri_RelationDesc;
+		char	   *val_desc;
+		Bitmapset  *insertedCols,
+				   *updatedCols,
+				   *modifiedCols;
+		TupleDesc	tupDesc = RelationGetDescr(rel);
+
+		insertedCols = GetInsertedColumns(resultRelInfo, estate);
+		updatedCols = GetUpdatedColumns(resultRelInfo, estate);
+		modifiedCols = bms_union(insertedCols, updatedCols);
+		val_desc = ExecBuildSlotValueDescription(RelationGetRelid(rel),
+												 slot,
+												 tupDesc,
+												 modifiedCols,
+												 64);
+		Assert(OidIsValid(failed_at));
+		ereport(ERROR,
+				(errcode(ERRCODE_CHECK_VIOLATION),
+				 errmsg("no partition of relation \"%s\" found for row",
+						get_rel_name(failed_at)),
+		  val_desc ? errdetail("Failing row contains %s.", val_desc) : 0));
+	}
+
+	return i_leaf_partition;
+}
diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index 3ed321e..7fc2c19 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -243,6 +243,7 @@ ExecInsert(ModifyTableState *mtstate,
 {
 	HeapTuple	tuple;
 	ResultRelInfo *resultRelInfo;
+	ResultRelInfo *saved_resultRelInfo = NULL;
 	Relation	resultRelationDesc;
 	Oid			newId;
 	List	   *recheckIndexes = NIL;
@@ -257,6 +258,31 @@ ExecInsert(ModifyTableState *mtstate,
 	 * get information on the (current) result relation
 	 */
 	resultRelInfo = estate->es_result_relation_info;
+
+	saved_resultRelInfo = resultRelInfo;
+
+	if (mtstate->mt_partition_node)
+	{
+		int		i_leaf_partition;
+		ExprContext *econtext = GetPerTupleExprContext(estate);
+		TupleConversionMap *map;
+
+		econtext->ecxt_scantuple = slot;
+		i_leaf_partition = ExecFindPartition(resultRelInfo,
+											 mtstate->mt_partition_node,
+											 slot,
+											 estate);
+		Assert(i_leaf_partition >= 0 &&
+			   i_leaf_partition < mtstate->mt_num_partitions);
+
+		resultRelInfo = mtstate->mt_partitions + i_leaf_partition;
+		estate->es_result_relation_info = resultRelInfo;
+
+		map = mtstate->mt_partition_tupconv_maps[i_leaf_partition];
+		if (map)
+			tuple = do_convert_tuple(tuple, map);
+	}
+
 	resultRelationDesc = resultRelInfo->ri_RelationDesc;
 
 	/*
@@ -496,6 +522,12 @@ ExecInsert(ModifyTableState *mtstate,
 
 	list_free(recheckIndexes);
 
+	if (saved_resultRelInfo)
+	{
+		resultRelInfo = saved_resultRelInfo;
+		estate->es_result_relation_info = resultRelInfo;
+	}
+
 	/*
 	 * Check any WITH CHECK OPTION constraints from parent views.  We are
 	 * required to do this after testing all constraints and uniqueness
@@ -1550,6 +1582,7 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
 	Plan	   *subplan;
 	ListCell   *l;
 	int			i;
+	Relation	rel;
 
 	/* check for unsupported flags */
 	Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
@@ -1640,6 +1673,72 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
 
 	estate->es_result_relation_info = saved_resultRelInfo;
 
+	/* Build state for INSERT tuple routing */
+	rel = mtstate->resultRelInfo->ri_RelationDesc;
+	if (operation == CMD_INSERT &&
+		rel->rd_rel->relkind == RELKIND_PARTITIONED_REL)
+	{
+		int					i,
+							j,
+							num_leaf_parts;
+		List			   *leaf_part_oids;
+		ListCell		   *cell;
+		ResultRelInfo	   *leaf_rel_rri;
+
+		mtstate->mt_partition_node = RelationGetPartitionDescNode(rel);
+		leaf_part_oids = get_leaf_partition_oids(mtstate->mt_partition_node);
+		num_leaf_parts = list_length(leaf_part_oids);
+
+		mtstate->mt_num_partitions = num_leaf_parts;
+		mtstate->mt_partitions = (ResultRelInfo *)
+						palloc0(num_leaf_parts * sizeof(ResultRelInfo));
+		mtstate->mt_partition_tupconv_maps = (TupleConversionMap **)
+					palloc0(num_leaf_parts * sizeof(TupleConversionMap *));
+
+		leaf_rel_rri = mtstate->mt_partitions;
+		i = j = 0;
+		foreach(cell, leaf_part_oids)
+		{
+			Relation	leaf_rel;
+
+			leaf_rel = heap_open(lfirst_oid(cell), RowExclusiveLock);
+			InitResultRelInfo(leaf_rel_rri,
+							  leaf_rel,
+							  1,		/* dummy */
+							  false,	/* no need for partition checks */
+							  eflags);
+
+			/* Open partition indices (note: ON CONFLICT unsupported)*/
+			if (leaf_rel_rri->ri_RelationDesc->rd_rel->relhasindex &&
+				operation != CMD_DELETE &&
+				leaf_rel_rri->ri_IndexRelationDescs == NULL)
+				ExecOpenIndices(leaf_rel_rri, false);
+
+			if (leaf_rel_rri->ri_FdwRoutine)
+			{
+				/* As many fdw_private's in fdwPrivLists as FDW partitions */
+				List *fdw_private = (List *) list_nth(node->fdwPrivLists, j);
+
+				leaf_rel_rri->ri_FdwRoutine->BeginForeignModify(mtstate,
+																leaf_rel_rri,
+																fdw_private,
+																0,
+																eflags);
+				j++;
+			}
+
+			if (!equalTupleDescs(RelationGetDescr(rel),
+								 RelationGetDescr(leaf_rel)))
+				mtstate->mt_partition_tupconv_maps[i] =
+							convert_tuples_by_name(RelationGetDescr(rel),
+												   RelationGetDescr(leaf_rel),
+								  gettext_noop("could not convert row type"));
+
+			leaf_rel_rri++;
+			i++;
+		}
+	}
+
 	/*
 	 * Initialize any WITH CHECK OPTION constraints if needed.
 	 */
@@ -1957,6 +2056,23 @@ ExecEndModifyTable(ModifyTableState *node)
 														   resultRelInfo);
 	}
 
+	/* Close all partitions and indices thereof */
+	for (i = 0; i < node->mt_num_partitions; i++)
+	{
+		ResultRelInfo *resultRelInfo = node->mt_partitions + i;
+
+		ExecCloseIndices(resultRelInfo);
+		heap_close(resultRelInfo->ri_RelationDesc, NoLock);
+
+		if (resultRelInfo->ri_FdwRoutine &&
+			resultRelInfo->ri_FdwRoutine->EndForeignModify != NULL)
+			resultRelInfo->ri_FdwRoutine->EndForeignModify(node->ps.state,
+														   resultRelInfo);
+
+		if (node->mt_partition_tupconv_maps[i])
+			pfree(node->mt_partition_tupconv_maps[i]);
+	}
+
 	/*
 	 * Free the exprcontext
 	 */
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index 58bfd49..9e5b60e 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -6149,6 +6149,65 @@ make_modifytable(PlannerInfo *root,
 	node->fdwPrivLists = fdw_private_list;
 	node->fdwDirectModifyPlans = direct_modify_plans;
 
+	/* Collect insert plans for all FDW-managed partitions */
+	if (node->operation == CMD_INSERT)
+	{
+		RangeTblEntry  *rte,
+					  **saved_simple_rte_array;
+		List		   *partition_oids;
+
+		Assert(list_length(resultRelations) == 1);
+		rte = rt_fetch(linitial_int(resultRelations), root->parse->rtable);
+		Assert(rte->rtekind == RTE_RELATION);
+
+		if (!relid_is_partitioned(rte->relid))
+			return node;
+
+		partition_oids = get_leaf_partitions(rte->relid, NoLock);
+
+		/* Discard any previous content which is useless anyway */
+		fdw_private_list = NIL;
+
+		/* To force FDW driver fetch the intended RTE */
+		saved_simple_rte_array = root->simple_rte_array;
+		root->simple_rte_array = (RangeTblEntry **)
+										palloc0(2 * sizeof(RangeTblEntry *));
+		foreach(lc, partition_oids)
+		{
+			Oid		myoid = lfirst_oid(lc);
+			FdwRoutine *fdwroutine;
+			List	   *fdw_private;
+
+			if (!oid_is_foreign_table(myoid))
+				continue;
+
+			fdwroutine = GetFdwRoutineByRelId(myoid);
+			if (fdwroutine && fdwroutine->PlanForeignModify)
+			{
+				RangeTblEntry *fdw_rte;
+
+				fdw_rte = copyObject(rte);
+				fdw_rte->relid = myoid;
+				fdw_rte->relkind = RELKIND_FOREIGN_TABLE;
+
+				/* Assumes PlanForeignModify() uses planner_rt_fetch(). */
+				root->simple_rte_array[1] = fdw_rte;
+
+				fdw_private = fdwroutine->PlanForeignModify(root, node, 1, 0);
+				pfree(fdw_rte);
+			}
+			else
+				fdw_private = NIL;
+
+			fdw_private_list = lappend(fdw_private_list, fdw_private);
+		}
+
+		pfree(root->simple_rte_array);
+		root->simple_rte_array = saved_simple_rte_array;
+
+		node->fdwPrivLists = fdw_private_list;
+	}
+
 	return node;
 }
 
diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c
index 9314074..95aca08 100644
--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@@ -1690,3 +1690,16 @@ has_row_triggers(PlannerInfo *root, Index rti, CmdType event)
 	heap_close(relation, NoLock);
 	return result;
 }
+
+bool
+oid_is_foreign_table(Oid relid)
+{
+	Relation	rel;
+	char		relkind;
+
+	rel = heap_open(relid, NoLock);
+	relkind = rel->rd_rel->relkind;
+	heap_close(rel, NoLock);
+
+	return relkind == RELKIND_FOREIGN_TABLE;
+}
diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c
index b4fca37..6996666 100644
--- a/src/backend/parser/analyze.c
+++ b/src/backend/parser/analyze.c
@@ -777,8 +777,16 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt)
 
 	/* Process ON CONFLICT, if any. */
 	if (stmt->onConflictClause)
+	{
+		/* Bail out if target relation is partitioned table */
+		if (relid_is_partitioned(pstate->p_target_rangetblentry->relid))
+			ereport(ERROR,
+					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+					 errmsg("ON CONFLICT clause is not supported with partitioned tables")));
+
 		qry->onConflict = transformOnConflictClause(pstate,
 													stmt->onConflictClause);
+	}
 
 	/*
 	 * If we have a RETURNING clause, we need to add the target relation to
diff --git a/src/include/catalog/partition.h b/src/include/catalog/partition.h
index c1120e3..98e5dc0 100644
--- a/src/include/catalog/partition.h
+++ b/src/include/catalog/partition.h
@@ -14,6 +14,8 @@
 #define PARTITION_H
 
 #include "fmgr.h"
+#include "executor/tuptable.h"
+#include "nodes/execnodes.h"
 #include "parser/parse_node.h"
 #include "utils/relcache.h"
 
@@ -86,6 +88,22 @@ typedef struct PartitionDescData
 typedef struct PartitionDescData *PartitionDesc;
 
 /*
+ * PartitionKeyExecInfo
+ *
+ *		This struct holds the information needed to extract partition
+ *		column values from a heap tuple.
+ *
+ *		Key					copy of the rd_partkey of rel
+ *		ExpressionState		exec state for expressions, or NIL if none
+ */
+typedef struct PartitionKeyExecInfo
+{
+	NodeTag			type;
+	PartitionKey	pi_Key;
+	List		   *pi_ExpressionState;	/* list of ExprState */
+} PartitionKeyExecInfo;
+
+/*
  * Partition tree node
  *
  *	pkinfo					PartitionKey executor state
@@ -101,6 +119,7 @@ typedef struct PartitionDescData *PartitionDesc;
  */
 typedef struct PartitionDescNodeData
 {
+	PartitionKeyExecInfo *pkinfo;
 	PartitionDesc		pdesc;
 	Oid					relid;
 	int					index;
@@ -128,6 +147,7 @@ extern void StorePartitionBound(Oid relid, Oid parentId, Node *bound);
 extern void RemovePartitionEntryByRelId(Oid relid);
 extern bool relid_is_partition(Oid relid);
 extern Oid get_partition_parent(Oid relid);
+extern List *get_leaf_partitions(Oid relid, int lockmode);
 extern void RelationDropPartitions(Oid relid);
 extern PartitionInfo **get_partition_info(Relation rel, int *nparts);
 extern void free_partition_info(PartitionInfo **p, int num);
@@ -139,4 +159,9 @@ extern List *get_check_qual_from_partbound(Relation rel, Relation parent,
 										   Node *bound);
 extern List *get_leaf_partition_oids(PartitionDescNode pdnode);
 extern PartitionDescNode RelationGetPartitionDescNode(Relation rel);
+
+extern int get_partition_for_tuple(PartitionDescNode pdnode,
+					TupleTableSlot *slot,
+					EState *estate,
+					Oid *failed_at);
 #endif   /* PARTITION_H */
diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h
index 39521ed..c3b0f6d 100644
--- a/src/include/executor/executor.h
+++ b/src/include/executor/executor.h
@@ -14,6 +14,7 @@
 #ifndef EXECUTOR_H
 #define EXECUTOR_H
 
+#include "catalog/partition.h"
 #include "executor/execdesc.h"
 #include "nodes/parsenodes.h"
 
@@ -188,6 +189,7 @@ extern void CheckValidResultRel(Relation resultRel, CmdType operation);
 extern void InitResultRelInfo(ResultRelInfo *resultRelInfo,
 				  Relation resultRelationDesc,
 				  Index resultRelationIndex,
+				  bool load_partition_check,
 				  int instrument_options);
 extern ResultRelInfo *ExecGetTriggerResultRel(EState *estate, Oid relid);
 extern bool ExecContextForcesOids(PlanState *planstate, bool *hasoids);
@@ -211,6 +213,10 @@ extern void EvalPlanQualSetPlan(EPQState *epqstate,
 extern void EvalPlanQualSetTuple(EPQState *epqstate, Index rti,
 					 HeapTuple tuple);
 extern HeapTuple EvalPlanQualGetTuple(EPQState *epqstate, Index rti);
+extern int ExecFindPartition(ResultRelInfo *resultRelInfo,
+				  PartitionDescNode pdnode,
+				  TupleTableSlot *slot,
+				  EState *estate);
 
 #define EvalPlanQualSetSlot(epqstate, slot)  ((epqstate)->origslot = (slot))
 extern void EvalPlanQualFetchRowMarks(EPQState *epqstate);
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 64af12c..e360616 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -16,6 +16,7 @@
 
 #include "access/genam.h"
 #include "access/heapam.h"
+#include "access/tupconvert.h"
 #include "executor/instrument.h"
 #include "lib/pairingheap.h"
 #include "nodes/params.h"
@@ -1140,6 +1141,15 @@ typedef struct ModifyTableState
 										 * tlist  */
 	TupleTableSlot *mt_conflproj;		/* CONFLICT ... SET ... projection
 										 * target */
+	struct PartitionDescNodeData *mt_partition_node;
+										/* Partition descriptor node tree */
+	ResultRelInfo  *mt_partitions;		/* Per leaf partition target
+										 * relations */
+	TupleConversionMap **mt_partition_tupconv_maps;
+										/* Per leaf partition
+										 * tuple conversion map */
+	int				mt_num_partitions;	/* Number of leaf partition target
+										 * relations in the above array */
 } ModifyTableState;
 
 /* ----------------
diff --git a/src/include/optimizer/plancat.h b/src/include/optimizer/plancat.h
index 125274e..fac606c 100644
--- a/src/include/optimizer/plancat.h
+++ b/src/include/optimizer/plancat.h
@@ -56,5 +56,6 @@ extern Selectivity join_selectivity(PlannerInfo *root,
 				 SpecialJoinInfo *sjinfo);
 
 extern bool has_row_triggers(PlannerInfo *root, Index rti, CmdType event);
+extern bool oid_is_foreign_table(Oid relid);
 
 #endif   /* PLANCAT_H */
-- 
1.7.1

