From 56241895578e0d16d66518b8470005779cd2132c Mon Sep 17 00:00:00 2001 From: Alexandra Wang Date: Tue, 14 Jan 2025 15:14:35 -0600 Subject: [PATCH] [WIP] Operator apporach: JSONB dot notation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement an operator-like function `jsonb_object_field_dot()` that accesses JSONB object fields one at a time. Array unwrapping (lax mode) is handled differently depending on whether the operation is the first in a chain of indirections. Similarly, conditional wrapping is handled differently depending on whether the operation is the last in that chain. TODO: - Currently, this commit might generate incorrect results when mixing dot notation access with existing subscripting access. Fixing this should not be difficult. However, performance comparisons with the alternative approach have led to postponing the fix. - Ideally, the function would use JsonPath or a pipelined chain of operators instead of handling each dot access individually—essentially what JsonPathQuery() does—leading to the alternative approach. Performance comparison with the generic subscripting approach: -- setup: create table tbl(id int, col1 jsonb); insert into tbl select i, '{"x":"vx", "y":[{"a":[1,2,3]}, {"b":[1, 2, {"j":"vj"}]}]}' from generate_series(1, 100000)i; -- query 1: chained dot access SELECT id, (col1).x.b.j AS jsonb_operator FROM tbl; -- pgbench -n -f .sql test -T100 This patch: number of transactions actually processed: 6620 latency average = 15.107 ms tps = 66.193017 (without initial connection time) Nikita's patch (generic subscripting using json_query()): number of transactions actually processed: 8036 latency average = 12.445 ms tps = 80.352075 (without initial connection time) It is expected that this patch is less performant because it currently deserializes and serializes the nested JSONB binary three times (once for each invocation of the operator-like function). -- query 2: single dot access SELECT id, (col1).x AS jsonb_operator FROM tbl; -- pgbench -n -f .sql test -T100 This patch: number of transactions actually processed: 5653 latency average = 17.691 ms tps = 56.524496 (without initial connection time) Nikita's patch: number of transactions actually processed: 4989 latency average = 20.044 ms tps = 49.890189 (without initial connection time) This patch performs slightly better for single dot accesses, though the margin is not significant. --- src/backend/parser/parse_expr.c | 33 +++- src/backend/parser/parse_func.c | 36 ++++ src/backend/utils/adt/jsonfuncs.c | 262 ++++++++++++++++++++++++++++ src/include/catalog/pg_operator.dat | 2 +- src/include/catalog/pg_proc.dat | 4 + src/include/parser/parse_func.h | 2 + src/test/regress/expected/jsonb.out | 103 +++++++++++ src/test/regress/sql/jsonb.sql | 47 +++++ 8 files changed, 481 insertions(+), 8 deletions(-) diff --git a/src/backend/parser/parse_expr.c b/src/backend/parser/parse_expr.c index bad1df732ea..e3b6626983b 100644 --- a/src/backend/parser/parse_expr.c +++ b/src/backend/parser/parse_expr.c @@ -440,6 +440,8 @@ transformIndirection(ParseState *pstate, A_Indirection *ind) List *subscripts = NIL; int location = exprLocation(result); ListCell *i; + bool json_accessor_chain_first = false; + bool json_accessor_chain_last = false; /* * We have to split any field-selection operations apart from @@ -462,6 +464,8 @@ transformIndirection(ParseState *pstate, A_Indirection *ind) else { Node *newresult; + Oid result_typid; + Oid result_basetypid; Assert(IsA(n, String)); @@ -475,13 +479,28 @@ transformIndirection(ParseState *pstate, A_Indirection *ind) false); subscripts = NIL; - newresult = ParseFuncOrColumn(pstate, - list_make1(n), - list_make1(result), - last_srf, - NULL, - false, - location); + result_typid = exprType(result); + result_basetypid = (result_typid == JSONOID || result_typid == JSONBOID) ? + result_typid : getBaseType(result_typid); + + if (result_basetypid == JSONBOID) + { + json_accessor_chain_first = (i == list_head(ind->indirection)); + if (lnext(ind->indirection, i) == NULL) + json_accessor_chain_last = true; + newresult = ParseJsonbSimplifiedAccessorObjectField(pstate, + strVal(n), + result, + location, result_basetypid, json_accessor_chain_first, json_accessor_chain_last); + } + else + newresult = ParseFuncOrColumn(pstate, + list_make1(n), + list_make1(result), + last_srf, + NULL, + false, + location); if (newresult == NULL) unknown_attribute(pstate, result, strVal(n), location); result = newresult; diff --git a/src/backend/parser/parse_func.c b/src/backend/parser/parse_func.c index 583bbbf232f..dba2a60fadc 100644 --- a/src/backend/parser/parse_func.c +++ b/src/backend/parser/parse_func.c @@ -1902,6 +1902,42 @@ FuncNameAsType(List *funcname) return result; } +/* + * ParseJsonbSimplifiedAccessorObjectField - + * handles function calls with a single argument that is of json type. + * If the function call is actually a column projection, return a suitably + * transformed expression tree. If not, return NULL. + */ +Node * +ParseJsonbSimplifiedAccessorObjectField(ParseState *pstate, const char *funcname, Node *first_arg, int location, + Oid basetypid, bool first_op, bool last_op) +{ + FuncExpr *result; + Node *rexpr; + + if (basetypid != JSONBOID) + elog(ERROR, "unsupported type OID: %u", basetypid); + + rexpr = (Node *) makeConst( + TEXTOID, + -1, + InvalidOid, + -1, + CStringGetTextDatum(funcname), + false, + false); + result = makeFuncExpr(4100, + JSONBOID, + list_make4(first_arg, rexpr, + makeBoolConst(first_op, false), + makeBoolConst(last_op, false)), + InvalidOid, + InvalidOid, + COERCE_EXPLICIT_CALL); + + return (Node *) result; +} + /* * ParseComplexProjection - * handles function calls with a single argument that is of complex type. diff --git a/src/backend/utils/adt/jsonfuncs.c b/src/backend/utils/adt/jsonfuncs.c index c2e90f1a3bf..a32aef3bfac 100644 --- a/src/backend/utils/adt/jsonfuncs.c +++ b/src/backend/utils/adt/jsonfuncs.c @@ -879,6 +879,268 @@ jsonb_object_field(PG_FUNCTION_ARGS) PG_RETURN_NULL(); } +static void +expand_jbvBinary_in_memory(const JsonbValue *binaryVal, JsonbValue *expandedVal) +{ + Assert(binaryVal->type == jbvBinary); + JsonbContainer *container = (JsonbContainer *) binaryVal->val.binary.data; + JsonbIterator *it; + JsonbValue v; + JsonbIteratorToken token; + + it = JsonbIteratorInit(container); + token = JsonbIteratorNext(&it, &v, false); + + if (token == WJB_BEGIN_OBJECT) + { + /* + * We'll read all keys/values until WJB_END_OBJECT and build + * expandedVal->type = jbvObject. + */ + List *keyvals = NIL; + + while ((token = JsonbIteratorNext(&it, &v, false)) != WJB_END_OBJECT) + { + if (token == WJB_KEY) + { + JsonbValue key; + + key = v; + token = JsonbIteratorNext(&it, &v, true); + if (token == WJB_VALUE) + { + JsonbValue val; + JsonbValue *objPair; + + if (v.type == jbvBinary) + { + /* Recursively expand nested objects/arrays. */ + expand_jbvBinary_in_memory(&v, &val); + } + else + { + /* Scalar or already jbvObject/jbvArray. Copy as-is. */ + val = v; + } + + /* + * Build a small pair (key, value). We'll store them in a + * list. + */ + objPair = palloc(sizeof(JsonbValue) * 2); + objPair[0] = key; + objPair[1] = val; + keyvals = lappend(keyvals, objPair); + } + } + } + + /* Now convert our keyvals list into a jbvObject. */ + int nPairs = list_length(keyvals); + + expandedVal->type = jbvObject; + expandedVal->val.object.nPairs = nPairs; + expandedVal->val.object.pairs = palloc(sizeof(JsonbPair) * nPairs); + + { + int i = 0; + ListCell *lc; + + foreach(lc, keyvals) + { + JsonbValue *kv = (JsonbValue *) lfirst(lc); + + /* kv[0] = key, kv[1] = value */ + + expandedVal->val.object.pairs[i].key = kv[0]; + expandedVal->val.object.pairs[i].value = kv[1]; + expandedVal->val.object.pairs[i].order = i; + i++; + } + } + } + else if (token == WJB_BEGIN_ARRAY) + { + /* + * We'll read array elems until WJB_END_ARRAY and build + * expandedVal->type = jbvArray. + */ + List *elems = NIL; + + while ((token = JsonbIteratorNext(&it, &v, true)) != WJB_END_ARRAY) + { + if (token == WJB_ELEM) + { + /* If it's jbvBinary, recursively expand again. */ + JsonbValue val; + + if (v.type == jbvBinary) + { + expand_jbvBinary_in_memory(&v, &val); + } + else + { + val = v; + } + JsonbValue *elemCopy = palloc(sizeof(JsonbValue)); + + *elemCopy = val; + elems = lappend(elems, elemCopy); + } + } + + expandedVal->type = jbvArray; + expandedVal->val.array.nElems = list_length(elems); + expandedVal->val.array.rawScalar = false; + expandedVal->val.array.elems = palloc(sizeof(JsonbValue) * expandedVal->val.array.nElems); + + { + int i = 0; + ListCell *lc; + + foreach(lc, elems) + { + JsonbValue *vptr = (JsonbValue *) lfirst(lc); + + expandedVal->val.array.elems[i++] = *vptr; + } + } + } + else + { + /* + * Possibly a scalar container (WJB_ELEM or WJB_VALUE with jbvNumeric, + * jbvString, etc.). If this container truly only has one scalar, + * token might be WJB_ELEM or similar. For simplicity, let's check + * tmp.type. If it's jbvString/jbvNumeric, copy it directly. + */ + expandedVal->type = v.type; + expandedVal->val = v.val; + } +} + +static List * +jsonb_object_field_unwrap_array(JsonbContainer *jb, text *key, bool unwrap_nested) +{ + JsonbIterator *it; + JsonbValue v; + JsonbIteratorToken token; + List *resultList = NIL; + int arraySize; + + it = JsonbIteratorInit(jb); + token = JsonbIteratorNext(&it, &v, false); + + Assert(token == WJB_BEGIN_ARRAY); + arraySize = v.val.array.nElems; + + /* Unwrap out-most array elements and extract the key value */ + for (int i = 0; i < arraySize; i++) + { + token = JsonbIteratorNext(&it, &v, true); + if (token == WJB_ELEM && v.type == jbvBinary) + { + JsonbContainer *elemContainer = (JsonbContainer *) v.val.binary.data; + + if (JsonContainerIsObject(elemContainer)) + { + JsonbValue *extractedValue; + JsonbValue vbuf; + + extractedValue = getKeyJsonValueFromContainer(elemContainer, + VARDATA_ANY(key), + VARSIZE_ANY_EXHDR(key), + &vbuf); + if (extractedValue != NULL) + { + JsonbValue *copy; + + if (extractedValue->type == jbvBinary) + { + JsonbValue expanded; + + expand_jbvBinary_in_memory(extractedValue, &expanded); + + copy = palloc(sizeof(expanded)); + *copy = expanded; + resultList = lappend(resultList, copy); + } + else + { + copy = palloc(sizeof(*extractedValue)); + *copy = *extractedValue; + resultList = lappend(resultList, copy); + } + } + } + else if (unwrap_nested && JsonContainerIsArray(elemContainer)) + { + resultList = jsonb_object_field_unwrap_array(elemContainer, key, false); + } + } + } + token = JsonbIteratorNext(&it, &v, true); + + return resultList; +} + +Datum +jsonb_object_field_dot(PG_FUNCTION_ARGS) +{ + Jsonb *jb = PG_GETARG_JSONB_P(0); + text *key = PG_GETARG_TEXT_PP(1); + bool first_op = PG_GETARG_BOOL(2); + bool last_op = PG_GETARG_BOOL(3); + + if (JB_ROOT_IS_OBJECT(jb)) + { + JsonbValue *v; + JsonbValue vbuf; + + v = getKeyJsonValueFromContainer(&jb->root, + VARDATA_ANY(key), + VARSIZE_ANY_EXHDR(key), + &vbuf); + + if (v != NULL) + PG_RETURN_JSONB_P(JsonbValueToJsonb(v)); + } + else if (JB_ROOT_IS_ARRAY(jb)) + { + List *resultList; + JsonbValue resultVal; + + resultList = jsonb_object_field_unwrap_array(&jb->root, key, !first_op); + + if (list_length(resultList) == 0) + PG_RETURN_NULL(); + else if (!last_op || list_length(resultList) > 1) + { + /* Conditional wrap result */ + resultVal.type = jbvArray; + resultVal.val.array.rawScalar = false; + resultVal.val.array.nElems = list_length(resultList); + resultVal.val.array.elems = (JsonbValue *) palloc(sizeof(JsonbValue) * list_length(resultList)); + + int idx = 0; + ListCell *lc; + JsonbValue *elem; + + foreach(lc, resultList) + { + elem = (JsonbValue *) lfirst(lc); + resultVal.val.array.elems[idx++] = *elem; + } + } + else + resultVal = *(JsonbValue *) linitial(resultList); + + PG_RETURN_JSONB_P(JsonbValueToJsonb(&resultVal)); + } + + PG_RETURN_NULL(); +} + Datum json_object_field_text(PG_FUNCTION_ARGS) { diff --git a/src/include/catalog/pg_operator.dat b/src/include/catalog/pg_operator.dat index 6d9dc1528d6..70887d3fd4d 100644 --- a/src/include/catalog/pg_operator.dat +++ b/src/include/catalog/pg_operator.dat @@ -3173,7 +3173,7 @@ { oid => '3967', descr => 'get value from json as text with path elements', oprname => '#>>', oprleft => 'json', oprright => '_text', oprresult => 'text', oprcode => 'json_extract_path_text' }, -{ oid => '3211', descr => 'get jsonb object field', +{ oid => '3211', oid_symbol => 'OID_JSONB_OBJECT_FIELD_OP', descr => 'get jsonb object field', oprname => '->', oprleft => 'jsonb', oprright => 'text', oprresult => 'jsonb', oprcode => 'jsonb_object_field' }, { oid => '3477', descr => 'get jsonb object field as text', diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 5b8c2ad2a54..cabde2e07e8 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -9338,6 +9338,10 @@ { oid => '3968', descr => 'get the type of a json value', proname => 'json_typeof', prorettype => 'text', proargtypes => 'json', prosrc => 'json_typeof' }, +{ oid => '4100', + proname => 'jsonb_object_field_dot', prorettype => 'jsonb', + proargtypes => 'jsonb text bool bool', proargnames => '{from_json, field_name, first_op, last_op}', + prosrc => 'jsonb_object_field_dot' }, # uuid { oid => '2952', descr => 'I/O', diff --git a/src/include/parser/parse_func.h b/src/include/parser/parse_func.h index a6f24b83d84..745d2c75c93 100644 --- a/src/include/parser/parse_func.h +++ b/src/include/parser/parse_func.h @@ -34,6 +34,8 @@ typedef enum extern Node *ParseFuncOrColumn(ParseState *pstate, List *funcname, List *fargs, Node *last_srf, FuncCall *fn, bool proc_call, int location); +extern Node *ParseJsonbSimplifiedAccessorObjectField(ParseState *pstate, const char *funcname, Node *first_arg, int location, + Oid basetypid, bool first_op, bool last_op); extern FuncDetailCode func_get_detail(List *funcname, List *fargs, List *fargnames, diff --git a/src/test/regress/expected/jsonb.out b/src/test/regress/expected/jsonb.out index 2baff931bf2..e0c0757f475 100644 --- a/src/test/regress/expected/jsonb.out +++ b/src/test/regress/expected/jsonb.out @@ -5781,3 +5781,106 @@ select '12345.0000000000000000000000000000000000000000000005'::jsonb::int8; 12345 (1 row) +-- simple dot notation +-- TODO: add comments +CREATE OR REPLACE FUNCTION test_jsonb_dot_notation( + vep jsonb, -- value expression primary + jc text -- JSON simplified accessor operator chain +) + RETURNS TABLE(dot_access jsonb, expected jsonb) + LANGUAGE plpgsql +AS $$ +DECLARE + dyn_sql text; +BEGIN + dyn_sql := format($f$ + SELECT + (vep).%s AS dot_access, + json_query(vep, 'lax $.%s' WITH CONDITIONAL WRAPPER NULL ON EMPTY NULL ON ERROR) AS expected + FROM (SELECT $1::jsonb AS vep) dummy + $f$, jc, jc); + + -- -- OPTIONAL: Just to see the constructed SQL in logs +-- RAISE NOTICE 'Executing: %', dyn_sql; + + -- Execute the dynamic query, substituting p_col as parameter #1 + RETURN QUERY EXECUTE dyn_sql USING vep; +END; +$$; +-- access member object field of a json object +select * from test_jsonb_dot_notation('{"a": 1, "b": 42}'::jsonb, 'b'); + dot_access | expected +------------+---------- + 42 | 42 +(1 row) + +select * from test_jsonb_dot_notation('{"a": 1, "b": 42}'::jsonb, 'not_exist'); + dot_access | expected +------------+---------- + | +(1 row) + +select * from test_jsonb_dot_notation('{"a": 1, "b": 42, "b":12}'::jsonb, 'b'); -- return last for duplicate key + dot_access | expected +------------+---------- + 12 | 12 +(1 row) + +select * from test_jsonb_dot_notation('{"a": 1, "b": 12, "b":42}'::jsonb, 'b'); + dot_access | expected +------------+---------- + 42 | 42 +(1 row) + +select * from test_jsonb_dot_notation('{"a": 2, "b": {"c": 42}}'::jsonb, 'b.c'); + dot_access | expected +------------+---------- + 42 | 42 +(1 row) + +select * from test_jsonb_dot_notation('{"a": 4, "b": {"c": {"d": [11, 12]}}}'::jsonb, 'b.c.d'); + dot_access | expected +------------+---------- + [11, 12] | [11, 12] +(1 row) + +-- access member object field of a json array: apply lax mode + conditional wrap +-- unwrap the outer-most array into sequence and conditional wrap the results +-- only unwrap the outer most array +select * from test_jsonb_dot_notation('[{"x": 42}]'::jsonb, 'x'); + dot_access | expected +------------+---------- + 42 | 42 +(1 row) + +select * from test_jsonb_dot_notation('["x"]'::jsonb, 'x'); + dot_access | expected +------------+---------- + | +(1 row) + +select * from test_jsonb_dot_notation('[[{"x": 42}]]'::jsonb, 'x'); + dot_access | expected +------------+---------- + | +(1 row) + +-- wrap the result into an array on the conditional of more than one matched object keys +select * from test_jsonb_dot_notation('[{"x": 42}, {"x": {"y": {"z": 12}}}]'::jsonb, 'x'); + dot_access | expected +------------------------+------------------------ + [42, {"y": {"z": 12}}] | [42, {"y": {"z": 12}}] +(1 row) + +select * from test_jsonb_dot_notation('[{"x": 42}, [{"x": {"y": {"z": 12}}}]]'::jsonb, 'x'); + dot_access | expected +------------+---------- + 42 | 42 +(1 row) + +select * from test_jsonb_dot_notation('[{"x": 42}, {"x": [{"y": 12}, {"y": {"z": 12}}]}]'::jsonb, 'x.y'); + dot_access | expected +-----------------+----------------- + [12, {"z": 12}] | [12, {"z": 12}] +(1 row) + diff --git a/src/test/regress/sql/jsonb.sql b/src/test/regress/sql/jsonb.sql index 544bb610e2d..5a6d820b507 100644 --- a/src/test/regress/sql/jsonb.sql +++ b/src/test/regress/sql/jsonb.sql @@ -1572,3 +1572,50 @@ select '12345.0000000000000000000000000000000000000000000005'::jsonb::float8; select '12345.0000000000000000000000000000000000000000000005'::jsonb::int2; select '12345.0000000000000000000000000000000000000000000005'::jsonb::int4; select '12345.0000000000000000000000000000000000000000000005'::jsonb::int8; + +-- simple dot notation +-- TODO: add comments + +CREATE OR REPLACE FUNCTION test_jsonb_dot_notation( + vep jsonb, -- value expression primary + jc text -- JSON simplified accessor operator chain +) + RETURNS TABLE(dot_access jsonb, expected jsonb) + LANGUAGE plpgsql +AS $$ +DECLARE + dyn_sql text; +BEGIN + dyn_sql := format($f$ + SELECT + (vep).%s AS dot_access, + json_query(vep, 'lax $.%s' WITH CONDITIONAL WRAPPER NULL ON EMPTY NULL ON ERROR) AS expected + FROM (SELECT $1::jsonb AS vep) dummy + $f$, jc, jc); + + -- -- OPTIONAL: Just to see the constructed SQL in logs +-- RAISE NOTICE 'Executing: %', dyn_sql; + + -- Execute the dynamic query, substituting p_col as parameter #1 + RETURN QUERY EXECUTE dyn_sql USING vep; +END; +$$; + +-- access member object field of a json object +select * from test_jsonb_dot_notation('{"a": 1, "b": 42}'::jsonb, 'b'); +select * from test_jsonb_dot_notation('{"a": 1, "b": 42}'::jsonb, 'not_exist'); +select * from test_jsonb_dot_notation('{"a": 1, "b": 42, "b":12}'::jsonb, 'b'); -- return last for duplicate key +select * from test_jsonb_dot_notation('{"a": 1, "b": 12, "b":42}'::jsonb, 'b'); +select * from test_jsonb_dot_notation('{"a": 2, "b": {"c": 42}}'::jsonb, 'b.c'); +select * from test_jsonb_dot_notation('{"a": 4, "b": {"c": {"d": [11, 12]}}}'::jsonb, 'b.c.d'); + +-- access member object field of a json array: apply lax mode + conditional wrap +-- unwrap the outer-most array into sequence and conditional wrap the results +-- only unwrap the outer most array +select * from test_jsonb_dot_notation('[{"x": 42}]'::jsonb, 'x'); +select * from test_jsonb_dot_notation('["x"]'::jsonb, 'x'); +select * from test_jsonb_dot_notation('[[{"x": 42}]]'::jsonb, 'x'); +-- wrap the result into an array on the conditional of more than one matched object keys +select * from test_jsonb_dot_notation('[{"x": 42}, {"x": {"y": {"z": 12}}}]'::jsonb, 'x'); +select * from test_jsonb_dot_notation('[{"x": 42}, [{"x": {"y": {"z": 12}}}]]'::jsonb, 'x'); +select * from test_jsonb_dot_notation('[{"x": 42}, {"x": [{"y": 12}, {"y": {"z": 12}}]}]'::jsonb, 'x.y'); -- 2.39.5 (Apple Git-154)