CREATE TABLE pd.d_sample ( unit_id smallint NOT NULL, project_id integer NOT NULL, subject_id text NOT NULL, ref_date text NOT NULL, dep_var double precision, ind_vars double precision[], grad_vars character varying[], trn_vars double precision[], sample smallint, score double precision, tree_bag bit varying, scores double precision[], CONSTRAINT d_sample_pk PRIMARY KEY (unit_id, project_id, subject_id, ref_date) ); CREATE OR REPLACE FUNCTION dt.c_create_tree( p_unit_id smallint, p_project_id integer, p_permutation_id smallint, p_tree_ord smallint, p_samples smallint[], p_in_forest boolean, p_criteria smallint[], p_one_node_per_crit boolean, p_min_population integer, p_max_level integer, p_min_ar double precision) RETURNS void AS $BODY$ DECLARE v_tree_nodes dt.tree_node[]; BEGIN -- Grow the tree v_tree_nodes := dt.p_create_node( p_unit_id, p_project_id, p_samples[1], p_tree_ord, p_in_forest, p_criteria, p_one_node_per_crit, p_min_population, p_max_level, p_min_ar, NULL, NULL, NULL, NULL, NULL ); -- Store it in the table INSERT INTO pd.d_tree ( unit_id, project_id, permutation_id, tree_ord, ind_var_ids, nodes ) VALUES( p_unit_id, p_project_id, p_permutation_id, p_tree_ord, p_criteria, v_tree_nodes ) ON CONFLICT ON CONSTRAINT d_tree_pk DO UPDATE SET ind_var_ids = p_criteria, nodes = v_tree_nodes; -- Calculate AR of training sample PERFORM dt.p_evaluate_tree( p_unit_id, p_project_id, p_permutation_id, p_tree_ord, p_samples, p_in_forest, v_tree_nodes ); END; $BODY$ LANGUAGE plpgsql VOLATILE COST 100; CREATE OR REPLACE FUNCTION dt.p_create_node( IN p_unit_id smallint, IN p_project_id integer, IN p_sample smallint, IN p_tree_ord integer, IN p_in_forest boolean, IN p_criteria smallint[], IN p_one_node_per_crit boolean, IN p_min_population integer, IN p_max_level integer, IN p_min_ar double precision, IN p_crit_ord smallint, IN p_bound double precision, IN p_id text, IN p_view text, INOUT p_tree_nodes dt.tree_node[]) RETURNS dt.tree_node[] AS $BODY$ DECLARE r record; v_id text; v_view text; v_node_view text; v_bins pd.crit_bin[]; v_nodes pd.crit_node[]; v_bounds pd.crit_bound[]; v_ar float; v_best_ord smallint; v_best_ar float := -1; v_best_bounds pd.crit_bound[]; v_best_cont boolean; v_bound pd.crit_bound; v_node dt.tree_node; v_df float; v_obs integer; v_def float; BEGIN -- If new tree initialize the text of the view IF p_view IS NULL THEN v_view := 'CREATE OR REPLACE TEMP VIEW sample AS SELECT ind_vars,dep_var FROM pd.d_sample ' || 'WHERE unit_id=' || p_unit_id || ' AND project_id=' || p_project_id || ' AND sample =' || p_sample; IF p_in_forest THEN -- In case a tree of a random forest v_view := v_view || ' AND get_bit(tree_bag,' || (p_tree_ord - 1) || ')=1'; END IF; v_id := '_'; p_tree_nodes := ARRAY[]::dt.tree_node[]; ELSE v_view := p_view; v_id := p_id; END IF; -- Create the view EXECUTE v_view; -- Get number of observations and defaults of the view SELECT count(*), sum(dep_var) INTO v_obs, v_def FROM sample; -- Build a node v_node := (v_id, p_bound, v_obs, v_def, NULL, NULL, NULL)::dt.tree_node; -- Check if max level is reached or if no defaults exist IF (char_length(v_id) - 1) / 2 >= p_max_level OR v_def <= 0.5 THEN p_tree_nodes := p_tree_nodes || v_node; RETURN; END IF; v_df := v_def / v_obs; -- Check for the best criterion to split FOR r IN SELECT ord, (discrete_values IS NULL OR monotone) AS continuous, monotone, dir, min_ar, nan_behavior, null_behavior, max_p_value, max_level FROM pd.d_criterion WHERE unit_id = p_unit_id AND project_id = p_project_id AND ord = ANY(p_criteria) ORDER BY ord LOOP IF r.continuous THEN -- Continuous variables SELECT * FROM dt.p_bin_continuous(r.ord, r.nan_behavior, r.null_behavior, p_min_population, r.max_p_value, r.max_level, r.monotone, r.dir, v_df) INTO v_bins, v_bounds, v_nodes; ELSE -- Discrete variables SELECT * FROM dt.p_bin_discrete(r.ord, p_min_population, r.max_p_value, v_df) INTO v_bins, v_bounds, v_nodes; END IF; -- Check if no split CONTINUE WHEN v_bins IS NULL OR array_length(v_bins, 1) = 1; -- Calculate AR v_ar := pd.p_calc_ar(v_bins); IF v_ar > v_best_ar THEN v_best_ord := r.ord; v_best_bounds := v_bounds; v_best_ar := v_ar; v_best_cont := r.continuous; END IF; END LOOP; -- If no AR improvement then don't split IF v_best_bounds IS NULL OR array_length(v_best_bounds, 1) <= 1 OR v_best_ar <= p_min_ar THEN p_tree_nodes := p_tree_nodes || v_node; RETURN; END IF; -- Set the node child_ord and ar v_node.child_ord := v_best_ord; v_node.child_discr := NOT v_best_cont; v_node.ar := v_best_ar; -- Add the node p_tree_nodes := p_tree_nodes || v_node; -- Build children nodes FOR i IN 1..array_length(v_best_bounds, 1) LOOP v_bound := v_best_bounds[i]; v_node_view := v_view || ' AND ind_vars[' || v_best_ord || ']' || CASE WHEN (v_bound).bound IS NULL THEN ' IS NULL' WHEN (v_bound).bound = 'NaN'::float THEN '=''NaN''::float' WHEN NOT v_best_cont THEN '=' || (v_bound).bound WHEN i = 1 THEN '<=' || CASE WHEN (v_bound).bound = 'Infinity'::float THEN '''Infinity''::float' ELSE (v_bound).bound::text END WHEN (v_bound).bound = 'Infinity'::float THEN '>' || (v_best_bounds[i - 1]).bound ELSE '>' || (v_best_bounds[i - 1]).bound || ' AND ind_vars[' || v_best_ord || ']<=' || (v_bound).bound END; p_tree_nodes := dt.p_create_node( p_unit_id, p_project_id, p_sample, p_tree_ord, p_in_forest, CASE WHEN p_one_node_per_crit THEN array_remove(p_criteria, v_best_ord) ELSE p_criteria END, p_one_node_per_crit, p_min_population, p_max_level, p_min_ar, v_best_ord, (v_best_bounds[i]).bound, v_id || to_char(i, 'FM00'), v_node_view, p_tree_nodes ); END LOOP; END; $BODY$ LANGUAGE plpgsql VOLATILE COST 100;