Re: Moving delta data faster

From: yudhi s <learnerdatabase99(at)gmail(dot)com>
To: veem v <veema0000(at)gmail(dot)com>, Adrian Klaver <adrian(dot)klaver(at)aklaver(dot)com>, Greg Sabino Mullane <htamfids(at)gmail(dot)com>, pgsql-general <pgsql-general(at)lists(dot)postgresql(dot)org>
Subject: Re: Moving delta data faster
Date: 2024-04-06 15:47:45
Message-ID: CAEzWdqfww7aUkE+xpXXBM9eTkif1NxE_nGxeHsYPv+8-FY4pmQ@mail.gmail.com
Lists: pgsql-general

Thank you Adrian, Greg and Veem.

I tried writing a small routine to see how the performance differs across these
four approaches, i.e. upsert (INSERT ... ON CONFLICT) vs. traditional
update+insert vs. MERGE vs. truncate+load.

Initially I was expecting the upsert to perform about the same as MERGE, since
the logic looks similar, but it turned out to be the worst performer of the
four; I am not sure why and have yet to find the reason. Truncate+load seems
to be the best performer of all. I hope I am doing this correctly -- please
correct me if I'm wrong.
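
As a sanity check of the measurement method, each block can also be timed from
psql with client-side timing turned on; a rough sketch follows (the file names
are just placeholders for the four test scripts). Note the DO blocks end with
ROLLBACK, so they have to run at the top level, not inside an explicit
BEGIN ... COMMIT.

-- psql sketch (hypothetical file names): client-side timing as a
-- cross-check of the clock_timestamp() figures printed by the DO blocks.
\timing on
\i upsert_testcase.sql
\i insert_update_testcase.sql
\i merge_testcase.sql
\i truncate_load_testcase.sql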

UPSERT approach execution time: 00:00:20.921343
UPSERT approach rows: 1000000

insert/update approach execution time: 00:00:15.53612
insert/update approach update rows: 500000
insert/update approach insert rows: 500000

MERGE approach execution time: 00:00:14.884623
MERGE approach rows: 1000000

truncate load approach execution time: 00:00:07.428826
truncate load rows: 1000000
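
To dig into why the upsert lags MERGE, one thing worth checking is the plan
and buffer usage of each statement; a rough sketch, using the same
source_tab/target_tab as in the scripts below:

-- Sketch: EXPLAIN (ANALYZE, BUFFERS) actually executes the statement, so
-- wrap each run in a transaction and roll it back to leave target_tab intact.
BEGIN;
EXPLAIN (ANALYZE, BUFFERS)
INSERT INTO target_tab (id, column1, column2)
SELECT id, column1, column2 FROM source_tab
ON CONFLICT (id) DO UPDATE
SET column1 = EXCLUDED.column1,
    column2 = EXCLUDED.column2;
ROLLBACK;

BEGIN;
EXPLAIN (ANALYZE, BUFFERS)
MERGE INTO target_tab t
USING source_tab s ON t.id = s.id
WHEN MATCHED THEN
  UPDATE SET column1 = s.column1, column2 = s.column2
WHEN NOT MATCHED THEN
  INSERT (id, column1, column2) VALUES (s.id, s.column1, s.column2);
ROLLBACK;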

********* Routine ********

**************** UPSERT Testcase ***********
drop table source_tab;
drop table target_tab;

CREATE TABLE source_tab (
id SERIAL PRIMARY KEY,
column1 VARCHAR(100),
column2 VARCHAR(100)
);

-- Create target table
CREATE TABLE target_tab (
id SERIAL PRIMARY KEY,
column1 VARCHAR(100),
column2 VARCHAR(100)
);

INSERT INTO source_tab (column1, column2)
SELECT
'Value ' || i,
'Value ' || (i * 2)
FROM generate_series(1, 1000000) AS i;

INSERT INTO target_tab (column1, column2)
SELECT
'Value ' || i,
'Value ' || (i * 2)
FROM generate_series(1, 500000) AS i;

DO $$
DECLARE
start_time timestamp;
end_time timestamp;
rows_inserted integer:=0;
rows_updated integer:=0;
rows_upserted integer:=0;
rows_merged integer:=0;
BEGIN
-- Measure performance of UPSERT
start_time := clock_timestamp();
INSERT INTO target_tab (id, column1, column2)
SELECT id, column1, column2
FROM source_tab
ON CONFLICT (id) DO UPDATE
SET
column1 = EXCLUDED.column1,
column2 = EXCLUDED.column2;
get diagnostics rows_upserted=row_count;
end_time := clock_timestamp();
RAISE NOTICE 'UPSERT approach execution time: %', end_time - start_time;
RAISE NOTICE 'UPSERT approach rows: %', rows_upserted;

rollback;
END $$;

**************** Traditional Insert+update Testcase ***********
drop table source_tab;
drop table target_tab;

CREATE TABLE source_tab (
id SERIAL PRIMARY KEY,
column1 VARCHAR(100),
column2 VARCHAR(100)
);

-- Create target table
CREATE TABLE target_tab (
id SERIAL PRIMARY KEY,
column1 VARCHAR(100),
column2 VARCHAR(100)
);

INSERT INTO source_tab (column1, column2)
SELECT
'Value ' || i,
'Value ' || (i * 2)
FROM generate_series(1, 1000000) AS i;

INSERT INTO target_tab (column1, column2)
SELECT
'Value ' || i,
'Value ' || (i * 2)
FROM generate_series(1, 500000) AS i;

DO $$
DECLARE
start_time timestamp;
end_time timestamp;
rows_inserted integer:=0;
rows_updated integer:=0;
rows_upserted integer:=0;
rows_merged integer:=0;
BEGIN
-- Measure performance of insert/update approach
start_time := clock_timestamp();
-- Update existing records
UPDATE target_tab AS t
SET
column1 = s.column1,
column2 = s.column2
FROM source_tab AS s
WHERE t.id = s.id;
get diagnostics rows_updated=row_count;

-- Insert new records
INSERT INTO target_tab (id, column1, column2)
SELECT s.id, s.column1, s.column2
FROM source_tab AS s
LEFT JOIN target_tab AS t ON s.id = t.id
WHERE t.id IS NULL;
get diagnostics rows_inserted=row_count;

end_time := clock_timestamp();
RAISE NOTICE 'insert/update approach execution time: %', end_time - start_time;
RAISE NOTICE 'insert/update approach update rows: %', rows_updated;
RAISE NOTICE 'insert/update approach insert rows: %', rows_inserted;

rollback;
END $$;

**************** MERGE Testcase ***********
drop table source_tab;
drop table target_tab;

CREATE TABLE source_tab (
id SERIAL PRIMARY KEY,
column1 VARCHAR(100),
column2 VARCHAR(100)
);

-- Create target table
CREATE TABLE target_tab (
id SERIAL PRIMARY KEY,
column1 VARCHAR(100),
column2 VARCHAR(100)
);

INSERT INTO source_tab (column1, column2)
SELECT
'Value ' || i,
'Value ' || (i * 2)
FROM generate_series(1, 1000000) AS i;

INSERT INTO target_tab (column1, column2)
SELECT
'Value ' || i,
'Value ' || (i * 2)
FROM generate_series(1, 500000) AS i;

DO $$
DECLARE
start_time timestamp;
end_time timestamp;
rows_inserted integer:=0;
rows_updated integer:=0;
rows_upserted integer:=0;
rows_merged integer:=0;
begin

start_time := clock_timestamp();

merge into target_tab t
using source_tab s
on t.id = s.id
when matched then
update set
column1 = s.column1,
column2 = s.column2
when not matched then
insert (id, column1, column2)
values (s.id, s.column1, s.column2);
get diagnostics rows_merged=row_count;

end_time := clock_timestamp();
RAISE NOTICE 'MERGE approach execution time: %', end_time - start_time;
RAISE NOTICE 'MERGE approach rows: %', rows_merged;

rollback;
END $$;

**************** Truncate+load Testcase ***********
drop table source_tab;
drop table target_tab;

CREATE TABLE source_tab (
id SERIAL PRIMARY KEY,
column1 VARCHAR(100),
column2 VARCHAR(100)
);

-- Create target table
CREATE TABLE target_tab (
id SERIAL PRIMARY KEY,
column1 VARCHAR(100),
column2 VARCHAR(100)
);

INSERT INTO source_tab (column1, column2)
SELECT
'Value ' || i,
'Value ' || (i * 2)
FROM generate_series(1, 1000000) AS i;

INSERT INTO target_tab (column1, column2)
SELECT
'Value ' || i,
'Value ' || (i * 2)
FROM generate_series(1, 500000) AS i;

DO $$
DECLARE
start_time timestamp;
end_time timestamp;
rows_inserted integer:=0;
rows_updated integer:=0;
rows_upserted integer:=0;
rows_truncate_loaded integer:=0;
begin

start_time := clock_timestamp();

truncate table target_tab;

INSERT INTO target_tab (id, column1, column2)
SELECT s.id, s.column1, s.column2
FROM source_tab AS s;
get diagnostics rows_truncate_loaded=row_count;

end_time := clock_timestamp();
RAISE NOTICE 'truncate load approach execution time: %', end_time - start_time;
RAISE NOTICE 'truncate load rows: %', rows_truncate_loaded;

rollback;
END $$;
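
For the real delta-load case (rather than this benchmark), the truncate+load
variant would presumably be wrapped in a single transaction so the swap is
atomic; TRUNCATE is transactional in PostgreSQL, though it takes an ACCESS
EXCLUSIVE lock and is not MVCC-safe, so sessions holding older snapshots can
still see an empty table. A minimal sketch:

-- Sketch: atomic truncate+reload of the target; concurrent access to
-- target_tab blocks until commit because of the ACCESS EXCLUSIVE lock.
BEGIN;
TRUNCATE TABLE target_tab;
INSERT INTO target_tab (id, column1, column2)
SELECT id, column1, column2 FROM source_tab;
COMMIT;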
