Re: Performance of Query 4 on TPC-DS Benchmark

From: Alena Rybakina <a(dot)rybakina(at)postgrespro(dot)ru>
To: Ba Jinsheng <bajinsheng(at)u(dot)nus(dot)edu>, "pgsql-performance(at)lists(dot)postgresql(dot)org" <pgsql-performance(at)lists(dot)postgresql(dot)org>, "lepihov(at)gmail(dot)com" <lepihov(at)gmail(dot)com>
Subject: Re: Performance of Query 4 on TPC-DS Benchmark
Date: 2024-11-10 20:18:47
Message-ID: 8288344e-5cb7-4a02-bb30-f771e9cfde58@postgrespro.ru
Views: Raw Message | Whole Thread | Download mbox | Resend email
Thread:
Lists: pgsql-performance

On 10.11.2024 23:16, Alena Rybakina wrote:
>
> Hi!
>
> On 10.11.2024 22:35, Ba Jinsheng wrote:
>> Hi all,
>>
>> Please see this case:
>>
>>
>> Query 4 on TPC-DS benchmark:
>>
>> with year_total as (
>>  select c_customer_id customer_id
>>        ,c_first_name customer_first_name
>>        ,c_last_name customer_last_name
>>        ,c_preferred_cust_flag customer_preferred_cust_flag
>>        ,c_birth_country customer_birth_country
>>        ,c_login customer_login
>>        ,c_email_address customer_email_address
>>        ,d_year dyear
>>  ,sum(((ss_ext_list_price-ss_ext_wholesale_cost-ss_ext_discount_amt)+ss_ext_sales_price)/2)
>> year_total
>>        ,'s' sale_type
>>  from customer
>>      ,store_sales
>>      ,date_dim
>>  where c_customer_sk = ss_customer_sk
>>    and ss_sold_date_sk = d_date_sk
>>  group by c_customer_id
>>          ,c_first_name
>>          ,c_last_name
>>          ,c_preferred_cust_flag
>>          ,c_birth_country
>>          ,c_login
>>          ,c_email_address
>>          ,d_year
>>  union all
>>  select c_customer_id customer_id
>>        ,c_first_name customer_first_name
>>        ,c_last_name customer_last_name
>>        ,c_preferred_cust_flag customer_preferred_cust_flag
>>        ,c_birth_country customer_birth_country
>>        ,c_login customer_login
>>        ,c_email_address customer_email_address
>>        ,d_year dyear
>>  ,sum((((cs_ext_list_price-cs_ext_wholesale_cost-cs_ext_discount_amt)+cs_ext_sales_price)/2)
>> ) year_total
>>        ,'c' sale_type
>>  from customer
>>      ,catalog_sales
>>      ,date_dim
>>  where c_customer_sk = cs_bill_customer_sk
>>    and cs_sold_date_sk = d_date_sk
>>  group by c_customer_id
>>          ,c_first_name
>>          ,c_last_name
>>          ,c_preferred_cust_flag
>>          ,c_birth_country
>>          ,c_login
>>          ,c_email_address
>>          ,d_year
>> union all
>>  select c_customer_id customer_id
>>        ,c_first_name customer_first_name
>>        ,c_last_name customer_last_name
>>        ,c_preferred_cust_flag customer_preferred_cust_flag
>>        ,c_birth_country customer_birth_country
>>        ,c_login customer_login
>>        ,c_email_address customer_email_address
>>        ,d_year dyear
>>  ,sum((((ws_ext_list_price-ws_ext_wholesale_cost-ws_ext_discount_amt)+ws_ext_sales_price)/2)
>> ) year_total
>>        ,'w' sale_type
>>  from customer
>>      ,web_sales
>>      ,date_dim
>>  where c_customer_sk = ws_bill_customer_sk
>>    and ws_sold_date_sk = d_date_sk
>>  group by c_customer_id
>>          ,c_first_name
>>          ,c_last_name
>>          ,c_preferred_cust_flag
>>          ,c_birth_country
>>          ,c_login
>>          ,c_email_address
>>          ,d_year
>>          )
>>   select
>>                   t_s_secyear.customer_id
>>                  ,t_s_secyear.customer_first_name
>>                  ,t_s_secyear.customer_last_name
>>                  ,t_s_secyear.customer_email_address
>>  from year_total t_s_firstyear
>>      ,year_total t_s_secyear
>>      ,year_total t_c_firstyear
>>      ,year_total t_c_secyear
>>      ,year_total t_w_firstyear
>>      ,year_total t_w_secyear
>>  where t_s_secyear.customer_id = t_s_firstyear.customer_id
>>    and t_s_firstyear.customer_id = t_c_secyear.customer_id
>>    and t_s_firstyear.customer_id = t_c_firstyear.customer_id
>>    and t_s_firstyear.customer_id = t_w_firstyear.customer_id
>>    and t_s_firstyear.customer_id = t_w_secyear.customer_id
>>    and t_s_firstyear.sale_type = 's'
>>    and t_c_firstyear.sale_type = 'c'
>>    and t_w_firstyear.sale_type = 'w'
>>    and t_s_secyear.sale_type = 's'
>>    and t_c_secyear.sale_type = 'c'
>>    and t_w_secyear.sale_type = 'w'
>>    and t_s_firstyear.dyear =  2001
>>    and t_s_secyear.dyear = 2001+1
>>    and t_c_firstyear.dyear =  2001
>>    and t_c_secyear.dyear =  2001+1
>>    and t_w_firstyear.dyear = 2001
>>    and t_w_secyear.dyear = 2001+1
>>    and t_s_firstyear.year_total > 0
>>    and t_c_firstyear.year_total > 0
>>    and t_w_firstyear.year_total > 0
>>    and case when t_c_firstyear.year_total > 0 then
>> t_c_secyear.year_total / t_c_firstyear.year_total else null end
>>            > case when t_s_firstyear.year_total > 0 then
>> t_s_secyear.year_total / t_s_firstyear.year_total else null end
>>    and case when t_c_firstyear.year_total > 0 then
>> t_c_secyear.year_total / t_c_firstyear.year_total else null end
>>            > case when t_w_firstyear.year_total > 0 then
>> t_w_secyear.year_total / t_w_firstyear.year_total else null end
>>  order by t_s_secyear.customer_id
>>          ,t_s_secyear.customer_first_name
>>          ,t_s_secyear.customer_last_name
>>          ,t_s_secyear.customer_email_address
>> limit 100;
>>
>>
>>
>>
>>
>> The execution time is more than 50 minutes:
>>                                                    QUERY PLAN
>> ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
>>  Limit  (cost=1255378.56..1255378.57 rows=1 width=132) (actual
>> time=3024403.311..3024403.342 rows=8 loops=1)
>>    CTE year_total
>>      ->  Append  (cost=197433.23..461340.62 rows=5041142 width=216)
>> (actual time=4126.043..7897.747 rows=384208 loops=1)
>>            ->  HashAggregate  (cost=197433.23..233436.60 rows=2880270
>> width=216) (actual time=4126.042..4231.703 rows=190581 loops=1)
>>                  Group Key: customer.c_customer_id,
>> customer.c_first_name, customer.c_last_name,
>> customer.c_preferred_cust_flag, customer.c_birth_country,
>> customer.c_login, customer.c_email_address, date_dim.d_year
>>                  Batches: 1  Memory Usage: 213017kB
>>                  ->  Hash Join  (cost=8151.60..103824.45 rows=2880270
>> width=174) (actual time=69.110..1686.608 rows=2685453 loops=1)
>>                        Hash Cond: (store_sales.ss_sold_date_sk =
>> date_dim.d_date_sk)
>>                        ->  Hash Join  (cost=5103.00..93214.72
>> rows=2880270 width=174) (actual time=49.517..1162.567 rows=2750652
>> loops=1)
>>                              Hash Cond: (store_sales.ss_customer_sk =
>> customer.c_customer_sk)
>>                              ->  Seq Scan on store_sales
>>  (cost=0.00..80550.70 rows=2880270 width=30) (actual
>> time=0.018..208.022 rows=2880404 loops=1)
>>                              ->  Hash  (cost=3853.00..3853.00
>> rows=100000 width=152) (actual time=49.271..49.271 rows=100000 loops=1)
>>                                    Buckets: 131072  Batches: 1
>>  Memory Usage: 17161kB
>>                                    ->  Seq Scan on customer
>>  (cost=0.00..3853.00 rows=100000 width=152) (actual
>> time=0.011..26.448 rows=100000 loops=1)
>>                        ->  Hash  (cost=2135.49..2135.49 rows=73049
>> width=8) (actual time=19.369..19.370 rows=73049 loops=1)
>>                              Buckets: 131072  Batches: 1  Memory
>> Usage: 3878kB
>>                              ->  Seq Scan on date_dim
>>  (cost=0.00..2135.49 rows=73049 width=8) (actual time=0.037..11.763
>> rows=73049 loops=1)
>>            ->  HashAggregate  (cost=114410.03..132428.63 rows=1441488
>> width=216) (actual time=2369.202..2447.868 rows=136978 loops=1)
>>                  Group Key: customer_1.c_customer_id,
>> customer_1.c_first_name, customer_1.c_last_name,
>> customer_1.c_preferred_cust_flag, customer_1.c_birth_country,
>> customer_1.c_login, customer_1.c_email_address, date_dim_1.d_year
>>                  Batches: 1  Memory Usage: 131097kB
>>                  ->  Hash Join  (cost=8151.60..67561.67 rows=1441488
>> width=177) (actual time=62.483..974.143 rows=1430939 loops=1)
>>                        Hash Cond: (catalog_sales.cs_sold_date_sk =
>> date_dim_1.d_date_sk)
>>                        ->  Hash Join  (cost=5103.00..60728.94
>> rows=1441488 width=177) (actual time=46.571..687.972 rows=1434519
>> loops=1)
>>                              Hash Cond:
>> (catalog_sales.cs_bill_customer_sk = customer_1.c_customer_sk)
>>                              ->  Seq Scan on catalog_sales
>>  (cost=0.00..51841.88 rows=1441488 width=33) (actual
>> time=0.029..128.238 rows=1441548 loops=1)
>>                              ->  Hash  (cost=3853.00..3853.00
>> rows=100000 width=152) (actual time=46.311..46.325 rows=100000 loops=1)
>>                                    Buckets: 131072  Batches: 1
>>  Memory Usage: 17161kB
>>                                    ->  Seq Scan on customer
>> customer_1  (cost=0.00..3853.00 rows=100000 width=152) (actual
>> time=0.005..23.350 rows=100000 loops=1)
>>                        ->  Hash  (cost=2135.49..2135.49 rows=73049
>> width=8) (actual time=15.677..15.677 rows=73049 loops=1)
>>                              Buckets: 131072  Batches: 1  Memory
>> Usage: 3878kB
>>                              ->  Seq Scan on date_dim date_dim_1
>>  (cost=0.00..2135.49 rows=73049 width=8) (actual time=0.015..7.957
>> rows=73049 loops=1)
>>            ->  HashAggregate  (cost=61277.38..70269.68 rows=719384
>> width=216) (actual time=1166.953..1198.730 rows=56649 loops=1)
>>                  Group Key: customer_2.c_customer_id,
>> customer_2.c_first_name, customer_2.c_last_name,
>> customer_2.c_preferred_cust_flag, customer_2.c_birth_country,
>> customer_2.c_login, customer_2.c_email_address, date_dim_2.d_year
>>                  Batches: 1  Memory Usage: 57369kB
>>                  ->  Hash Join  (cost=8151.60..37897.40 rows=719384
>> width=177) (actual time=68.327..508.594 rows=719119 loops=1)
>>                        Hash Cond: (web_sales.ws_sold_date_sk =
>> date_dim_2.d_date_sk)
>>                        ->  Hash Join  (cost=5103.00..32960.30
>> rows=719384 width=177) (actual time=52.240..357.963 rows=719217 loops=1)
>>                              Hash Cond:
>> (web_sales.ws_bill_customer_sk = customer_2.c_customer_sk)
>>                              ->  Seq Scan on web_sales
>>  (cost=0.00..25968.84 rows=719384 width=33) (actual
>> time=0.032..62.464 rows=719384 loops=1)
>>                              ->  Hash  (cost=3853.00..3853.00
>> rows=100000 width=152) (actual time=51.959..51.960 rows=100000 loops=1)
>>                                    Buckets: 131072  Batches: 1
>>  Memory Usage: 17161kB
>>                                    ->  Seq Scan on customer
>> customer_2  (cost=0.00..3853.00 rows=100000 width=152) (actual
>> time=0.004..25.350 rows=100000 loops=1)
>>                        ->  Hash  (cost=2135.49..2135.49 rows=73049
>> width=8) (actual time=15.831..15.834 rows=73049 loops=1)
>>                              Buckets: 131072  Batches: 1  Memory
>> Usage: 3878kB
>>                              ->  Seq Scan on date_dim date_dim_2
>>  (cost=0.00..2135.49 rows=73049 width=8) (actual time=0.014..8.100
>> rows=73049 loops=1)
>>    ->  Sort  (cost=794037.94..794037.95 rows=1 width=132) (actual
>> time=3024403.310..3024403.313 rows=8 loops=1)
>>          Sort Key: t_s_secyear.customer_id,
>> t_s_secyear.customer_first_name, t_s_secyear.customer_last_name,
>> t_s_secyear.customer_email_address
>>          Sort Method: quicksort  Memory: 26kB
>>          ->  Nested Loop  (cost=0.00..794037.93 rows=1 width=132)
>> (actual time=354851.431..3024403.218 rows=8 loops=1)
>>                Join Filter: ((t_s_secyear.customer_id =
>> t_w_secyear.customer_id) AND (CASE WHEN (t_c_firstyear.year_total >
>> '0'::numeric) THEN (t_c_secyear.year_total /
>> t_c_firstyear.year_total) ELSE NULL::numeric END > CASE WHEN
>> (t_w_firstyear.year_total > '0'::numeric) THEN
>> (t_w_secyear.year_total / t_w_firstyear.year_total) ELSE
>> NULL::numeric END))
>>                Rows Removed by Join Filter: 810136
>>                ->  Nested Loop  (cost=0.00..668006.23 rows=1
>> width=308) (actual time=33554.075..3021248.646 rows=72 loops=1)
>>                      Join Filter: ((t_s_secyear.customer_id =
>> t_c_secyear.customer_id) AND (CASE WHEN (t_c_firstyear.year_total >
>> '0'::numeric) THEN (t_c_secyear.year_total /
>> t_c_firstyear.year_total) ELSE NULL::numeric END > CASE WHEN
>> (t_s_firstyear.year_total > '0'::numeric) THEN
>> (t_s_secyear.year_total / t_s_firstyear.year_total) ELSE
>> NULL::numeric END))
>>                      Rows Removed by Join Filter: 11876277
>> * ->  Nested Loop  (cost=0.00..541974.53 rows=1 width=320) (actual
>> time=14866.104..3001271.961 rows=437 loops=1)*
>> *                           Join Filter: (t_s_firstyear.customer_id =
>> t_s_secyear.customer_id)*
>> *                           Rows Removed by Join Filter: 44702488*
>> *                           ->  Nested Loop  (cost=0.00..415941.57
>> rows=2 width=156) (actual time=11739.944..2946020.749 rows=1171 loops=1)*
>> *                                 Join Filter:
>> (t_s_firstyear.customer_id = t_w_firstyear.customer_id)*
>> *                                 Rows Removed by Join Filter: 112695277*
>> *                                 ->  Nested Loop
>>  (cost=0.00..277302.08 rows=9 width=104) (actual
>> time=8139.729..2351733.795 rows=9952 loops=1)*
>> *                                       Join Filter:
>> (t_s_firstyear.customer_id = t_c_firstyear.customer_id)*
>> *                                       Rows Removed by Join Filter:
>> 997895870*
>> *                                       ->  CTE Scan on year_total
>> t_s_firstyear  (cost=0.00..138631.41 rows=42 width=52) (actual
>> time=4126.046..4234.598 rows=37923 loops=1)*
>> *                                             Filter: ((year_total >
>> '0'::numeric) AND (sale_type = 's'::text) AND (dyear = 2001))*
>> *                                             Rows Removed by Filter:
>> 346285*
>> *                                       ->  CTE Scan on year_total
>> t_c_firstyear  (cost=0.00..138631.41 rows=42 width=52) (actual
>> time=28.926..60.356 rows=26314 loops=37923)*
>> *                                             Filter: ((year_total >
>> '0'::numeric) AND (sale_type = 'c'::text) AND (dyear = 2001))*
>> *                                             Rows Removed by Filter:
>> 357894*
>> *                                 ->  CTE Scan on year_total
>> t_w_firstyear  (cost=0.00..138631.41 rows=42 width=52) (actual
>> time=49.572..59.057 rows=11324 loops=9952)*
>> *                                       Filter: ((year_total >
>> '0'::numeric) AND (sale_type = 'w'::text) AND (dyear = 2001))*
>> *                                       Rows Removed by Filter: 372884*
>> *                           ->  CTE Scan on year_total t_s_secyear
>>  (cost=0.00..126028.55 rows=126 width=164) (actual time=0.002..44.949
>> rows=38175 loops=1171)*
>> *                                 Filter: ((sale_type = 's'::text)
>> AND (dyear = 2002))*
>> *                                 Rows Removed by Filter: 346033*
>> *                     ->  CTE Scan on year_total t_c_secyear
>>  (cost=0.00..126028.55 rows=126 width=52) (actual time=21.023..44.097
>> rows=27177 loops=437)*
>> *                           Filter: ((sale_type = 'c'::text) AND
>> (dyear = 2002))*
>> *                           Rows Removed by Filter: 357031*
>> *               ->  CTE Scan on year_total t_w_secyear
>>  (cost=0.00..126028.55 rows=126 width=52) (actual time=36.137..43.090
>> rows=11252 loops=72)*
>> *                     Filter: ((sale_type = 'w'::text) AND (dyear =
>> 2002))*
>> *                     Rows Removed by Filter: 372956*
>>  Planning Time: 4.529 ms
>>  Execution Time: 3024486.695 ms
>> (83 rows)
>>
>>
>>
> I have noticed significant underestimation here. In addition, these
> nodes are executed noticeably slower in the plan, you can see it by
> the current time in the explain.
Sorry, I meant actual time, not current time.

--
Regards,
Alena Rybakina
Postgres Professional

In response to

Browse pgsql-performance by date

  From Date Subject
Next Message Pavel Stehule 2024-11-10 22:41:31 Re: proposal: schema variables
Previous Message Alena Rybakina 2024-11-10 20:16:48 Re: Performance of Query 4 on TPC-DS Benchmark