문제1 :: Bigdata

문제1
VERTICA/06. Tuning 2017. 2. 13. 11:35

-- 조인결과 row수 대략 7~8천억건이 적당 (약5~7 년전 pc 급 3대 클러스터 기준)
SELECT count(*)
FROM online_sales.ytkim_target AS TARGET INNER JOIN online_sales.ytkim_target AS adjoin
ON (adjoin.product_key = TARGET.product_key AND adjoin.customer_key <> TARGET.customer_key)

-- 아래의 SQL을 개선하는 방법을 찾는 문제 입니다.
(그룹함수의 결과는 큰의미가 없습니다.
adjoin.customer_key <> TARGET.customer_key 조건절 때문에 발생하는 비효율을 없에는 것이 포인트 입니다.)

SELECT TARGET.customer_key AS target_customer_key
, adjoin.customer_key AS adjoin_customer_key
, count(*)
, sum(TARGET.sales_dollar_amount)
, sum(sqrt(TARGET.gross_profit_dollar_amount))
FROM online_sales.ytkim_target AS TARGET INNER JOIN online_sales.ytkim_target AS adjoin
ON (adjoin.product_key = TARGET.product_key AND adjoin.customer_key <> TARGET.customer_key)
GROUP BY TARGET.customer_key, adjoin.customer_key

--기본제공하는 vertica vmart 에서 데이터 rows만 변경하여 samfile 추출

판매이력 테이블을 쓸예정인데
기본데이터는 너무 적으므로 초기데이터 4000만건으로 시작

실행방법은 바로 아래에...

cd /opt/vertica/examples/VMart_Schema

./vmart_gen --customer_dimension 20000000 --product_dimension 300000 --online_sales_fact 40000000 --store_dimension 20000 --store_sales_fact 40000000 --years 2015-2016

[ --datadirectory <dir-data-files> ]
[ --files <fact-table-load-files> ]
[ --null <character-to-indicate-null-value> ]
[ --seed <for-random-number-generator> ]
[ --time_file <data-for-Date_Dimension> ]
[ --store_sales_fact <rows> ]
[ --store_orders_fact <rows> ]
[ --product_dimension <rows> ]
[ --store_dimension <rows> ]
[ --promotion_dimension <rows> ]
[ --vendor_dimension <rows> ]
[ --customer_dimension <rows> ]
[ --employee_dimension <rows> ]
[ --warehouse_dimension <rows> ]
[ --shipping_dimension <rows> ]
[ --online_page_dimension <rows> ]
[ --callcenter_dimension <rows> ]
[ --online_sales_fact <rows> ]
[ --inventory_fact <rows> ]
[ --gen_load_script ]
[ --years startYear-endYear ]

-- 대상 테이블 생성

vsql 상에서 아래구문수행

\i vmart_schema_drop.sql --스키마삭제
\i vmart_define_schema.sql --스키마생성
\i vmart_load_data.sql    --데이터적재

-- 원본은 훼손하지말고 online_sales.online_sales_fact 복제하여 test합시다.(재작업시 원본이 있으니 다시 할 수 있음)

drop table online_sales.ytkim_target cascade;
create table online_sales.ytkim_target as select * from online_sales.online_sales_fact ;

-- 랜덤하게 만들어진 데이터이므로 그중 자주사는사람과 자주팔리는 상품 데이터를 추가생성
-- 자주사는사람 약 100000명 정도, 많이 팔리는 상품 약 100개 정도를 의도함
create TEMPORARY table yt1 on COMMIT PRESERVE rows AS
select * from online_sales.ytkim_target limit 1

create TEMPORARY table yt2 on commit PRESERVE rows as
select *
from
(
select * from (
select distinct customer_key from online_sales.ytkim_target a
where a.customer_key < 100000
) customer cross join
(
select distinct product_key from online_sales.ytkim_target a
where a.product_key < 100
) product
) a

insert /*+ direct */ into online_sales.ytkim_target
select
sale_date_key,
ship_date_key,
yt2.product_key,
product_version,
yt2.customer_key,
call_center_key,
online_page_key,
shipping_key,
warehouse_key,
promotion_key,
pos_transaction_number,
sales_quantity,
sales_dollar_amount,
ship_dollar_amount,
net_dollar_amount,
cost_dollar_amount,
gross_profit_dollar_amount,
transaction_type
from yt2 cross join yt1

commit;

-- 조인결과 row수 대략 7~8천억건이 적당 (약5~7 년전 pc 3대 클러스터 기준)
SELECT count(*)
FROM online_sales.ytkim_target AS TARGET INNER JOIN online_sales.ytkim_target AS adjoin
ON (adjoin.product_key = TARGET.product_key AND adjoin.customer_key <> TARGET.customer_key)

-- 자 데이터는 준비가 되었으니 해당sql에 대한 성능개선을 해주세요.(모든방법을 동원하여~~)

SELECT TARGET.customer_key AS target_customer_key
, adjoin.customer_key AS adjoin_customer_key
, count(*)
, sum(TARGET.sales_dollar_amount)
, sum(sqrt(TARGET.gross_profit_dollar_amount))
FROM online_sales.ytkim_target AS TARGET INNER JOIN online_sales.ytkim_target AS adjoin
ON (adjoin.product_key = TARGET.product_key AND adjoin.customer_key <> TARGET.customer_key)
GROUP BY TARGET.customer_key, adjoin.customer_key

8.0.1에서 TEST 하였습니다

'VERTICA > 06. Tuning' 카테고리의 다른 글

버티카 성능 진단  (0) 2016.12.14

Explain 과 Profile  (0) 2015.08.06
관련글 관련글 더보기
- 버티카 성능 진단2016.12.14
- Explain 과 Profile2015.08.06
댓글

인기포스트 MORE POST

ABOUT ME

Bigdata Bigdata

'VERTICA > 06. Tuning' 카테고리의 다른 글

티스토리툴바

버티카 성능 진단 (0)	2016.12.14
Explain 과 Profile (0)	2015.08.06

인기포스트 MORE POST

ABOUT ME

'VERTICA > 06. Tuning' 카테고리의 다른 글

관련글 관련글 더보기

티스토리툴바