Wednesday, March 7, 2018

Construct SCD Type 2 in Hive examples

create table products(prod_id int, name string, price double, last_modified timestamp) stored as parquet;
create table orders(order_id int, prod_id int, quantity int, order_date timestamp) stored as parquet;

insert into products values(1,'Soap',1.0,'2017-01-01');
insert into products values(1,'Soap',1.1,'2017-04-01');
insert into products values(1,'Soap',1.2,'2017-07-01');
insert into products values(1,'Soap',1.3,'2017-10-01');
insert into products values(2,'Soda',0.6,'2017-01-01');
insert into products values(2,'Soda',0.8,'2017-06-01');
insert into products values(3,'Beer',4.2,'2017-07-01');

insert into orders values(1,1,1,'2017-02-10');
insert into orders values(2,2,10,'2017-07-10');
insert into orders values(3,3,5,'2017-09-10');
insert into orders values(4,1,2,'2017-12-01');

create view prod_scd2 as 
select prod_id,price,last_modified as valid_from,
lead(last_modified) over (partition by prod_id order by last_modified) as valid_end
from products;

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000> select * from prod_scd2 p;
+------------+----------+------------------------+------------------------+--+
| p.prod_id  | p.price  |      p.valid_from      |      p.valid_end       |
+------------+----------+------------------------+------------------------+--+
| 1          | 1.0      | 2017-01-01 00:00:00.0  | 2017-04-01 00:00:00.0  |
| 1          | 1.1      | 2017-04-01 00:00:00.0  | 2017-07-01 00:00:00.0  |
| 1          | 1.2      | 2017-07-01 00:00:00.0  | 2017-10-01 00:00:00.0  |
| 1          | 1.3      | 2017-10-01 00:00:00.0  | NULL                   |
| 2          | 0.6      | 2017-01-01 00:00:00.0  | 2017-06-01 00:00:00.0  |
| 2          | 0.8      | 2017-06-01 00:00:00.0  | NULL                   |
| 3          | 4.2      | 2017-07-01 00:00:00.0  | NULL                   |
+------------+----------+------------------------+------------------------+--+
7 rows selected (20.253 seconds)
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000> select * from orders o;
+-------------+------------+-------------+------------------------+--+
| o.order_id  | o.prod_id  | o.quantity  |      o.order_date      |
+-------------+------------+-------------+------------------------+--+
| 1           | 1          | 1           | 2017-02-10 00:00:00.0  |
| 2           | 2          | 10          | 2017-07-10 00:00:00.0  |
| 3           | 3          | 5           | 2017-09-10 00:00:00.0  |
| 4           | 1          | 2           | 2017-12-01 00:00:00.0  |
+-------------+------------+-------------+------------------------+--+
4 rows selected (0.152 seconds)

select o.order_id,o.order_date,p.price,o.quantity*p.price as total_cost
from orders o join prod_scd2 p
on (o.prod_id=p.prod_id) 
where o.order_date between p.valid_from and if(p.valid_end is not null ,p.valid_end,cast('9999-12-31' as timestamp))
order by order_id;

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000> select o.order_id,o.order_date,p.price,o.quantity*p.price as total_cost
. . . . . . . . . . . . . . . . . . . . .> from orders o join prod_scd2 p
. . . . . . . . . . . . . . . . . . . . .> on (o.prod_id=p.prod_id) 
. . . . . . . . . . . . . . . . . . . . .> where o.order_date between p.valid_from and if(p.valid_end is not null ,p.valid_end,cast('9999-12-31' as timestamp))
. . . . . . . . . . . . . . . . . . . . .> order by order_id;

+-------------+------------------------+----------+-------------+--+
| o.order_id  |      o.order_date      | p.price  | total_cost  |
+-------------+------------------------+----------+-------------+--+
| 1           | 2017-02-10 00:00:00.0  | 1.0      | 1.0         |
| 2           | 2017-07-10 00:00:00.0  | 0.8      | 8.0         |
| 3           | 2017-09-10 00:00:00.0  | 4.2      | 21.0        |
| 4           | 2017-12-01 00:00:00.0  | 1.3      | 2.6         |
+-------------+------------------------+----------+-------------+--+
4 rows selected (48.862 seconds)