Sunday, February 18, 2018

Complex data process in SQL vs Pig


[donghua@cdh-vm temp]$ hcat -e "desc employees.departments"
dept_no             string                                  
dept_name           string                                  

[donghua@cdh-vm temp]$ hcat -e "desc employees.dept_manager"
emp_no               int                                    
dept_no             string                                  
from_date           string                                  
to_date             string                                  


[donghua@cdh-vm temp]$ hcat -e "desc employees.dept_emp"
emp_no               int                                    
dept_no             string                                  
from_date           string                                  
to_date             string                                  

[donghua@cdh-vm temp]$ hcat -e "desc employees.employees"
emp_no               int                                    
birth_date           string                                  
first_name           string                                  
last_name           string                                  
gender               string                                  
hire_date           string                                  


-- Find out their manager name & department size
select d.dept_name, concat(m.first_name,' ',m.last_name) as manager, count(e.emp_no) employees
from employees.departments d 
join employees.dept_manager dm on d.dept_no = dm.dept_no
join employees.employees m on dm.emp_no = m.emp_no
join employees.dept_emp de on d.dept_no = de.dept_no
join employees.employees e on de.emp_no = e.emp_no
where de.to_date >'2018-01-01'
and dm.to_date > '2018-01-01'
group by d.dept_name, concat(m.first_name,' ',m.last_name)
order by d.dept_name;

Run SQL in Hive:

Connecting to jdbc:hive2://cdh-vm.dbaglobe.com:10000/employees
Connected to: Apache Hive (version 1.1.0-cdh5.14.0)
Driver: Hive JDBC (version 1.1.0-cdh5.14.0)
Transaction isolation: TRANSACTION_REPEATABLE_READ
Beeline version 1.1.0-cdh5.14.0 by Apache Hive
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/emp> -- Find out their manager name & department size
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/emp> select d.dept_name, concat(m.first_name,' ',m.last_name) as manager, count(e.emp_no) employees
. . . . . . . . . . . . . . . . . . . . . . .> from employees.departments d 
. . . . . . . . . . . . . . . . . . . . . . .> join employees.dept_manager dm on d.dept_no = dm.dept_no
. . . . . . . . . . . . . . . . . . . . . . .> join employees.employees m on dm.emp_no = m.emp_no
. . . . . . . . . . . . . . . . . . . . . . .> join employees.dept_emp de on d.dept_no = de.dept_no
. . . . . . . . . . . . . . . . . . . . . . .> join employees.employees e on de.emp_no = e.emp_no
. . . . . . . . . . . . . . . . . . . . . . .> where de.to_date >'2018-01-01'
. . . . . . . . . . . . . . . . . . . . . . .> and dm.to_date > '2018-01-01'
. . . . . . . . . . . . . . . . . . . . . . .> group by d.dept_name, concat(m.first_name,' ',m.last_name)
. . . . . . . . . . . . . . . . . . . . . . .> order by d.dept_name;

+---------------------+--------------------+------------+--+
|     d.dept_name     |      manager       | employees  |
+---------------------+--------------------+------------+--+
| Customer Service    | Yuchang Weedman    | 17569      |
| Development         | Leon DasSarma      | 61386      |
| Finance             | Isamu Legleitner   | 12437      |
| Human Resources     | Karsten Sigstam    | 12898      |
| Marketing           | Vishwani Minakawa  | 14842      |
| Production          | Oscar Ghazalie     | 53304      |
| Quality Management  | Dung Pesch         | 14546      |
| Research            | Hilary Kambil      | 15441      |
| Sales               | Hauke Zhang        | 37701      |
+---------------------+--------------------+------------+--+
9 rows selected (100.528 seconds)

Run SQL in Impala:

Connected to cdh-vm.dbaglobe.com:21000
Server version: impalad version 2.11.0-cdh5.14.0 RELEASE (build d68206561bce6b26762d62c01a78e6cd27aa7690)
***********************************************************************************
Welcome to the Impala shell.
(Impala Shell v2.11.0-cdh5.14.0 (d682065) built on Sat Jan  6 13:27:16 PST 2018)

Press TAB twice to see a list of available commands.
***********************************************************************************
[cdh-vm.dbaglobe.com:21000] > -- Find out their manager name & department size
                            > select d.dept_name, concat(m.first_name,' ',m.last_name) as manager, count(e.emp_no) employees
                            > from employees.departments d 
                            > join employees.dept_manager dm on d.dept_no = dm.dept_no
                            > join employees.employees m on dm.emp_no = m.emp_no
                            > join employees.dept_emp de on d.dept_no = de.dept_no
                            > join employees.employees e on de.emp_no = e.emp_no
                            > where de.to_date >'2018-01-01'
                            > and dm.to_date > '2018-01-01'
                            > group by d.dept_name, concat(m.first_name,' ',m.last_name)
                            > order by d.dept_name;
Query: -- Find out their manager name & department size
select d.dept_name, concat(m.first_name,' ',m.last_name) as manager, count(e.emp_no) employees
from employees.departments d
join employees.dept_manager dm on d.dept_no = dm.dept_no
join employees.employees m on dm.emp_no = m.emp_no
join employees.dept_emp de on d.dept_no = de.dept_no
join employees.employees e on de.emp_no = e.emp_no
where de.to_date >'2018-01-01'
and dm.to_date > '2018-01-01'
group by d.dept_name, concat(m.first_name,' ',m.last_name)
order by d.dept_name
Query submitted at: 2018-02-18 20:58:51 (Coordinator: http://cdh-vm.dbaglobe.com:25000)
Query progress can be monitored at: http://cdh-vm.dbaglobe.com:25000/query_plan?query_id=a04e8317637c0e4a:a83017f00000000
+--------------------+-------------------+-----------+
| dept_name          | manager           | employees |
+--------------------+-------------------+-----------+
| Customer Service   | Yuchang Weedman   | 17569     |
| Development        | Leon DasSarma     | 61386     |
| Finance            | Isamu Legleitner  | 12437     |
| Human Resources    | Karsten Sigstam   | 12898     |
| Marketing          | Vishwani Minakawa | 14842     |
| Production         | Oscar Ghazalie    | 53304     |
| Quality Management | Dung Pesch        | 14546     |
| Research           | Hilary Kambil     | 15441     |
| Sales              | Hauke Zhang       | 37701     |
+--------------------+-------------------+-----------+
Fetched 9 row(s) in 19.43s

Run SQL in Mysql:
  
Server version: 5.5.56-MariaDB MariaDB Server

Copyright (c) 2000, 2017, Oracle, MariaDB Corporation Ab and others.

Type 'help;' or '\h' for help. Type '\c' to clear the current input statement.

MariaDB [employees]> -- Find out their manager name & department size
MariaDB [employees]> select d.dept_name, concat(m.first_name,' ',m.last_name) as manager, count(e.emp_no) employees
    -> from employees.departments d 
    -> join employees.dept_manager dm on d.dept_no = dm.dept_no
    -> join employees.employees m on dm.emp_no = m.emp_no
    -> join employees.dept_emp de on d.dept_no = de.dept_no
    -> join employees.employees e on de.emp_no = e.emp_no
    -> where de.to_date >'2018-01-01'
    -> and dm.to_date > '2018-01-01'
    -> group by d.dept_name, concat(m.first_name,' ',m.last_name)
    -> order by d.dept_name;
+--------------------+-------------------+-----------+
| dept_name          | manager           | employees |
+--------------------+-------------------+-----------+
| Customer Service   | Yuchang Weedman   |     17569 |
| Development        | Leon DasSarma     |     61386 |
| Finance            | Isamu Legleitner  |     12437 |
| Human Resources    | Karsten Sigstam   |     12898 |
| Marketing          | Vishwani Minakawa |     14842 |
| Production         | Oscar Ghazalie    |     53304 |
| Quality Management | Dung Pesch        |     14546 |
| Research           | Hilary Kambil     |     15441 |
| Sales              | Hauke Zhang       |     37701 |
+--------------------+-------------------+-----------+
9 rows in set (1.62 sec)

Re-write SQL in Pig-Latin:

-- pig script
-- Find out department size and their manager name
d0 = LOAD 'employees.departments' USING org.apache.hive.hcatalog.pig.HCatLoader();
dm0 = LOAD 'employees.dept_manager' USING org.apache.hive.hcatalog.pig.HCatLoader();
de0 = LOAD 'employees.dept_emp' USING org.apache.hive.hcatalog.pig.HCatLoader();
e0 = LOAD 'employees.employees' USING org.apache.hive.hcatalog.pig.HCatLoader();

d1 = FOREACH d0 GENERATE dept_no, dept_name;
dm1 = FOREACH (FILTER dm0 BY to_date >'2018-01-01') GENERATE dept_no, emp_no;
de1 = FOREACH (FILTER de0 by to_date >'2018-01-01') GENERATE dept_no, emp_no;
e1 = FOREACH e0 GENERATE emp_no, CONCAT(first_name,' ',last_name) AS fullname;

d1_dm1 = JOIN d1 BY dept_no, dm1 BY dept_no;
d1_dm1_e1 = JOIN d1_dm1 BY emp_no, e1 BY emp_no;

dept_mgr = FOREACH d1_dm1_e1 GENERATE d1_dm1::d1::dept_no AS dept_no, d1_dm1::d1::dept_name AS dept_name, e1::fullname AS manager;

d1_de1 = JOIN d1 BY dept_no, de1 BY dept_no;
d1_de1_e1 = FOREACH (JOIN d1_de1 BY emp_no, e1 BY emp_no) GENERATE d1_de1::d1::dept_no,e1::emp_no;
dept_emp_count = FOREACH (GROUP d1_de1_e1 BY dept_no) GENERATE group AS dept_no, COUNT(d1_de1_e1) AS employees;

dept_info_0 = JOIN dept_mgr BY dept_no, dept_emp_count BY dept_no;
dept_info_1 = FOREACH dept_info_0 GENERATE dept_mgr::dept_name AS dept_name, dept_mgr::manager AS manager, dept_emp_count::employees AS employees;
dept_info_2 = ORDER dept_info_1 BY dept_name;

DUMP dept_info_2;


[donghua@cdh-vm temp]$ date;pig -4 log4j.properties emp.pig;date;

Sun Feb 18 22:09:44 +08 2018

(Customer Service,Yuchang Weedman,17569)
(Development,Leon DasSarma,61386)
(Finance,Isamu Legleitner,12437)
(Human Resources,Karsten Sigstam,12898)
(Marketing,Vishwani Minakawa,14842)
(Production,Oscar Ghazalie,53304)
(Quality Management,Dung Pesch,14546)
(Research,Hilary Kambil,15441)
(Sales,Hauke Zhang,37701)

Sun Feb 18 22:16:08 +08 2018