Saturday, February 17, 2018

Learning Apache Pig, Chapter 2 (O'Reilly)

Sample data files:
https://resources.oreilly.com/examples/0636920047704/blob/master/Learning%20Apache%20Pig%20-%20Working%20Files/Chapter%202/cities_small.txt
https://resources.oreilly.com/examples/0636920047704/blob/master/Learning%20Apache%20Pig%20-%20Working%20Files/Chapter%202/states.txt



[donghua@cdh-vm temp]$ pig -4 log4j.properties 
grunt> cities = load 'cities_small.txt' as (name:chararray,state:chararray,pop:int);
grunt> aliases;
grunt> describe cities
cities: {name: chararray,state: chararray,pop: int}
grunt> \de cities
cities: {name: chararray,state: chararray,pop: int}
grunt> ca_cities = filter cities by (state=='CA');
grunt> dump ca_cities;
grunt> \d ca_cities
grunt> illustrate;
(South Gate,CA,96640)
--------------------------------------------------------------------
| cities     | name:chararray    | state:chararray    | pop:int    | 
--------------------------------------------------------------------
|            | South Gate        | CA                 | 96640      | 
--------------------------------------------------------------------

grunt> illustrate;
(Fresno,CA,476050)
--------------------------------------------------------------------
| cities     | name:chararray    | state:chararray    | pop:int    | 
--------------------------------------------------------------------
|            | Fresno            | CA                 | 476050     | 
--------------------------------------------------------------------

grunt> ordered_cities = order cities by pop desc;

grunt> states = load 'states.txt' as (rank:int,code:chararray,fullname:chararray,date_entered:chararray,year_entered:int);

grunt> cities_join_states = join cities by state, states by code;

grunt> illustrate cities_join_states;
(Fargo,ND,93531)
(39,ND,North Dakota,02-NOV,1889)
--------------------------------------------------------------------
| cities     | name:chararray    | state:chararray    | pop:int    | 
--------------------------------------------------------------------
|            | Fargo             | ND                 | 93531      | 
|            | Fargo             | ND                 | 93531      | 
--------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------
| states     | rank:int    | code:chararray    | fullname:chararray    | date_entered:chararray    | year_entered:int    | 
--------------------------------------------------------------------------------------------------------------------------
|            | 39          | ND                | North Dakota          | 02-NOV                    | 1889                | 
|            | 39          | ND                | North Dakota          | 02-NOV                    | 1889                | 
--------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
| cities_join_states     | cities::name:chararray    | cities::state:chararray    | cities::pop:int    | states::rank:int    | states::code:chararray    | states::fullname:chararray    | states::date_entered:chararray    | states::year_entered:int    | 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|                        | Fargo                     | ND                         | 93531              | 39                  | ND                        | North Dakota                  | 02-NOV                            | 1889                        | 
|                        | Fargo                     | ND                         | 93531              | 39                  | ND                        | North Dakota                  | 02-NOV                            | 1889                        | 
|                        | Fargo                     | ND                         | 93531              | 39                  | ND                        | North Dakota                  | 02-NOV                            | 1889                        | 
|                        | Fargo                     | ND                         | 93531              | 39                  | ND                        | North Dakota                  | 02-NOV                            | 1889                        | 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

grunt> cities_join_states_short = foreach cities_join_states generate cities::name, states::fullname;

grunt> store cities_join_states_short into 'cities_join_states_short';
grunt> fs -ls cities_join_states_short
grunt> fs -cat cities_join_states_short/part-r-00000



grunt> cities_join_states_short = foreach (join cities by state, states by code) generate cities::name, states::fullname;
grunt> city_and_state = foreach cities generate name,state,pop*1.5;

grunt> cities_by_state = group cities by state;
grunt> \de cities_by_state;
cities_by_state: {group: chararray,cities: {(name: chararray,state: chararray,pop: int)}}
grunt> illustrate cities_by_state;
(Sioux Falls,SD,154997)
-----------------------------------------------------------------------
| cities     | name:chararray     | state:chararray     | pop:int     | 
-----------------------------------------------------------------------
|            | Sioux Falls        | SD                  | 154997      | 
|            | Rapid City         | SD                  | 65491       | 
-----------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------------
| cities_by_state     | group:chararray     | cities:bag{:tuple(name:chararray,state:chararray,pop:int)}                     | 
------------------------------------------------------------------------------------------------------------------------------
|                     | SD                  | {(Sioux Falls, SD, 154997), (Rapid City, SD, 65491)}                           | 
------------------------------------------------------------------------------------------------------------------------------

grunt> total_cities = foreach (group cities all) generate COUNT(cities);
grunt> \d total_cities;
(500)

grunt> cities_by_state = foreach (group cities by state) generate group, COUNT(cities);
grunt> \d cities_by_state;

grunt> cities_by_state = foreach (group cities by state parallel 3) generate group, COUNT(cities);
grunt> store cities_by_state into 'cities_by_state';
grunt> fs -ls cities_by_state
Found 4 items
-rw-r--r--   1 donghua supergroup          0 2018-02-17 22:25 cities_by_state/_SUCCESS
-rw-r--r--   1 donghua supergroup        113 2018-02-17 22:25 cities_by_state/part-r-00000
-rw-r--r--   1 donghua supergroup         82 2018-02-17 22:25 cities_by_state/part-r-00001
-rw-r--r--   1 donghua supergroup         86 2018-02-17 22:25 cities_by_state/part-r-00002