Tuesday, December 26, 2017

Hive Bucketing with examples

Think it as HASH based indexes in RDBMS, more suitable for high cardinanity data columns (e.g.: customer_id, product_id, station_id, etc)

Basic Bucket example:


0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> create table monthly_taxi_fleet6
. . . . . . . . . . . . . . . . . . . . . . .> (month char(7),fleet smallint,company varchar(50))
. . . . . . . . . . . . . . . . . . . . . . .> clustered by (company) into 3 buckets
. . . . . . . . . . . . . . . . . . . . . . .> stored as avro;

Example using Apache Hive version 1.1.0-cdh5.13.1, hive.enforce.bucketing=false by default
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> insert into monthly_taxi_fleet6
. . . . . . . . . . . . . . . . . . . . . . .> select month,fleet,company from monthly_taxi_fleet;

[donghua@cdh-vm ~]$ hdfs dfs -ls -R /user/hive/warehouse/monthly_taxi_fleet6
-rwxrwxrwt   1 donghua hive      25483 2017-12-26 10:40 /user/hive/warehouse/monthly_taxi_fleet6/000000_0

-- hive.enforce.bucketing: Whether bucketing is enforced. If true, while inserting into the table, bucketing is enforced.
-- Default Value: Hive 0.x: false, Hive 1.x: false, Hive 2.x: removed, which effectively makes it always true (HIVE-12331)

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> set hive.enforce.bucketing=true;

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> insert into monthly_taxi_fleet6
. . . . . . . . . . . . . . . . . . . . . . .> select month,fleet,company from monthly_taxi_fleet;

[donghua@cdh-vm ~]$ hdfs dfs -ls -R /user/hive/warehouse/monthly_taxi_fleet6
-rwxrwxrwt   1 donghua hive      13611 2017-12-26 10:43 /user/hive/warehouse/monthly_taxi_fleet6/000000_0
-rwxrwxrwt   1 donghua hive       6077 2017-12-26 10:43 /user/hive/warehouse/monthly_taxi_fleet6/000001_0
-rwxrwxrwt   1 donghua hive       6589 2017-12-26 10:43 /user/hive/warehouse/monthly_taxi_fleet6/000002_0

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> describe extended monthly_taxi_fleet6;
+-----------------------------+----------------------------------------------------+----------+--+
|          col_name           |                     data_type                      | comment  |
+-----------------------------+----------------------------------------------------+----------+--+
| month                       | char(7)                                            |          |
| fleet                       | int                                                |          |
| company                     | varchar(50)                                        |          |
|                             | NULL                                               | NULL     |
| Detailed Table Information  | Table(tableName:monthly_taxi_fleet6, dbName:default, owner:donghua, createTime:1514256031, lastAccessTime:0, retention:0, sd:StorageDescriptor(cols:[FieldSchema(name:month, type:char(7), comment:null), FieldSchema(name:fleet, type:smallint, comment:null), FieldSchema(name:company, type:varchar(50), comment:null)], location:hdfs://cdh-vm.dbaglobe.com:8020/user/hive/warehouse/monthly_taxi_fleet6, inputFormat:org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat, outputFormat:org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat, compressed:false, numBuckets:3, serdeInfo:SerDeInfo(name:null, serializationLib:org.apache.hadoop.hive.serde2.avro.AvroSerDe, parameters:{serialization.format=1}), bucketCols:[company], sortCols:[], parameters:{}, skewedInfo:SkewedInfo(skewedColNames:[], skewedColValues:[], skewedColValueLocationMaps:{}), storedAsSubDirectories:false), partitionKeys:[], parameters:{totalSize=26277, numRows=1128, rawDataSize=0, COLUMN_STATS_ACCURATE=true, numFiles=3, transient_lastDdlTime=1514256192}, viewOriginalText:null, viewExpandedText:null, tableType:MANAGED_TABLE) |          |
+-----------------------------+----------------------------------------------------+----------+--+
5 rows selected (0.075 seconds)

Advanced Bucket example: Partition + Bucketing + Sorted by


0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> create table monthly_taxi_fleet7
. . . . . . . . . . . . . . . . . . . . . . .> (month char(7),fleet smallint)
. . . . . . . . . . . . . . . . . . . . . . .> partitioned by (company varchar(50))
. . . . . . . . . . . . . . . . . . . . . . .> clustered by (month) sorted by (month)into 3 buckets
. . . . . . . . . . . . . . . . . . . . . . .> stored as avro;

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> insert into monthly_taxi_fleet7
. . . . . . . . . . . . . . . . . . . . . . .> partition (company)
. . . . . . . . . . . . . . . . . . . . . . .> select month,fleet,company from monthly_taxi_fleet;

[donghua@cdh-vm ~]$ hdfs dfs -ls -R /user/hive/warehouse/monthly_taxi_fleet7
drwxrwxrwt   - donghua hive          0 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=CityCab
-rwxrwxrwt   1 donghua hive        865 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=CityCab/000000_0
-rwxrwxrwt   1 donghua hive        865 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=CityCab/000001_0
-rwxrwxrwt   1 donghua hive        865 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=CityCab/000002_0

drwxrwxrwt   - donghua hive          0 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=Comfort
-rwxrwxrwt   1 donghua hive        913 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=Comfort/000000_0
-rwxrwxrwt   1 donghua hive        913 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=Comfort/000001_0
-rwxrwxrwt   1 donghua hive        913 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=Comfort/000002_0
drwxrwxrwt   - donghua hive          0 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=Individual Yellow- Top
-rwxrwxrwt   1 donghua hive        865 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=Individual Yellow- Top/000000_0
-rwxrwxrwt   1 donghua hive        865 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=Individual Yellow- Top/000001_0
-rwxrwxrwt   1 donghua hive        865 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=Individual Yellow- Top/000002_0
drwxrwxrwt   - donghua hive          0 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=Premier
-rwxrwxrwt   1 donghua hive        865 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=Premier/000000_0
-rwxrwxrwt   1 donghua hive        865 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=Premier/000001_0
-rwxrwxrwt   1 donghua hive        865 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=Premier/000002_0
drwxrwxrwt   - donghua hive          0 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=Prime
-rwxrwxrwt   1 donghua hive        765 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=Prime/000000_0
-rwxrwxrwt   1 donghua hive        765 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=Prime/000001_0
-rwxrwxrwt   1 donghua hive        766 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=Prime/000002_0
drwxrwxrwt   - donghua hive          0 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=SMRT
-rwxrwxrwt   1 donghua hive        865 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=SMRT/000000_0
-rwxrwxrwt   1 donghua hive        865 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=SMRT/000001_0
-rwxrwxrwt   1 donghua hive        865 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=SMRT/000002_0
drwxrwxrwt   - donghua hive          0 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=Smart
-rwxrwxrwt   1 donghua hive        720 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=Smart/000000_0
-rwxrwxrwt   1 donghua hive        719 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=Smart/000001_0
-rwxrwxrwt   1 donghua hive        719 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=Smart/000002_0
drwxrwxrwt   - donghua hive          0 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=TransCab
-rwxrwxrwt   1 donghua hive        865 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=TransCab/000000_0
-rwxrwxrwt   1 donghua hive        865 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=TransCab/000001_0
-rwxrwxrwt   1 donghua hive        865 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=TransCab/000002_0
drwxrwxrwt   - donghua hive          0 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=YTC
-rwxrwxrwt   1 donghua hive        432 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=YTC/000000_0
-rwxrwxrwt   1 donghua hive        432 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=YTC/000001_0
-rwxrwxrwt   1 donghua hive        432 2017-12-26 11:05 /user/hive/warehouse/monthly_taxi_fleet7/company=YTC/000002_0