Monday, December 25, 2017

Hive Bitmap Indexes with example

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> show create table monthly_taxi_fleet5;
createtab_stmt
CREATE TABLE `monthly_taxi_fleet5`(
  `month` char(7),
  `company` varchar(50),
  `fleet` smallint)
ROW FORMAT SERDE
  'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT
  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT
  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION
  'hdfs://cdh-vm.dbaglobe.com:8020/user/hive/warehouse/monthly_taxi_fleet5'
TBLPROPERTIES (
  'COLUMN_STATS_ACCURATE'='false',
  'last_modified_by'='donghua',
  'last_modified_time'='1514073485',
  'numFiles'='1',
  'numRows'='-1',
  'rawDataSize'='-1',
  'totalSize'='628',
  'transient_lastDdlTime'='1514073485')
21 rows selected (0.069 seconds)


0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> !outputformat csv2
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> explain select * from monthly_taxi_fleet5 where company='Comfort';

Explain
STAGE DEPENDENCIES:
  Stage-1 is a root stage
  Stage-0 depends on stages: Stage-1

STAGE PLANS:
  Stage: Stage-1
    Map Reduce
      Map Operator Tree:
           TableScan
            alias: monthly_taxi_fleet5
             filterExpr: (company = 'Comfort') (type: boolean)
             Statistics: Num rows: 10 Data size: 628 Basic stats: COMPLETE Column stats: NONE
            Filter Operator
              predicate: (company = 'Comfort') (type: boolean)
              Statistics: Num rows: 5 Data size: 314 Basic stats: COMPLETE Column stats: NONE
              Select Operator
                expressions: month (type: char(7)), company (type: varchar(50)), fleet (type: smallint)
                outputColumnNames: _col0, _col1, _col2
                Statistics: Num rows: 5 Data size: 314 Basic stats: COMPLETE Column stats: NONE
                File Output Operator
                  compressed: false
                   Statistics: Num rows: 5 Data size: 314 Basic stats: COMPLETE Column stats: NONE
                  table:
                       input format: org.apache.hadoop.mapred.TextInputFormat
                       output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe

  Stage: Stage-0
    Fetch Operator
      limit: -1
      Processor Tree:
        ListSink

33 rows selected (0.183 seconds)

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> create index monthly_taxi_fleet5_company on table monthly_taxi_fleet5(company) as 'bitmap';
ERROR : FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.DDLTask. java.lang.RuntimeException: Please specify deferred rebuild using " WITH DEFERRED REBUILD ".
Error: Error while processing statement: FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.DDLTask. java.lang.RuntimeException: Please specify deferred rebuild using " WITH DEFERRED REBUILD ". (state=08S01,code=1)

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> create index monthly_taxi_fleet5_company on table monthly_taxi_fleet5(company) as 'bitmap' with deferred rebuild;


0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> alter index monthly_taxi_fleet5_company on monthly_taxi_fleet5 rebuild;
No rows affected (21.826 seconds)

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> explain select * from monthly_taxi_fleet5 where company='Comfort';

Explain
STAGE DEPENDENCIES:
  Stage-3 is a root stage
  Stage-2 depends on stages: Stage-3
  Stage-1 depends on stages: Stage-2
  Stage-0 depends on stages: Stage-1

STAGE PLANS:
  Stage: Stage-3
    Map Reduce
      Map Operator Tree:
          TableScan
            alias: default__monthly_taxi_fleet5_monthly_taxi_fleet5_company__
            filterExpr: ((company = 'Comfort') and (not EWAH_BITMAP_EMPTY(_bitmaps))) (type: boolean)
             Filter Operator
              predicate: ((company = 'Comfort') and (not EWAH_BITMAP_EMPTY(_bitmaps))) (type: boolean)
               Select Operator
                expressions: _bucketname (type: string), _offset (type: bigint)
                outputColumnNames: _col0, _col1
                Group By Operator
                   aggregations: collect_set(_col1)
                  keys: _col0 (type: string)
                  mode: hash
                   outputColumnNames: _col0, _col1
                  Reduce Output Operator
                    key expressions: _col0 (type: string)
                    sort order: +
                     Map-reduce partition columns: _col0 (type: string)
                     value expressions: _col1 (type: array<bigint>)
      Reduce Operator Tree:
        Group By Operator
          aggregations: collect_set(VALUE._col0)
          keys: KEY._col0 (type: string)
          mode: mergepartial
          outputColumnNames: _col0, _col1
          File Output Operator
            compressed: false
            table:
                input format: org.apache.hadoop.mapred.TextInputFormat
                output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe

  Stage: Stage-2
    Move Operator
      files:
           hdfs directory: true
          destination: hdfs://cdh-vm.dbaglobe.com:8020/tmp/hive/donghua/6086c443-dac8-4517-8118-a37d42ce56a4/hive_2017-12-25_19-08-59_026_6029178632909749720-6/-mr-10003

  Stage: Stage-1
    Map Reduce
      Map Operator Tree:
           TableScan
            alias: monthly_taxi_fleet5
             filterExpr: (company = 'Comfort') (type: boolean)
             Statistics: Num rows: 10 Data size: 628 Basic stats: COMPLETE Column stats: NONE
            Filter Operator
              predicate: (company = 'Comfort') (type: boolean)
              Statistics: Num rows: 5 Data size: 314 Basic stats: COMPLETE Column stats: NONE
              Select Operator
                expressions: month (type: char(7)), company (type: varchar(50)), fleet (type: smallint)
                outputColumnNames: _col0, _col1, _col2
                Statistics: Num rows: 5 Data size: 314 Basic stats: COMPLETE Column stats: NONE
                File Output Operator
                  compressed: false
                   Statistics: Num rows: 5 Data size: 314 Basic stats: COMPLETE Column stats: NONE
                  table:
                       input format: org.apache.hadoop.mapred.TextInputFormat
                       output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe

  Stage: Stage-0
    Fetch Operator
      limit: -1
      Processor Tree:
        ListSink

75 rows selected (0.39 seconds)


0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> select * from monthly_taxi_fleet5 where company='Comfort';
INFO  : Compiling command(queryId=hive_20171225191313_3bbd504c-a4ec-4cf8-8427-fea0f832f785): select * from monthly_taxi_fleet5 where company='Comfort'
INFO  : Compiling command(queryId=hive_20171225191313_3bbd504c-a4ec-4cf8-8427-fea0f832f785): INSERT OVERWRITE DIRECTORY "hdfs://cdh-vm.dbaglobe.com:8020/tmp/hive/donghua/5a539317-9cca-43b8-b460-de23c3d7c159/hive_2017-12-25_19-13-24_864_2203428447630231527-6/-mr-10003" SELECT bucketname AS `_bucketname` , COLLECT_SET(offset) AS `_offsets` FROM (SELECT `_bucketname` AS bucketname , `_offset` AS offset FROM (SELECT * FROM `default__monthly_taxi_fleet5_monthly_taxi_fleet5_company__` WHERE (company = 'Comfort')) ind0 WHERE NOT EWAH_BITMAP_EMPTY(ind0.`_bitmaps`) ) tmp_index GROUP BY bucketname
INFO  : Semantic Analysis Completed
INFO  : Returning Hive schema: Schema(fieldSchemas:[FieldSchema(name:_bucketname, type:string, comment:null), FieldSchema(name:_offsets, type:array<bigint>, comment:null)], properties:null)
INFO  : Completed compiling command(queryId=hive_20171225191313_3bbd504c-a4ec-4cf8-8427-fea0f832f785); Time taken: 0.07 seconds
INFO  : Semantic Analysis Completed
INFO  : Returning Hive schema: Schema(fieldSchemas:[FieldSchema(name:monthly_taxi_fleet5.month, type:char(7), comment:null), FieldSchema(name:monthly_taxi_fleet5.company, type:varchar(50), comment:null), FieldSchema(name:monthly_taxi_fleet5.fleet, type:smallint, comment:null)], properties:null)
INFO  : Completed compiling command(queryId=hive_20171225191313_3bbd504c-a4ec-4cf8-8427-fea0f832f785); Time taken: 0.071 seconds
INFO  : Concurrency mode is disabled, not creating a lock manager
INFO  : Executing command(queryId=hive_20171225191313_3bbd504c-a4ec-4cf8-8427-fea0f832f785): select * from monthly_taxi_fleet5 where company='Comfort'
INFO  : Query ID = hive_20171225191313_3bbd504c-a4ec-4cf8-8427-fea0f832f785
INFO  : Total jobs = 2
INFO  : Launching Job 1 out of 2
INFO  : Starting task [Stage-3:MAPRED] in serial mode
INFO  : Number of reduce tasks not specified. Estimated from input data size: 1
INFO  : In order to change the average load for a reducer (in bytes):
INFO  :   set hive.exec.reducers.bytes.per.reducer=<number>
INFO  : In order to limit the maximum number of reducers:
INFO  :   set hive.exec.reducers.max=<number>
INFO  : In order to set a constant number of reducers:
INFO  :   set mapreduce.job.reduces=<number>
INFO  : number of splits:1
INFO  : Submitting tokens for job: job_1513984921012_0058
INFO  : The url to track the job:
http://cdh-vm.dbaglobe.com:8088/proxy/application_1513984921012_0058/
INFO  : Starting Job = job_1513984921012_0058, Tracking URL = http://cdh-vm.dbaglobe.com:8088/proxy/application_1513984921012_0058/
INFO  : Kill Command = /opt/cloudera/parcels/CDH-5.13.1-1.cdh5.13.1.p0.2/lib/hadoop/bin/hadoop job  -kill job_1513984921012_0058
INFO  : Hadoop job information for Stage-3: number of mappers: 1; number of reducers: 1
INFO  : 2017-12-25 19:13:33,536 Stage-3 map = 0%,  reduce = 0%
INFO  : 2017-12-25 19:13:38,875 Stage-3 map = 100%,  reduce = 0%, Cumulative CPU 1.62 sec
INFO  : 2017-12-25 19:13:45,406 Stage-3 map = 100%,  reduce = 100%, Cumulative CPU 3.47 sec
INFO  : MapReduce Total cumulative CPU time: 3 seconds 470 msec
INFO  : Ended Job = job_1513984921012_0058
INFO  : Starting task [Stage-2:MOVE] in serial mode
INFO  : Moving data to: hdfs://cdh-vm.dbaglobe.com:8020/tmp/hive/donghua/5a539317-9cca-43b8-b460-de23c3d7c159/hive_2017-12-25_19-13-24_864_2203428447630231527-6/-mr-10003 from hdfs://cdh-vm.dbaglobe.com:8020/tmp/hive/donghua/5a539317-9cca-43b8-b460-de23c3d7c159/hive_2017-12-25_19-13-24_864_2203428447630231527-6/-mr-10003/.hive-staging_hive_2017-12-25_19-13-24_972_8970662548653177884-6/-ext-10000
INFO  : Launching Job 2 out of 2
INFO  : Starting task [Stage-1:MAPRED] in serial mode
INFO  : Number of reduce tasks is set to 0 since there's no reduce operator
INFO  : number of splits:2
INFO  : Submitting tokens for job: job_1513984921012_0059
INFO  : The url to track the job:
http://cdh-vm.dbaglobe.com:8088/proxy/application_1513984921012_0059/
INFO  : Starting Job = job_1513984921012_0059, Tracking URL = http://cdh-vm.dbaglobe.com:8088/proxy/application_1513984921012_0059/
INFO  : Kill Command = /opt/cloudera/parcels/CDH-5.13.1-1.cdh5.13.1.p0.2/lib/hadoop/bin/hadoop job  -kill job_1513984921012_0059
INFO  : Hadoop job information for Stage-1: number of mappers: 2; number of reducers: 0
INFO  : 2017-12-25 19:13:54,773 Stage-1 map = 0%,  reduce = 0%
INFO  : 2017-12-25 19:14:04,866 Stage-1 map = 50%,  reduce = 0%, Cumulative CPU 1.79 sec
INFO  : 2017-12-25 19:14:06,983 Stage-1 map = 100%,  reduce = 0%, Cumulative CPU 4.9 sec
INFO  : MapReduce Total cumulative CPU time: 4 seconds 900 msec
INFO  : Ended Job = job_1513984921012_0059
INFO  : MapReduce Jobs Launched:
INFO  : Stage-Stage-3: Map: 1  Reduce: 1   Cumulative CPU: 3.47 sec   HDFS Read: 11432 HDFS Write: 88 SUCCESS
INFO  : Stage-Stage-1: Map: 2   Cumulative CPU: 4.9 sec   HDFS Read: 11938 HDFS Write: 43 SUCCESS
INFO  : Total MapReduce CPU Time Spent: 8 seconds 370 msec
INFO  : Completed executing command(queryId=hive_20171225191313_3bbd504c-a4ec-4cf8-8427-fea0f832f785); Time taken: 43.095 seconds
INFO  : OK
monthly_taxi_fleet5.month,monthly_taxi_fleet5.company,monthly_taxi_fleet5.fleet
2005-02,Comfort,10046
2005-01,Comfort,9952
2 rows selected (43.346 seconds)

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> !outputformat vertical


0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> select * from default__monthly_taxi_fleet5_monthly_taxi_fleet5_company__ company_index;


company_index.company      CityCab
company_index._bucketname  hdfs://cdh-vm.dbaglobe.com:8020/user/hive/warehouse/monthly_taxi_fleet5/000000_0
company_index._offset      62
company_index._bitmaps     [1,2,4,8589934592,1,0]

company_index.company      Comfort
company_index._bucketname  hdfs://cdh-vm.dbaglobe.com:8020/user/hive/warehouse/monthly_taxi_fleet5/000000_0
company_index._offset      62
company_index._bitmaps     [1,2,4,8589934592,1,0]

company_index.company      Premier
company_index._bucketname  hdfs://cdh-vm.dbaglobe.com:8020/user/hive/warehouse/monthly_taxi_fleet5/000000_0
company_index._offset      125
company_index._bitmaps     [1,2,4,8589934592,1,0]

company_index.company      TransCab
company_index._bucketname  hdfs://cdh-vm.dbaglobe.com:8020/user/hive/warehouse/monthly_taxi_fleet5/000000_0
company_index._offset      188
company_index._bitmaps     [1,2,4,8589934592,1,0]

company_index.company      Smart
company_index._bucketname  hdfs://cdh-vm.dbaglobe.com:8020/user/hive/warehouse/monthly_taxi_fleet5/000000_0
company_index._offset      251
company_index._bitmaps     [1,2,4,8589934592,1,0]

company_index.company      Individual Yellow- Top
company_index._bucketname  hdfs://cdh-vm.dbaglobe.com:8020/user/hive/warehouse/monthly_taxi_fleet5/000000_0
company_index._offset      314
company_index._bitmaps     [1,2,4,8589934592,1,0]

company_index.company      YTC
company_index._bucketname  hdfs://cdh-vm.dbaglobe.com:8020/user/hive/warehouse/monthly_taxi_fleet5/000000_0
company_index._offset      376
company_index._bitmaps     [1,2,4,8589934592,1,0]

company_index.company      SMRT
company_index._bucketname  hdfs://cdh-vm.dbaglobe.com:8020/user/hive/warehouse/monthly_taxi_fleet5/000000_0
company_index._offset      439
company_index._bitmaps     [1,2,4,8589934592,1,0]

company_index.company      CityCab
company_index._bucketname  hdfs://cdh-vm.dbaglobe.com:8020/user/hive/warehouse/monthly_taxi_fleet5/000000_0
company_index._offset      502
company_index._bitmaps     [1,2,4,8589934592,1,0]

company_index.company      Comfort
company_index._bucketname  hdfs://cdh-vm.dbaglobe.com:8020/user/hive/warehouse/monthly_taxi_fleet5/000000_0
company_index._offset      565
company_index._bitmaps     [1,2,4,8589934592,1,0]

10 rows selected (0.143 seconds)

Example to drop the index:

0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> drop index monthly_taxi_fleet5_company on monthly_taxi_fleet5;