Tuesday, December 26, 2017

Hive table sampling explained with examples

-- Bucket sampling on `month`: return the rows that fall into bucket 1 of 3.
-- NOTE(review): this is meant to leverage prebuilt buckets, i.e. it assumes
-- monthly_taxi_fleet6 is clustered by `month` into 3 buckets -- confirm
-- against the table DDL.
SELECT *
FROM monthly_taxi_fleet6
    TABLESAMPLE (BUCKET 1 OUT OF 3 ON month);

-- Bucket sampling on `month` with a bucket count that differs from the
-- table's own: return the rows that fall into bucket 1 of 10.
-- NOTE(review): the intent is to reuse the table's prebuilt buckets
-- (presumably 3, clustered by `month`) and re-split them into 10 at query
-- time. Since 10 is neither a multiple nor a factor of 3, confirm with
-- EXPLAIN whether Hive can still prune input to prebuilt bucket files.
SELECT *
FROM monthly_taxi_fleet6
    TABLESAMPLE (BUCKET 1 OUT OF 10 ON month);

-- Bucket sampling on `company`: return the rows that fall into bucket 1 of 3
-- when rows are bucketed on the `company` column.
-- NOTE(review): presumably `company` is not the table's bucketing column, so
-- the buckets are built dynamically at query time rather than read from
-- prebuilt bucket files -- confirm against the table DDL.
SELECT *
FROM monthly_taxi_fleet6
    TABLESAMPLE (BUCKET 1 OUT OF 3 ON company);

-- Block-based sampling: limit the input to 5 percent of the table.
-- NOTE(review): per the Hive sampling documentation the percentage is
-- understood to apply to input data size rather than to row count, so the
-- number of rows returned is not exactly 5% -- verify for this Hive version.
SELECT *
FROM monthly_taxi_fleet6
    TABLESAMPLE (5 PERCENT);

-- Block-based sampling: limit the input by storage size (5M).
-- NOTE(review): the `M` suffix is presumably megabytes of input data, not a
-- row count -- verify against the Hive sampling documentation.
SELECT *
FROM monthly_taxi_fleet6
    TABLESAMPLE (5M);

-- Row-based sampling: limit the input on a row-count basis (10 rows).
-- NOTE(review): the Hive sampling documentation is understood to apply the
-- row count per input split, so the total returned may exceed 10 when the
-- table spans several splits -- verify for this Hive version.
SELECT *
FROM monthly_taxi_fleet6
    TABLESAMPLE (10 ROWS);