More Hive snippets.

dleung · Jan 28, 2012 · ae3337a · ae3337a
1 parent ab711a4
commit ae3337a
Show file tree

Hide file tree

Showing 17 changed files with 269 additions and 153 deletions.
diff --git a/ch12/src/main/hive/buckets.hive b/ch12/src/main/hive/buckets.hive
@@ -0,0 +1,46 @@
+! echo; # == select_users;
+! echo; # == sample_1_in_4;
+! echo; # == sample_1_in_2;
+
+DROP TABLE IF EXISTS users;
+
+CREATE TABLE users (id INT, name STRING);
+
+LOAD DATA LOCAL INPATH 'input/hive/tables/users.txt'
+OVERWRITE INTO TABLE users;
+
+dfs -cat /user/hive/warehouse/users/users.txt;
+
+DROP TABLE IF EXISTS bucketed_users;
+
+CREATE TABLE bucketed_users (id INT, name STRING)
+CLUSTERED BY (id) INTO 4 BUCKETS;
+
+DROP TABLE bucketed_users;
+
+CREATE TABLE bucketed_users (id INT, name STRING)
+CLUSTERED BY (id) SORTED BY (id) INTO 4 BUCKETS;
+
+! echo; # vv select_users;
+SELECT * FROM users;
+! echo; # ^^ select_users;
+
+SET hive.enforce.bucketing=true;
+
+INSERT OVERWRITE TABLE bucketed_users
+SELECT * FROM users;
+
+dfs -ls /user/hive/warehouse/bucketed_users;
+
+! echo; # vv sample_1_in_4;
+SELECT * FROM bucketed_users
+TABLESAMPLE(BUCKET 1 OUT OF 4 ON id);
+! echo; # ^^ sample_1_in_4;
+
+! echo; # vv sample_1_in_2;
+SELECT * FROM bucketed_users
+TABLESAMPLE(BUCKET 1 OUT OF 2 ON id);
+! echo; # ^^ sample_1_in_2;
+
+SELECT * FROM users
+TABLESAMPLE(BUCKET 1 OUT OF 4 ON rand());
diff --git a/ch12/src/main/hive/buckets.txt b/ch12/src/main/hive/buckets.txt
diff --git a/ch12/src/main/hive/joins.hive b/ch12/src/main/hive/joins.hive
@@ -0,0 +1,66 @@
+! echo; # == select_sales_and_things;
+! echo; # == inner_join;
+! echo; # == left_outer_join;
+! echo; # == right_outer_join;
+! echo; # == full_outer_join;
+! echo; # == select_things_in_sales;
+! echo; # == semi_join;
+! echo; # == map_join;
+
+DROP TABLE IF EXISTS sales;
+CREATE TABLE sales (name STRING, id INT)
+ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
+
+DROP TABLE IF EXISTS things;
+CREATE TABLE things (id INT, name STRING)
+ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
+
+LOAD DATA LOCAL INPATH 'input/hive/joins/sales.txt'
+OVERWRITE INTO TABLE sales;
+
+LOAD DATA LOCAL INPATH 'input/hive/joins/things.txt'
+OVERWRITE INTO TABLE things;
+
+! echo; # vv select_sales_and_things;
+SELECT * FROM sales;
+SELECT * FROM things;
+! echo; # ^^ select_sales_and_things;
+
+! echo; # vv inner_join;
+SELECT sales.*, things.*
+FROM sales JOIN things ON (sales.id = things.id);
+! echo; # ^^ inner_join;
+
+! echo; # vv left_outer_join;
+SELECT sales.*, things.*
+FROM sales LEFT OUTER JOIN things ON (sales.id = things.id);
+! echo; # ^^ left_outer_join;
+
+! echo; # vv right_outer_join;
+SELECT sales.*, things.*
+FROM sales RIGHT OUTER JOIN things ON (sales.id = things.id);
+! echo; # ^^ right_outer_join;
+
+! echo; # vv full_outer_join;
+SELECT sales.*, things.*
+FROM sales FULL OUTER JOIN things ON (sales.id = things.id);
+! echo; # ^^ full_outer_join;
+
+! echo; # vv select_things_in_sales;
+SELECT *
+FROM things
+WHERE things.id IN (SELECT id from sales);
+! echo; # ^^ select_things_in_sales;
+
+! echo; # vv semi_join;
+SELECT *
+FROM things LEFT SEMI JOIN sales ON (sales.id = things.id);
+! echo; # ^^ semi_join;
+
+! echo; # vv map_join;
+SELECT /*+ MAPJOIN(things) */ sales.*, things.*
+FROM sales JOIN things ON (sales.id = things.id);
+! echo; # ^^ map_join;
+
+SELECT /*+ MAPJOIN(things) */ sales.*, things.*
+FROM sales LEFT OUTER JOIN things ON (sales.id = things.id);
diff --git a/ch12/src/main/hive/joins.txt b/ch12/src/main/hive/joins.txt
diff --git a/ch12/src/main/hive/mapreduce.txt → ch12/src/main/hive/mapreduce.hive b/ch12/src/main/hive/mapreduce.txt → ch12/src/main/hive/mapreduce.hive
@@ -1,10 +1,23 @@
-ADD FILE /Users/tom/book-workspace/hadoop-book/ch12/src/main/python/is_good_quality.py; 
-ADD FILE /Users/tom/book-workspace/hadoop-book/ch12/src/main/python/max_temperature_reduce.py;
+! echo; # == mapreduce_is_good_quality;
+
+DROP TABLE IF EXISTS records2;
+
+CREATE TABLE records2 (station STRING, year STRING, temperature INT, quality INT)
+ROW FORMAT DELIMITED
+  FIELDS TERMINATED BY '\t';
+
+LOAD DATA LOCAL INPATH 'input/ncdc/micro-tab/sample2.txt'
+OVERWRITE INTO TABLE records2;
 
+! echo; # vv mapreduce_is_good_quality;
+ADD FILE /Users/tom/book-workspace/hadoop-book/ch12/src/main/python/is_good_quality.py; 
 FROM records2
 SELECT TRANSFORM(year, temperature, quality)
 USING 'is_good_quality.py'
 AS year, temperature;
+! echo; # ^^ mapreduce_is_good_quality;
+
+ADD FILE ch12/src/main/python/max_temperature_reduce.py;
 
 FROM (
   FROM records2
@@ -23,4 +36,3 @@ FROM (
 SELECT TRANSFORM(year, temperature)
 USING 'max_temperature_reduce.py'
 AS year, temperature;
-
diff --git a/ch12/src/main/hive/max_temp.hive.old → ch12/src/main/hive/multitable_insert.hive b/ch12/src/main/hive/max_temp.hive.old → ch12/src/main/hive/multitable_insert.hive
@@ -1,38 +1,15 @@
-CREATE TABLE records (year STRING, temperature INT, quality INT)
-ROW FORMAT DELIMITED
-  FIELDS TERMINATED BY '\t';
-
-LOAD DATA LOCAL INPATH 'input/ncdc/micro-tab/sample.txt'
-OVERWRITE INTO TABLE records;
-
-SELECT year, MAX(temperature)
-FROM records
-WHERE temperature != 9999
-  AND (quality = 0 OR quality = 1 OR quality = 4 OR quality = 5 OR quality = 9)
-GROUP BY year;
-
--- weather dataset stats
 
+DROP TABLE IF exists records2;
 CREATE TABLE records2 (station STRING, year STRING, temperature INT, quality INT)
 ROW FORMAT DELIMITED
   FIELDS TERMINATED BY '\t';
 
 LOAD DATA LOCAL INPATH '/Users/tom/book-workspace/hadoop-book/input/ncdc/micro-tab/sample2.txt'
 OVERWRITE INTO TABLE records2;
 
--- total stations and records (by year)
-SELECT year, COUNT(DISTINCT station), COUNT(1)
-FROM records2
-GROUP BY year;
-
--- total good records (by year)
-SELECT year, COUNT(1)
-FROM records2
-WHERE temperature != 9999
-  AND (quality = 0 OR quality = 1 OR quality = 4 OR quality = 5 OR quality = 9)
-GROUP BY year;
-
--- scans data twice...
+DROP TABLE IF exists stations_by_year;
+DROP TABLE IF exists records_by_year;
+DROP TABLE IF exists good_records_by_year;
 
 CREATE TABLE stations_by_year (year STRING, num INT);
 CREATE TABLE records_by_year (year STRING, num INT);
@@ -51,26 +28,12 @@ INSERT OVERWRITE TABLE good_records_by_year
     AND (quality = 0 OR quality = 1 OR quality = 4 OR quality = 5 OR quality = 9)
   GROUP BY year;  
 
+SELECT * FROM stations_by_year;
+SELECT * FROM records_by_year;
+SELECT * FROM good_records_by_year;
+
 SELECT /*+ MAPJOIN(records_by_year,good_records_by_year) */
   stations_by_year.year, stations_by_year.num, records_by_year.num, good_records_by_year.num
 FROM stations_by_year
   JOIN records_by_year ON (stations_by_year.year = records_by_year.year)
   JOIN good_records_by_year ON (stations_by_year.year = good_records_by_year.year);
-
-
--- 
-
--- DISTRIBUTED BY
-
-FROM records2
-SELECT year, temperature
-DISTRIBUTE BY year
-SORT BY year ASC, temperature DESC;
-
-FROM (
-FROM records2
-SELECT year, temperature
-DISTRIBUTE BY year
-SORT BY year ASC, temperature DESC) t
-SELECT year, MAX(temperature)
-GROUP BY year;
diff --git a/ch12/src/main/hive/partitions.hive b/ch12/src/main/hive/partitions.hive
@@ -33,4 +33,4 @@ SHOW PARTITIONS logs;
 
 SELECT ts, dt, line
 FROM logs
-WHERE country='GB';
+WHERE country='GB';
diff --git a/ch12/src/main/hive/rcfile.txt b/ch12/src/main/hive/rcfile.txt
diff --git a/ch12/src/main/hive/regex_serde.hive b/ch12/src/main/hive/regex_serde.hive
@@ -0,0 +1,16 @@
+! echo; # == select_stations;
+
+DROP TABLE IF EXISTS stations;
+
+CREATE TABLE stations (usaf STRING, wban STRING, name STRING)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
+WITH SERDEPROPERTIES (
+  "input.regex" = "(\\d{6}) (\\d{5}) (.{29}) .*"
+);
+
+LOAD DATA LOCAL INPATH "input/ncdc/metadata/stations-fixed-width.txt"
+INTO TABLE stations;
+
+! echo; # vv select_stations;
+SELECT * FROM stations LIMIT 4;
+! echo; # ^^ select_stations;
diff --git a/ch12/src/main/hive/set.hive b/ch12/src/main/hive/set.hive
@@ -0,0 +1,15 @@
+! echo; # == define_function;
+! echo; # == set_value;
+! echo; # == set_show_value;
+
+! echo; # vv define_function;
+DESCRIBE FUNCTION length;
+! echo; # ^^ define_function;
+
+! echo; # vv set_value;
+SET hive.enforce.bucketing=true;
+! echo; # ^^ set_value;
+
+! echo; # vv set_show_value;
+SET hive.enforce.bucketing;
+! echo; # ^^ set_show_value;
diff --git a/ch12/src/main/hive/sort.hive b/ch12/src/main/hive/sort.hive
@@ -0,0 +1,17 @@
+! echo; # == sort_by_year;
+
+DROP TABLE IF EXISTS records2;
+
+CREATE TABLE records2 (station STRING, year STRING, temperature INT, quality INT)
+ROW FORMAT DELIMITED
+  FIELDS TERMINATED BY '\t';
+
+LOAD DATA LOCAL INPATH 'input/ncdc/micro-tab/sample2.txt'
+OVERWRITE INTO TABLE records2;
+
+! echo; # vv sort_by_year;
+FROM records2
+SELECT year, temperature
+DISTRIBUTE BY year
+SORT BY year ASC, temperature DESC;
+! echo; # ^^ sort_by_year;