Skip to content

Commit

Permalink
More Hive snippets.
Browse files Browse the repository at this point in the history
  • Loading branch information
tomwhite committed Jan 28, 2012
1 parent ab711a4 commit ae3337a
Show file tree
Hide file tree
Showing 17 changed files with 269 additions and 153 deletions.
46 changes: 46 additions & 0 deletions ch12/src/main/hive/buckets.hive
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
! echo; # == select_users;
! echo; # == sample_1_in_4;
! echo; # == sample_1_in_2;

DROP TABLE IF EXISTS users;

CREATE TABLE users (id INT, name STRING);

LOAD DATA LOCAL INPATH 'input/hive/tables/users.txt'
OVERWRITE INTO TABLE users;

dfs -cat /user/hive/warehouse/users/users.txt;

DROP TABLE IF EXISTS bucketed_users;

CREATE TABLE bucketed_users (id INT, name STRING)
CLUSTERED BY (id) INTO 4 BUCKETS;

DROP TABLE bucketed_users;

CREATE TABLE bucketed_users (id INT, name STRING)
CLUSTERED BY (id) SORTED BY (id) INTO 4 BUCKETS;

! echo; # vv select_users;
SELECT * FROM users;
! echo; # ^^ select_users;

SET hive.enforce.bucketing=true;

INSERT OVERWRITE TABLE bucketed_users
SELECT * FROM users;

dfs -ls /user/hive/warehouse/bucketed_users;

! echo; # vv sample_1_in_4;
SELECT * FROM bucketed_users
TABLESAMPLE(BUCKET 1 OUT OF 4 ON id);
! echo; # ^^ sample_1_in_4;

! echo; # vv sample_1_in_2;
SELECT * FROM bucketed_users
TABLESAMPLE(BUCKET 1 OUT OF 2 ON id);
! echo; # ^^ sample_1_in_2;

SELECT * FROM users
TABLESAMPLE(BUCKET 1 OUT OF 4 ON rand());
47 changes: 0 additions & 47 deletions ch12/src/main/hive/buckets.txt

This file was deleted.

66 changes: 66 additions & 0 deletions ch12/src/main/hive/joins.hive
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
! echo; # == select_sales_and_things;
! echo; # == inner_join;
! echo; # == left_outer_join;
! echo; # == right_outer_join;
! echo; # == full_outer_join;
! echo; # == select_things_in_sales;
! echo; # == semi_join;
! echo; # == map_join;

DROP TABLE IF EXISTS sales;
CREATE TABLE sales (name STRING, id INT)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';

DROP TABLE IF EXISTS things;
CREATE TABLE things (id INT, name STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';

LOAD DATA LOCAL INPATH 'input/hive/joins/sales.txt'
OVERWRITE INTO TABLE sales;

LOAD DATA LOCAL INPATH 'input/hive/joins/things.txt'
OVERWRITE INTO TABLE things;

! echo; # vv select_sales_and_things;
SELECT * FROM sales;
SELECT * FROM things;
! echo; # ^^ select_sales_and_things;

! echo; # vv inner_join;
SELECT sales.*, things.*
FROM sales JOIN things ON (sales.id = things.id);
! echo; # ^^ inner_join;

! echo; # vv left_outer_join;
SELECT sales.*, things.*
FROM sales LEFT OUTER JOIN things ON (sales.id = things.id);
! echo; # ^^ left_outer_join;

! echo; # vv right_outer_join;
SELECT sales.*, things.*
FROM sales RIGHT OUTER JOIN things ON (sales.id = things.id);
! echo; # ^^ right_outer_join;

! echo; # vv full_outer_join;
SELECT sales.*, things.*
FROM sales FULL OUTER JOIN things ON (sales.id = things.id);
! echo; # ^^ full_outer_join;

! echo; # vv select_things_in_sales;
SELECT *
FROM things
WHERE things.id IN (SELECT id from sales);
! echo; # ^^ select_things_in_sales;

! echo; # vv semi_join;
SELECT *
FROM things LEFT SEMI JOIN sales ON (sales.id = things.id);
! echo; # ^^ semi_join;

! echo; # vv map_join;
SELECT /*+ MAPJOIN(things) */ sales.*, things.*
FROM sales JOIN things ON (sales.id = things.id);
! echo; # ^^ map_join;

SELECT /*+ MAPJOIN(things) */ sales.*, things.*
FROM sales LEFT OUTER JOIN things ON (sales.id = things.id);
49 changes: 0 additions & 49 deletions ch12/src/main/hive/joins.txt

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,10 +1,23 @@
ADD FILE /Users/tom/book-workspace/hadoop-book/ch12/src/main/python/is_good_quality.py;
ADD FILE /Users/tom/book-workspace/hadoop-book/ch12/src/main/python/max_temperature_reduce.py;
! echo; # == mapreduce_is_good_quality;

DROP TABLE IF EXISTS records2;

CREATE TABLE records2 (station STRING, year STRING, temperature INT, quality INT)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t';

LOAD DATA LOCAL INPATH 'input/ncdc/micro-tab/sample2.txt'
OVERWRITE INTO TABLE records2;

! echo; # vv mapreduce_is_good_quality;
ADD FILE /Users/tom/book-workspace/hadoop-book/ch12/src/main/python/is_good_quality.py;
FROM records2
SELECT TRANSFORM(year, temperature, quality)
USING 'is_good_quality.py'
AS year, temperature;
! echo; # ^^ mapreduce_is_good_quality;

ADD FILE ch12/src/main/python/max_temperature_reduce.py;

FROM (
FROM records2
Expand All @@ -23,4 +36,3 @@ FROM (
SELECT TRANSFORM(year, temperature)
USING 'max_temperature_reduce.py'
AS year, temperature;

Original file line number Diff line number Diff line change
@@ -1,38 +1,15 @@
CREATE TABLE records (year STRING, temperature INT, quality INT)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t';

LOAD DATA LOCAL INPATH 'input/ncdc/micro-tab/sample.txt'
OVERWRITE INTO TABLE records;

SELECT year, MAX(temperature)
FROM records
WHERE temperature != 9999
AND (quality = 0 OR quality = 1 OR quality = 4 OR quality = 5 OR quality = 9)
GROUP BY year;

-- weather dataset stats

DROP TABLE IF exists records2;
CREATE TABLE records2 (station STRING, year STRING, temperature INT, quality INT)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t';

LOAD DATA LOCAL INPATH '/Users/tom/book-workspace/hadoop-book/input/ncdc/micro-tab/sample2.txt'
OVERWRITE INTO TABLE records2;

-- total stations and records (by year)
SELECT year, COUNT(DISTINCT station), COUNT(1)
FROM records2
GROUP BY year;

-- total good records (by year)
SELECT year, COUNT(1)
FROM records2
WHERE temperature != 9999
AND (quality = 0 OR quality = 1 OR quality = 4 OR quality = 5 OR quality = 9)
GROUP BY year;

-- scans data twice...
DROP TABLE IF exists stations_by_year;
DROP TABLE IF exists records_by_year;
DROP TABLE IF exists good_records_by_year;

CREATE TABLE stations_by_year (year STRING, num INT);
CREATE TABLE records_by_year (year STRING, num INT);
Expand All @@ -51,26 +28,12 @@ INSERT OVERWRITE TABLE good_records_by_year
AND (quality = 0 OR quality = 1 OR quality = 4 OR quality = 5 OR quality = 9)
GROUP BY year;

SELECT * FROM stations_by_year;
SELECT * FROM records_by_year;
SELECT * FROM good_records_by_year;

SELECT /*+ MAPJOIN(records_by_year,good_records_by_year) */
stations_by_year.year, stations_by_year.num, records_by_year.num, good_records_by_year.num
FROM stations_by_year
JOIN records_by_year ON (stations_by_year.year = records_by_year.year)
JOIN good_records_by_year ON (stations_by_year.year = good_records_by_year.year);


--

-- DISTRIBUTED BY

FROM records2
SELECT year, temperature
DISTRIBUTE BY year
SORT BY year ASC, temperature DESC;

FROM (
FROM records2
SELECT year, temperature
DISTRIBUTE BY year
SORT BY year ASC, temperature DESC) t
SELECT year, MAX(temperature)
GROUP BY year;
2 changes: 1 addition & 1 deletion ch12/src/main/hive/partitions.hive
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,4 @@ SHOW PARTITIONS logs;

SELECT ts, dt, line
FROM logs
WHERE country='GB';
WHERE country='GB';
3 changes: 0 additions & 3 deletions ch12/src/main/hive/rcfile.txt

This file was deleted.

16 changes: 16 additions & 0 deletions ch12/src/main/hive/regex_serde.hive
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
! echo; # == select_stations;

DROP TABLE IF EXISTS stations;

CREATE TABLE stations (usaf STRING, wban STRING, name STRING)
ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
"input.regex" = "(\\d{6}) (\\d{5}) (.{29}) .*"
);

LOAD DATA LOCAL INPATH "input/ncdc/metadata/stations-fixed-width.txt"
INTO TABLE stations;

! echo; # vv select_stations;
SELECT * FROM stations LIMIT 4;
! echo; # ^^ select_stations;
15 changes: 15 additions & 0 deletions ch12/src/main/hive/set.hive
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
! echo; # == define_function;
! echo; # == set_value;
! echo; # == set_show_value;

! echo; # vv define_function;
DESCRIBE FUNCTION length;
! echo; # ^^ define_function;

! echo; # vv set_value;
SET hive.enforce.bucketing=true;
! echo; # ^^ set_value;

! echo; # vv set_show_value;
SET hive.enforce.bucketing;
! echo; # ^^ set_show_value;
17 changes: 17 additions & 0 deletions ch12/src/main/hive/sort.hive
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
! echo; # == sort_by_year;

DROP TABLE IF EXISTS records2;

CREATE TABLE records2 (station STRING, year STRING, temperature INT, quality INT)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t';

LOAD DATA LOCAL INPATH 'input/ncdc/micro-tab/sample2.txt'
OVERWRITE INTO TABLE records2;

! echo; # vv sort_by_year;
FROM records2
SELECT year, temperature
DISTRIBUTE BY year
SORT BY year ASC, temperature DESC;
! echo; # ^^ sort_by_year;
Loading

0 comments on commit ae3337a

Please sign in to comment.