From 7edf6f66ee9421d5dbf991fc2cdc5393d0c3d338 Mon Sep 17 00:00:00 2001 From: Jason Stirnaman Date: Tue, 18 Feb 2014 22:55:50 -0600 Subject: [PATCH 1/3] Updated for compatibility with Pig 0.12 and Hadoop 2.0.x. AvroStorage() is now built-in to Pig; Piggybank not required. --- ch03/README.md | 13 +++++++++---- ch03/pig/sent_counts.pig | 11 +++++------ 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/ch03/README.md b/ch03/README.md index d3e75fe..8299ee9 100644 --- a/ch03/README.md +++ b/ch03/README.md @@ -27,10 +27,15 @@ cd gmail ## Download Apache Pig ## ``` -wget http://www.trieuvan.com/apache/pig/pig-0.10.1/pig-0.10.1.tar.gz -tar -xvzf pig-0.10.1.tar.gz -cd pig-0.10.1 -ant +wget http://mirrors.ibiblio.org/apache/pig/pig-0.12.0/pig-0.12.0.tar.gz +tar -xvzf pig-*.tar.gz +cd pig-0.12.0 +``` + +## Compile Pig for Hadoop 2.0.x ## + +``` +ant clean jar-withouthadoop -Dhadoopversion=23 ``` Now you can run 'bin/pig'! diff --git a/ch03/pig/sent_counts.pig b/ch03/pig/sent_counts.pig index f6a44fd..05f4cf1 100644 --- a/ch03/pig/sent_counts.pig +++ b/ch03/pig/sent_counts.pig @@ -1,16 +1,15 @@ /* Set Home Directory - where we install software */ %default HOME `echo \$HOME/Software/` -REGISTER $HOME/pig/build/ivy/lib/Pig/avro-1.5.3.jar -REGISTER $HOME/pig/build/ivy/lib/Pig/json-simple-1.1.jar -REGISTER $HOME/pig/contrib/piggybank/java/piggybank.jar - -DEFINE AvroStorage org.apache.pig.piggybank.storage.avro.AvroStorage(); +REGISTER $HOME/pig/pig-*.jar +REGISTER $HOME/pig/build/ivy/lib/Pig/avro-*.jar +REGISTER $HOME/pig/build/ivy/lib/Pig/json-simple-*.jar +DEFINE AvroStorage org.apache.pig.builtin.AvroStorage(); rmf /tmp/sent_counts.txt /* Load the emails in avro format (edit the path to match where you saved them) using the AvroStorage UDF from Piggybank */ -messages = LOAD '/me/Data/test_mbox' USING AvroStorage(); +messages = LOAD '/Data/test_mbox' USING AvroStorage(); /* Filter nulls, they won't help */ messages = FILTER messages BY (from IS NOT NULL) AND (tos 
IS NOT NULL); From 1d6442cb61656f7a0d1d727f5bdde71a92ebf57d Mon Sep 17 00:00:00 2001 From: Jason Stirnaman Date: Wed, 19 Feb 2014 21:35:38 -0600 Subject: [PATCH 2/3] Updated MongoStorage() to MongoInsertStorage as described in https://github.com/alabid/mongo-hadoop/blob/issues/pig/mongo-update-storage/pig/README.md. Updated MongoDB Java driver download link to snapshot link and added register path to mongo.pig. --- ch03/README.md | 12 ++++++------ ch03/pig/avro_to_mongo.pig | 21 ++++++++++++--------- ch03/pig/mongo.pig | 9 ++++----- 3 files changed, 22 insertions(+), 20 deletions(-) mode change 100644 => 100755 ch03/pig/mongo.pig diff --git a/ch03/README.md b/ch03/README.md index 8299ee9..3eddfef 100644 --- a/ch03/README.md +++ b/ch03/README.md @@ -94,11 +94,11 @@ bin/mongo agile_data ## Install MongoDB's Java Driver ## -The MongoDB Java driver is available at https://github.com/mongodb/mongo-java-driver/downloads Download it, and place it at the base of your MongoDB install directory. +The MongoDB Java driver is available at https://github.com/mongodb/mongo-java-driver/downloads, or you can download a recent snapshot like the one below. Register its path in pig/mongo.pig. ``` cd -wget https://github.com/downloads/mongodb/mongo-java-driver/mongo-2.10.1.jar +wget https://oss.sonatype.org/content/repositories/snapshots/org/mongodb/mongo-java-driver/2.12.0-SNAPSHOT/mongo-java-driver-2.12.0-20140213.053134-54.jar ``` ## Install mongo-hadoop ## @@ -125,15 +125,15 @@ find .|grep jar Fix the paths in 'ch3/pig/mongo.pig' to point at your install paths and run it, to store the email sent counts to MongoDB. 
``` -REGISTER /mongo-2.10.1.jar -REGISTER /core/target/mongo-hadoop-core-1.1.0-SNAPSHOT.jar -REGISTER /pig/target/mongo-hadoop-pig-1.1.0-SNAPSHOT.jar +REGISTER $HOME/mongo-java-driver*.jar +REGISTER $HOME/core/target/mongo-hadoop-core-*.jar +REGISTER $HOME/pig/target/mongo-hadoop-pig-*.jar set mapred.map.tasks.speculative.execution false set mapred.reduce.tasks.speculative.execution false sent_counts = LOAD '/tmp/sent_counts.txt' AS (from:chararray, to:chararray, total:long); -STORE sent_counts INTO 'mongodb://localhost/agile_data.sent_counts' USING com.mongodb.hadoop.pig.MongoStorage(); +STORE sent_counts INTO 'mongodb://localhost/agile_data.sent_counts' USING com.mongodb.hadoop.pig.MongoInsertStorage('',''); ``` ## Connect to MongoDB from Python ## diff --git a/ch03/pig/avro_to_mongo.pig b/ch03/pig/avro_to_mongo.pig index 768a681..4ba7894 100644 --- a/ch03/pig/avro_to_mongo.pig +++ b/ch03/pig/avro_to_mongo.pig @@ -2,20 +2,23 @@ %default HOME `echo \$HOME/Software/` /* Load Avro jars and define shortcut */ -REGISTER $HOME/pig/build/ivy/lib/Pig/avro-1.5.3.jar -REGISTER $HOME/pig/build/ivy/lib/Pig/json-simple-1.1.jar -REGISTER $HOME/pig/contrib/piggybank/java/piggybank.jar -define AvroStorage org.apache.pig.piggybank.storage.avro.AvroStorage(); +REGISTER $HOME/pig/build/ivy/lib/Pig/avro-*.jar +REGISTER /$HOME/pig/build/ivy/lib/Pig/json-simple-*.jar +DEFINE AvroStorage org.apache.pig.builtin.AvroStorage(); /* MongoDB libraries and configuration */ -REGISTER $HOME/mongo-hadoop/mongo-2.10.1.jar -REGISTER $HOME/mongo-hadoop/core/target/mongo-hadoop-core-1.1.0-SNAPSHOT.jar -REGISTER $HOME/mongo-hadoop/pig/target/mongo-hadoop-pig-1.1.0-SNAPSHOT.jar +REGISTER $HOME/mongo-hadoop/core/target/mongo-hadoop-core_1.0.4-1.2.0.jar +REGISTER $HOME/mongo-hadoop/flume/target/mongo-flume-1.2.0.jar +REGISTER $HOME/mongo-hadoop/gradle/wrapper/gradle-wrapper.jar +REGISTER $HOME/mongo-hadoop/hive/target/mongo-hadoop-hive_1.0.4-1.2.0.jar +REGISTER $HOME/mongo-hadoop/mongo-2.4.jar +REGISTER 
$HOME/mongo-hadoop/pig/target/mongo-hadoop-pig_1.0.4-1.2.0.jar +REGISTER $HOME/mongo-hadoop/target/mongo-hadoop_1.0.4-1.2.0.jar /* Set speculative execution off so we don't have the chance of duplicate records in Mongo */ set mapred.map.tasks.speculative.execution false set mapred.reduce.tasks.speculative.execution false define MongoStorage com.mongodb.hadoop.pig.MongoStorage(); /* Shortcut */ -avros = load '$avros' using AvroStorage(); /* For example, 'enron.avro' */ -store avros into '$mongourl' using MongoStorage(); /* For example, 'mongodb://localhost/enron.emails' */ +avros = load '/tmp/sent_counts.txt' using AvroStorage(); /* For example, 'enron.avro' */ +store avros into 'mongodb://localhost/agile_date.sent_counts' using MongoStorage(); /* For example, 'mongodb://localhost/enron.emails' */ diff --git a/ch03/pig/mongo.pig b/ch03/pig/mongo.pig old mode 100644 new mode 100755 index 327ed50..dc6f953 --- a/ch03/pig/mongo.pig +++ b/ch03/pig/mongo.pig @@ -1,12 +1,11 @@ /* Set Home Directory - where we install software */ %default HOME `echo \$HOME/Software/` - -REGISTER $HOME/mongo-hadoop/mongo-2.10.1.jar -REGISTER $HOME/mongo-hadoop/core/target/mongo-hadoop-core-1.1.0-SNAPSHOT.jar -REGISTER $HOME/mongo-hadoop/pig/target/mongo-hadoop-pig-1.1.0-SNAPSHOT.jar +REGISTER $HOME/mongo-java-driver*.jar +REGISTER $HOME/mongo-hadoop/core/target/mongo-hadoop-core_2.2.0-1.2.0.jar +REGISTER $HOME/mongo-hadoop/pig/target/mongo-hadoop-pig_2.2.0-1.2.0.jar set mapred.map.tasks.speculative.execution false set mapred.reduce.tasks.speculative.execution false sent_counts = LOAD '/tmp/sent_counts.txt' AS (from:chararray, to:chararray, total:long); -STORE sent_counts INTO 'mongodb://localhost/agile_data.sent_counts' USING com.mongodb.hadoop.pig.MongoStorage(); +STORE sent_counts INTO 'mongodb://127.0.0.1:27017/agile_data.sent_counts' USING com.mongodb.hadoop.pig.MongoInsertStorage('',''); From a007586360d61e1cbfba3aa4539267d5a420f1c9 Mon Sep 17 00:00:00 2001 From: Jason Stirnaman Date: 
Wed, 19 Feb 2014 22:00:26 -0600 Subject: [PATCH 3/3] Updated avro_to_mongo.pig example. --- ch03/pig/avro_to_mongo.pig | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/ch03/pig/avro_to_mongo.pig b/ch03/pig/avro_to_mongo.pig index 4ba7894..87f6add 100644 --- a/ch03/pig/avro_to_mongo.pig +++ b/ch03/pig/avro_to_mongo.pig @@ -7,13 +7,9 @@ REGISTER /$HOME/pig/build/ivy/lib/Pig/json-simple-*.jar DEFINE AvroStorage org.apache.pig.builtin.AvroStorage(); /* MongoDB libraries and configuration */ -REGISTER $HOME/mongo-hadoop/core/target/mongo-hadoop-core_1.0.4-1.2.0.jar -REGISTER $HOME/mongo-hadoop/flume/target/mongo-flume-1.2.0.jar -REGISTER $HOME/mongo-hadoop/gradle/wrapper/gradle-wrapper.jar -REGISTER $HOME/mongo-hadoop/hive/target/mongo-hadoop-hive_1.0.4-1.2.0.jar -REGISTER $HOME/mongo-hadoop/mongo-2.4.jar -REGISTER $HOME/mongo-hadoop/pig/target/mongo-hadoop-pig_1.0.4-1.2.0.jar -REGISTER $HOME/mongo-hadoop/target/mongo-hadoop_1.0.4-1.2.0.jar +REGISTER $HOME/mongo-java-driver*.jar +REGISTER $HOME/mongo-hadoop/core/target/mongo-hadoop-core_2.2.0-1.2.0.jar +REGISTER $HOME/mongo-hadoop/pig/target/mongo-hadoop-pig_2.2.0-1.2.0.jar /* Set speculative execution off so we don't have the chance of duplicate records in Mongo */ set mapred.map.tasks.speculative.execution false @@ -21,4 +17,4 @@ set mapred.reduce.tasks.speculative.execution false define MongoStorage com.mongodb.hadoop.pig.MongoStorage(); /* Shortcut */ avros = load '/tmp/sent_counts.txt' using AvroStorage(); /* For example, 'enron.avro' */ -store avros into 'mongodb://localhost/agile_date.sent_counts' using MongoStorage(); /* For example, 'mongodb://localhost/enron.emails' */ +store avros into 'mongodb://localhost/agile_data.sent_counts' using com.mongodb.hadoop.pig.MongoInsertStorage('',''); /* Fully qualified: only the MongoStorage alias is defined above. For example, 'mongodb://localhost/enron.emails' */