Update the ch03 instructions and Pig scripts: Pig 0.10.1 -> 0.12.0 (built with
-Dhadoopversion=23), builtin AvroStorage instead of the Piggybank one, a newer
MongoDB Java driver, and MongoInsertStorage instead of MongoStorage.
NOTE(review): a few added lines were corrected in review, so the "index" blob
hashes below may no longer match the resulting files.

diff --git a/ch03/README.md b/ch03/README.md
index d3e75fe..3eddfef 100644
--- a/ch03/README.md
+++ b/ch03/README.md
@@ -27,10 +27,15 @@ cd gmail
 ## Download Apache Pig ##
 
 ```
-wget http://www.trieuvan.com/apache/pig/pig-0.10.1/pig-0.10.1.tar.gz
-tar -xvzf pig-0.10.1.tar.gz
-cd pig-0.10.1
-ant
+wget http://mirrors.ibiblio.org/apache/pig/pig-0.12.0/pig-0.12.0.tar.gz
+tar -xvzf pig-*.tar.gz
+cd pig-0.12.0
+```
+
+## Compile Pig for Hadoop 2.0.x ##
+
+```
+ant clean jar-withouthadoop -Dhadoopversion=23
 ```
 
 Now you can run 'bin/pig'!
@@ -89,11 +94,12 @@ bin/mongo agile_data
 
 ## Install MongoDB's Java Driver ##
 
-The MongoDB Java driver is available at https://github.com/mongodb/mongo-java-driver/downloads Download it, and place it at the base of your MongoDB install directory.
+The MongoDB Java driver is available at https://github.com/mongodb/mongo-java-driver/downloads or you can download recent snapshots like the one below. Register the path in pig/mongo.pig
 
 ```
 cd
-wget https://github.com/downloads/mongodb/mongo-java-driver/mongo-2.10.1.jar
+wget \
+  https://oss.sonatype.org/content/repositories/snapshots/org/mongodb/mongo-java-driver/2.12.0-SNAPSHOT/mongo-java-driver-2.12.0-20140213.053134-54.jar
 ```
 
 ## Install mongo-hadoop ##
@@ -120,15 +126,15 @@ find .|grep jar
 Fix the paths in 'ch3/pig/mongo.pig' to point at your install paths and run it, to store the email sent counts to MongoDB.
 
 ```
-REGISTER /mongo-2.10.1.jar
-REGISTER /core/target/mongo-hadoop-core-1.1.0-SNAPSHOT.jar
-REGISTER /pig/target/mongo-hadoop-pig-1.1.0-SNAPSHOT.jar
+REGISTER $HOME/mongo-java-driver*.jar
+REGISTER $HOME/core/target/mongo-hadoop-core-*.jar
+REGISTER $HOME/pig/target/mongo-hadoop-pig-*.jar
 
 set mapred.map.tasks.speculative.execution false
 set mapred.reduce.tasks.speculative.execution false
 
 sent_counts = LOAD '/tmp/sent_counts.txt' AS (from:chararray, to:chararray, total:long);
-STORE sent_counts INTO 'mongodb://localhost/agile_data.sent_counts' USING com.mongodb.hadoop.pig.MongoStorage();
+STORE sent_counts INTO 'mongodb://localhost/agile_data.sent_counts' USING com.mongodb.hadoop.pig.MongoInsertStorage('','');
 ```
 
 ## Connect to MongoDB from Python ##
diff --git a/ch03/pig/avro_to_mongo.pig b/ch03/pig/avro_to_mongo.pig
index 768a681..87f6add 100644
--- a/ch03/pig/avro_to_mongo.pig
+++ b/ch03/pig/avro_to_mongo.pig
@@ -2,20 +2,19 @@
 %default HOME `echo \$HOME/Software/`
 
 /* Load Avro jars and define shortcut */
-REGISTER $HOME/pig/build/ivy/lib/Pig/avro-1.5.3.jar
-REGISTER $HOME/pig/build/ivy/lib/Pig/json-simple-1.1.jar
-REGISTER $HOME/pig/contrib/piggybank/java/piggybank.jar
-define AvroStorage org.apache.pig.piggybank.storage.avro.AvroStorage();
+REGISTER $HOME/pig/build/ivy/lib/Pig/avro-*.jar
+REGISTER $HOME/pig/build/ivy/lib/Pig/json-simple-*.jar
+DEFINE AvroStorage org.apache.pig.builtin.AvroStorage();
 
 /* MongoDB libraries and configuration */
-REGISTER $HOME/mongo-hadoop/mongo-2.10.1.jar
-REGISTER $HOME/mongo-hadoop/core/target/mongo-hadoop-core-1.1.0-SNAPSHOT.jar
-REGISTER $HOME/mongo-hadoop/pig/target/mongo-hadoop-pig-1.1.0-SNAPSHOT.jar
+REGISTER $HOME/mongo-java-driver*.jar
+REGISTER $HOME/mongo-hadoop/core/target/mongo-hadoop-core_2.2.0-1.2.0.jar
+REGISTER $HOME/mongo-hadoop/pig/target/mongo-hadoop-pig_2.2.0-1.2.0.jar
 
 /* Set speculative execution off so we don't have the chance of duplicate records in Mongo */
 set mapred.map.tasks.speculative.execution false
 set mapred.reduce.tasks.speculative.execution false
 define MongoStorage com.mongodb.hadoop.pig.MongoStorage(); /* Shortcut */
 
-avros = load '$avros' using AvroStorage(); /* For example, 'enron.avro' */
-store avros into '$mongourl' using MongoStorage(); /* For example, 'mongodb://localhost/enron.emails' */
+avros = load '/tmp/sent_counts.txt' using AvroStorage(); /* For example, 'enron.avro' */
+store avros into 'mongodb://localhost/agile_data.sent_counts' using com.mongodb.hadoop.pig.MongoInsertStorage('',''); /* For example, 'mongodb://localhost/enron.emails' */
diff --git a/ch03/pig/mongo.pig b/ch03/pig/mongo.pig
old mode 100644
new mode 100755
index 327ed50..dc6f953
--- a/ch03/pig/mongo.pig
+++ b/ch03/pig/mongo.pig
@@ -1,12 +1,11 @@
 /* Set Home Directory - where we install software */
 %default HOME `echo \$HOME/Software/`
-
-REGISTER $HOME/mongo-hadoop/mongo-2.10.1.jar
-REGISTER $HOME/mongo-hadoop/core/target/mongo-hadoop-core-1.1.0-SNAPSHOT.jar
-REGISTER $HOME/mongo-hadoop/pig/target/mongo-hadoop-pig-1.1.0-SNAPSHOT.jar
+REGISTER $HOME/mongo-java-driver*.jar
+REGISTER $HOME/mongo-hadoop/core/target/mongo-hadoop-core_2.2.0-1.2.0.jar
+REGISTER $HOME/mongo-hadoop/pig/target/mongo-hadoop-pig_2.2.0-1.2.0.jar
 
 set mapred.map.tasks.speculative.execution false
 set mapred.reduce.tasks.speculative.execution false
 
 sent_counts = LOAD '/tmp/sent_counts.txt' AS (from:chararray, to:chararray, total:long);
-STORE sent_counts INTO 'mongodb://localhost/agile_data.sent_counts' USING com.mongodb.hadoop.pig.MongoStorage();
+STORE sent_counts INTO 'mongodb://127.0.0.1:27017/agile_data.sent_counts' USING com.mongodb.hadoop.pig.MongoInsertStorage('','');
diff --git a/ch03/pig/sent_counts.pig b/ch03/pig/sent_counts.pig
index f6a44fd..05f4cf1 100644
--- a/ch03/pig/sent_counts.pig
+++ b/ch03/pig/sent_counts.pig
@@ -1,16 +1,15 @@
 /* Set Home Directory - where we install software */
 %default HOME `echo \$HOME/Software/`
 
-REGISTER $HOME/pig/build/ivy/lib/Pig/avro-1.5.3.jar
-REGISTER $HOME/pig/build/ivy/lib/Pig/json-simple-1.1.jar
-REGISTER $HOME/pig/contrib/piggybank/java/piggybank.jar
-
-DEFINE AvroStorage org.apache.pig.piggybank.storage.avro.AvroStorage();
+REGISTER $HOME/pig/pig-*.jar
+REGISTER $HOME/pig/build/ivy/lib/Pig/avro-*.jar
+REGISTER $HOME/pig/build/ivy/lib/Pig/json-simple-*.jar
+DEFINE AvroStorage org.apache.pig.builtin.AvroStorage();
 
 rmf /tmp/sent_counts.txt
 
 /* Load the emails in avro format (edit the path to match where you saved them) using the AvroStorage UDF from Piggybank */
-messages = LOAD '/me/Data/test_mbox' USING AvroStorage();
+messages = LOAD '/Data/test_mbox' USING AvroStorage();
 
 /* Filter nulls, they won't help */
 messages = FILTER messages BY (from IS NOT NULL) AND (tos IS NOT NULL);