You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@bigtop.apache.org by ma...@apache.org on 2014/04/16 02:46:22 UTC
[1/2] BigPetStore - initial code drop - (lines commited by others :
nigelsavage~200, mattfenwick~200, michaelmcune~5, anushshetty~10,
jeffvance~10)
Repository: bigtop
Updated Branches:
refs/heads/master 3298063c6 -> d3da8ceb1
http://git-wip-us.apache.org/repos/asf/bigtop/blob/d3da8ceb/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/generator/BPSGenerator.java
----------------------------------------------------------------------
diff --git a/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/generator/BPSGenerator.java b/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/generator/BPSGenerator.java
new file mode 100755
index 0000000..3319064
--- /dev/null
+++ b/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/generator/BPSGenerator.java
@@ -0,0 +1,116 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.bigtop.bigpetstore.generator;
+
+import java.io.IOException;
+import java.util.Date;
+
+import org.apache.bigtop.bigpetstore.util.BigPetStoreConstants;
+import org.apache.bigtop.bigpetstore.util.DeveloperTools;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.lib.MultipleOutputs;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Mapper.Context;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This is a mapreduce implementation of a generator of a large sentiment
+ * analysis data set. The scenario is as follows:
+ *
+ * The number of records will (roughly) correspond to the output size - each
+ * record is about 80 bytes. Approximate settings:
+ *
+ * <ul>
+ * <li>1KB: set bigpetstore_records=10</li>
+ * <li>1MB: set bigpetstore_records=10,000</li>
+ * <li>1GB: set bigpetstore_records=10,000,000</li>
+ * <li>1TB: set bigpetstore_records=10,000,000,000</li>
+ * </ul>
+ */
+public class BPSGenerator {
+
+ final static Logger log = LoggerFactory.getLogger(BPSGenerator.class);
+
+ public enum props {
+ // bigpetstore_splits,
+ bigpetstore_records
+ }
+
+ public static Job createJob(Path output, int records) throws IOException {
+ Configuration c = new Configuration();
+ c.setInt(props.bigpetstore_records.name(), 10);
+ return createJob(output, c);
+ }
+
+ public static Job createJob(Path output, Configuration conf)
+ throws IOException {
+ Job job = new Job(conf, "PetStoreTransaction_ETL_"
+ + System.currentTimeMillis());
+ // recursively delete the data set if it exists.
+ FileSystem.get(output.toUri(),conf).delete(output, true);
+ job.setJarByClass(BPSGenerator.class);
+ job.setMapperClass(MyMapper.class);
+ // use the default reducer
+ // job.setReducerClass(PetStoreTransactionGeneratorJob.Red.class);
+ job.setOutputKeyClass(Text.class);
+ job.setOutputValueClass(Text.class);
+ job.setMapOutputKeyClass(Text.class);
+ job.setMapOutputValueClass(Text.class);
+ job.setInputFormatClass(GeneratePetStoreTransactionsInputFormat.class);
+ job.setOutputFormatClass(TextOutputFormat.class);
+ FileOutputFormat.setOutputPath(job, output);
+ return job;
+ }
+
+ public static class MyMapper extends Mapper<Text, Text, Text, Text> {
+
+ @Override
+ protected void setup(Context context) throws IOException,
+ InterruptedException {
+ super.setup(context);
+ }
+
+ protected void map(Text key, Text value, Context context)
+ throws java.io.IOException, InterruptedException {
+ context.write(key, value);
+ // TODO: Add multiple outputs here which writes mock addresses for
+ // generated users
+ // to a corresponding data file.
+ };
+ }
+
+ public static void main(String args[]) throws Exception {
+ if (args.length != 2) {
+ System.err.println("USAGE : [number of records] [output path]");
+ System.exit(0);
+ } else {
+ Configuration conf = new Configuration();
+ DeveloperTools.validate(
+ args,
+ "# of records",
+ "output path");
+
+ conf.setInt(
+ GeneratePetStoreTransactionsInputFormat.props.bigpetstore_records.name(),
+ Integer.parseInt(args[0]));
+ createJob(new Path(args[1]), conf).waitForCompletion(true);
+ }
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/bigtop/blob/d3da8ceb/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/generator/GeneratePetStoreTransactionsInputFormat.java
----------------------------------------------------------------------
diff --git a/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/generator/GeneratePetStoreTransactionsInputFormat.java b/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/generator/GeneratePetStoreTransactionsInputFormat.java
new file mode 100755
index 0000000..a779428
--- /dev/null
+++ b/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/generator/GeneratePetStoreTransactionsInputFormat.java
@@ -0,0 +1,134 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.bigtop.bigpetstore.generator;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.bigtop.bigpetstore.generator.TransactionIteratorFactory.KeyVal;
+import org.apache.bigtop.bigpetstore.generator.TransactionIteratorFactory.STATE;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+
+/**
+ * A simple input format that fakes input: instead of reading files it
+ * generates synthetic pet store transactions.
+ */
+public class GeneratePetStoreTransactionsInputFormat extends
+        FileInputFormat<Text, Text> {
+
+    /**
+     * Creates a reader that synthesizes the records of one split instead of
+     * reading them from a file.
+     *
+     * We need the "state" information to generate records. - Each state has a
+     * probability associated with it, so that our data set can be realistic
+     * (i.e. Colorado should have more transactions than rhode island).
+     *
+     * - Each state also will have its name as part of the key.
+     *
+     * - This task would be distributed, for example, into 50 nodes on a real
+     * cluster, each creating the data for a given state.
+     *
+     * @param inputSplit
+     *            must be a {@link PetStoreTransactionInputSplit}; it supplies
+     *            the record count and the state
+     * @param arg1
+     *            task context (unused)
+     * @return a reader that yields the generated key/value pairs
+     */
+    @Override
+    public RecordReader<Text, Text> createRecordReader(
+            final InputSplit inputSplit, TaskAttemptContext arg1)
+            throws IOException, InterruptedException {
+        final PetStoreTransactionInputSplit split =
+                (PetStoreTransactionInputSplit) inputSplit;
+        return new RecordReader<Text, Text>() {
+
+            // String storeCode = ((Split) inputSplit).storeCode;
+            // Number of records this split is expected to produce.
+            int records = split.records;
+            Iterator<KeyVal<String, String>> data = (new TransactionIteratorFactory(
+                    records, split.state)).getData();
+            KeyVal<String, String> currentRecord;
+            // Records handed out so far; used to report progress.
+            int emitted = 0;
+
+            @Override
+            public void initialize(InputSplit arg0, TaskAttemptContext arg1)
+                    throws IOException, InterruptedException {
+                // Nothing to do: all state is set up when the reader is built.
+            }
+
+            @Override
+            public boolean nextKeyValue() throws IOException,
+                    InterruptedException {
+                if (data.hasNext()) {
+                    currentRecord = data.next();
+                    emitted++;
+                    return true;
+                }
+                return false;
+            }
+
+            @Override
+            public Text getCurrentKey() throws IOException,
+                    InterruptedException {
+                return new Text(currentRecord.key);
+            }
+
+            @Override
+            public Text getCurrentValue() throws IOException,
+                    InterruptedException {
+                return new Text(currentRecord.val);
+            }
+
+            @Override
+            public float getProgress() throws IOException, InterruptedException {
+                // Report the fraction of this split's records emitted so far.
+                if (records <= 0) {
+                    return 1f;
+                }
+                return Math.min(1f, emitted / (float) records);
+            }
+
+            @Override
+            public void close() throws IOException {
+                // No resources are held.
+            }
+
+        };
+    }
+
+    /** Configuration keys read by this input format. */
+    public enum props {
+        // bigpetstore_splits,
+        bigpetstore_records
+    }
+
+    /**
+     * Creates one split per {@link STATE}. Each split is sized by the state's
+     * probability, rounded up, so the total may slightly exceed the requested
+     * number of records.
+     *
+     * @param arg
+     *            job context whose configuration carries the
+     *            {@code bigpetstore_records} setting
+     * @return one split per state
+     * @throws RuntimeException
+     *             if the record count is not set, or is not positive
+     */
+    @Override
+    public List<InputSplit> getSplits(JobContext arg) throws IOException {
+        int num_records_desired = arg
+                .getConfiguration()
+                .getInt(GeneratePetStoreTransactionsInputFormat.props.bigpetstore_records
+                        .name(), -1);
+        if (num_records_desired == -1) {
+            throw new RuntimeException(
+                    "# of total records not set in configuration object: "
+                            + arg.getConfiguration());
+        }
+        if (num_records_desired <= 0) {
+            // Fail at submission time rather than later inside every task.
+            throw new IllegalArgumentException(
+                    "# of total records must be positive, but was: "
+                            + num_records_desired);
+        }
+
+        List<InputSplit> list = new ArrayList<InputSplit>();
+
+        /**
+         * Generator class will take a state as input and generate all the data
+         * for that state.
+         */
+        for (TransactionIteratorFactory.STATE s : STATE.values()) {
+            PetStoreTransactionInputSplit split = new PetStoreTransactionInputSplit(
+                    (int) (Math.ceil(num_records_desired * s.probability)), s);
+            System.out.println(s + " _ " + split.records);
+            list.add(split);
+        }
+        return list;
+    }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/bigtop/blob/d3da8ceb/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/generator/PetStoreTransaction.java
----------------------------------------------------------------------
diff --git a/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/generator/PetStoreTransaction.java b/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/generator/PetStoreTransaction.java
new file mode 100755
index 0000000..71aa6d6
--- /dev/null
+++ b/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/generator/PetStoreTransaction.java
@@ -0,0 +1,32 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.bigtop.bigpetstore.generator;
+
+import java.util.Date;
+
+/**
+ * Read-only view of the fields of a single pet store transaction.
+ */
+public interface PetStoreTransaction {
+
+    /** @return the first name recorded for this transaction */
+    String getFirstName();
+
+    /** @return the last name recorded for this transaction */
+    String getLastName();
+
+    /** @return the product of this transaction */
+    String getProduct();
+
+    /** @return the date of this transaction */
+    Date getDate();
+
+    /** @return the price of this transaction (unit not specified here) */
+    Integer getPrice();
+
+}
http://git-wip-us.apache.org/repos/asf/bigtop/blob/d3da8ceb/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/generator/PetStoreTransactionInputSplit.java
----------------------------------------------------------------------
diff --git a/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/generator/PetStoreTransactionInputSplit.java b/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/generator/PetStoreTransactionInputSplit.java
new file mode 100755
index 0000000..9b32344
--- /dev/null
+++ b/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/generator/PetStoreTransactionInputSplit.java
@@ -0,0 +1,67 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.bigtop.bigpetstore.generator;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.bigtop.bigpetstore.generator.TransactionIteratorFactory.STATE;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.InputSplit;
+
+/**
+ * What does an `InputSplit` actually do? From the Javadocs, it looks like ...
+ * absolutely nothing.
+ *
+ * Note: for some reason, you *have* to implement Writable, even if your methods
+ * do nothing, or you will got strange and un-debuggable null pointer
+ * exceptions.
+ */
+public class PetStoreTransactionInputSplit extends InputSplit implements
+ Writable {
+
+ public PetStoreTransactionInputSplit() {
+ }
+
+ public int records;
+ public STATE state;
+
+ public PetStoreTransactionInputSplit(int records, STATE state) {
+ this.records = records;
+ this.state = state;
+ }
+
+ public void readFields(DataInput arg0) throws IOException {
+ records = arg0.readInt();
+ state = STATE.valueOf(arg0.readUTF());
+ }
+
+ public void write(DataOutput arg0) throws IOException {
+ arg0.writeInt(records);
+ arg0.writeUTF(state.name());
+ }
+
+ @Override
+ public String[] getLocations() throws IOException, InterruptedException {
+ return new String[] {};
+ }
+
+ @Override
+ public long getLength() throws IOException, InterruptedException {
+ return 100;
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/bigtop/blob/d3da8ceb/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/generator/TransactionIteratorFactory.java
----------------------------------------------------------------------
diff --git a/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/generator/TransactionIteratorFactory.java b/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/generator/TransactionIteratorFactory.java
new file mode 100755
index 0000000..0ea81ee
--- /dev/null
+++ b/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/generator/TransactionIteratorFactory.java
@@ -0,0 +1,468 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.bigtop.bigpetstore.generator;
+
+
+import java.util.Date;
+import java.util.Iterator;
+import java.util.Random;
+
+import org.apache.bigtop.bigpetstore.util.Pair;
+import org.apache.bigtop.bigpetstore.util.StringUtils;
+
+/**
+ * This class generates our data. Over time we will use it to embed bias which
+ * can then be teased out, e.g. by clustering/classifiers. For example:
+ *
+ * certain products <--> certain years or days
+ *
+ *
+ */
+public class TransactionIteratorFactory {
+
+ /**
+  * Each "state" has a pet store, with a certain "proportion" of the
+  * transactions. In the table below CA carries the largest share (.3),
+  * followed by NY (.2); every other state has .1.
+  */
+
+ public static enum STATE {
+
+     // Each product is separated with an _ for its base price.
+     // That is just to make it easy to add new products.
+     // Each state is associated with a relative probability.
+     AZ(.1f, "dog-food_10", "cat-food_8", "leather-collar_25",
+             "snake-bite ointment_30", "turtle-food_11"),
+     AK(.1f,
+             "dog-food_10", "cat-food_8", "fuzzy-collar_19",
+             "antelope-caller_20", "salmon-bait_30"),
+     CT(.1f, "dog-food_10",
+             "cat-food_8", "fuzzy-collar_19", "turtle-pellets_5"),
+     OK(.1f,
+             "dog-food_10", "cat-food_8", "duck-caller_13",
+             "rodent-cage_40", "hay-bail_5", "cow-dung_2"),
+     CO(.1f,
+             "dog-food_10", "cat-food_8", "choke-collar_15",
+             "antelope snacks_30", "duck-caller_18"),
+     CA(.3f, "dog-food_10",
+             "cat-food_8", "fish-food_12", "organic-dog-food_16",
+             "turtle-pellets_5"),
+     NY(.2f, "dog-food_10", "cat-food_8", "steel-leash_20",
+             "fish-food_20", "seal-spray_25");
+
+     // Shared, unseeded source of randomness for product selection.
+     public static Random rand = new Random();
+     // Share of all transactions attributed to this state.
+     public float probability;
+     // Products sold in this state, each encoded as "name_basePrice".
+     public String[] products;
+
+     private STATE(float probability, String... products) {
+         this.probability = probability;
+         this.products = products;
+     }
+
+     /**
+      * Picks one of this state's products uniformly at random.
+      *
+      * @return the product name paired with its base price
+      */
+     public Pair<String, Integer> randProduct() {
+         // nextInt's bound is exclusive, so pass the full length; with
+         // "length - 1" the last product could never be selected.
+         String product = products[rand.nextInt(products.length)];
+         String name = StringUtils.substringBefore(product, "_");
+         Integer basePrice = Integer.parseInt(StringUtils.substringAfter(
+                 product, "_"));
+         return new Pair<String, Integer>(name, basePrice);
+     }
+
+ }
+
+ /**
+  * Immutable holder for a key together with its value.
+  *
+  * @param <K> type of the key
+  * @param <V> type of the value
+  */
+ public static class KeyVal<K, V> {
+
+     public final K key;
+     public final V val;
+
+     /**
+      * @param k the key to hold
+      * @param v the value to hold
+      */
+     public KeyVal(K k, V v) {
+         key = k;
+         val = v;
+     }
+ }
+
+ private Iterator<KeyVal<String, String>> dataIterator;
+
+ Random r;
+
+ /**
+  * Builds the iterator that {@link #getData()} hands out.
+  *
+  * @param records
+  *            number of transactions the iterator produces
+  * @param state
+  *            state whose store code and products are used
+  * @throws RuntimeException
+  *             if {@code records} is 0
+  */
+ public TransactionIteratorFactory(final int records, final STATE state) {
+
+     /**
+      * Random is seeded by STATE. This way similar names will be randomly
+      * selected for states. The seed is taken from the state's name because
+      * an enum's own hashCode() is identity based and changes between JVM
+      * runs, which would defeat the purpose of seeding.
+      */
+     r = new Random(state.name().hashCode());
+
+     if (records == 0) {
+         throw new RuntimeException(
+                 "Cant create a data iterator with no records (records==0) !");
+     }
+
+     this.dataIterator = new Iterator<KeyVal<String, String>>() {
+         // Id of the next transaction to hand out; ids start at 1.
+         int trans_id = 1;
+
+         @Override
+         public boolean hasNext() {
+             return trans_id <= records;
+         }
+
+         // How many more transactions reuse the current customer name.
+         int repeat = 0;
+         String fname = randFirstName();
+         String lname = randLastName();
+
+         @Override
+         public KeyVal<String, String> next() {
+             if (!hasNext()) {
+                 // Honor the Iterator contract instead of generating
+                 // records beyond the requested count.
+                 throw new java.util.NoSuchElementException("All " + records
+                         + " records have already been generated");
+             }
+             /**
+              * Some customers come back for more :) The number of repeats
+              * is a Gaussian sample scaled by ten; a sample that is not
+              * positive simply means the name is not repeated.
+              */
+             if (repeat > 0)
+                 repeat--;
+             else {
+                 fname = randFirstName();
+                 lname = randLastName();
+                 repeat = (int) (r.nextGaussian() * 10f);
+             }
+             String key, val;
+             key = join(",", "BigPetStore", "storeCode_" + state.name(),
+                     trans_id++ + "");
+             Pair<String, Integer> product_price = state.randProduct();
+             val = join(
+                     ",",
+                     fname,
+                     lname,
+                     getDate().toString(),
+                     fudgePrice(product_price.getFirst(),
+                             product_price.getSecond())
+                             + "", product_price.getFirst()); // products are
+                                                              // biased by
+                                                              // state
+
+             return new KeyVal<String, String>(key, val);
+         }
+
+         @Override
+         public void remove() {
+             // Generated records cannot be removed; say so explicitly
+             // rather than silently doing nothing.
+             throw new UnsupportedOperationException("remove");
+         }
+
+     };
+ }
+
+ /**
+  * Add some decimals to the price, depending on the product name. The name
+  * is checked for the substrings "dog", "cat" and "fish", in that order.
+  *
+  * @param product
+  *            product name
+  * @param i
+  *            base price
+  * @return the base price plus .50 for "dog" products, minus .50 for "cat"
+  *         products, minus .25 for "fish" products and plus .10 for
+  *         everything else
+  */
+ public Float fudgePrice(String product, Integer i) {
+     if (product.contains("dog")) {
+         return i + .50f;
+     }
+     if (product.contains("cat")) {
+         return i - .50f;
+     }
+     if (product.contains("fish")) {
+         return i - .25f;
+     }
+     return i + .10f;
+ }
+
+ /**
+  * Joins the given strings, placing {@code sep} between neighbours.
+  *
+  * @param sep
+  *            separator inserted between consecutive elements
+  * @param strs
+  *            the strings to join
+  * @return "" for no elements, the element itself for a single element,
+  *         otherwise all elements separated by {@code sep}
+  */
+ static String join(String sep, String... strs) {
+     if (strs.length == 0) {
+         return "";
+     } else if (strs.length == 1) {
+         return strs[0];
+     }
+     StringBuilder joined = new StringBuilder();
+     joined.append(strs[0]);
+     for (int i = 1; i < strs.length; i++) {
+         // Use the separator the caller asked for, not a fixed ",".
+         joined.append(sep).append(strs[i]);
+     }
+     return joined.toString();
+ }
+
+ /**
+  * @return the iterator created by the constructor; the same instance is
+  *         returned on every call
+  */
+ public Iterator<KeyVal<String, String>> getData() {
+     return dataIterator;
+ }
+
+ /**
+  * @return a randomly chosen entry of FIRSTNAMES, in lower case
+  */
+ private String randFirstName() {
+     // nextInt's bound is exclusive: use the full length so that the last
+     // name in the table can be picked as well.
+     return FIRSTNAMES[this.r.nextInt(FIRSTNAMES.length)].toLowerCase();
+ }
+
+ /**
+  * @return a randomly chosen entry of LASTNAMES, in lower case
+  */
+ private String randLastName() {
+     // nextInt's bound is exclusive: use the full length so that the last
+     // name in the table can be picked as well.
+     return LASTNAMES[this.r.nextInt(LASTNAMES.length)].toLowerCase();
+ }
+
+ /**
+  * @return a date built from a random int interpreted as milliseconds
+  *         since the epoch
+  */
+ private Date getDate() {
+     // NOTE(review): an int number of milliseconds only spans about 25 days
+     // on either side of 1970-01-01 - confirm this narrow range is intended.
+     long epochMillis = r.nextInt();
+     return new Date(epochMillis);
+ }
+
+ /**
+  * @return a random price from 0 (inclusive) up to MAX_PRICE (exclusive)
+  */
+ private Integer getPrice() {
+     int price = r.nextInt(MAX_PRICE);
+     return price;
+ }
+
+ public static final Integer MINUTES_IN_DAY = 60 * 24;
+ public static final Integer MAX_PRICE = 10000;
+
+ private static String[] FIRSTNAMES = { "Aaron", "Abby", "Abigail", "Adam",
+ "Alan", "Albert", "Alex", "Alexandra", "Alexis", "Alice", "Alicia",
+ "Alisha", "Alissa", "Allen", "Allison", "Alyssa", "Amanda",
+ "Amber", "Amy", "Andrea", "Andrew", "Andy", "Angel", "Angela",
+ "Angie", "Anita", "Ann", "Anna", "Annette", "Anthony", "Antonio",
+ "April", "Arthur", "Ashley", "Audrey", "Austin", "Autumn", "Baby",
+ "Barb", "Barbara", "Becky", "Benjamin", "Beth", "Bethany", "Betty",
+ "Beverly", "Bill", "Billie", "Billy", "Blake", "Bob", "Bobbie",
+ "Bobby", "Bonnie", "Brad", "Bradley", "Brady", "Brandi", "Brandon",
+ "Brandy", "Breanna", "Brenda", "Brent", "Brett", "Brian",
+ "Brianna", "Brittany", "Brooke", "Brooklyn", "Bruce", "Bryan",
+ "Caleb", "Cameron", "Candy", "Carl", "Carla", "Carmen", "Carol",
+ "Carolyn", "Carrie", "Casey", "Cassandra", "Catherine", "Cathy",
+ "Chad", "Charlene", "Charles", "Charlie", "Charlotte", "Chase",
+ "Chasity", "Chastity", "Chelsea", "Cheryl", "Chester", "Cheyenne",
+ "Chris", "Christian", "Christina", "Christine", "Christoph",
+ "Christopher", "Christy", "Chuck", "Cindy", "Clara", "Clarence",
+ "Clayton", "Clifford", "Clint", "Cody", "Colton", "Connie",
+ "Corey", "Cory", "Courtney", "Craig", "Crystal", "Curtis",
+ "Cynthia", "Dakota", "Dale", "Dallas", "Dalton", "Dan", "Dana",
+ "Daniel", "Danielle", "Danny", "Darla", "Darlene", "Darrell",
+ "Darren", "Dave", "David", "Dawn", "Dean", "Deanna", "Debbie",
+ "Deborah", "Debra", "Denise", "Dennis", "Derek", "Derrick",
+ "Destiny", "Devin", "Diana", "Diane", "Dillon", "Dixie", "Dominic",
+ "Don", "Donald", "Donna", "Donnie", "Doris", "Dorothy", "Doug",
+ "Douglas", "Drew", "Duane", "Dustin", "Dusty", "Dylan", "Earl",
+ "Ed", "Eddie", "Edward", "Elaine", "Elizabeth", "Ellen", "Emily",
+ "Eric", "Erica", "Erika", "Erin", "Ernest", "Ethan", "Eugene",
+ "Eva", "Evelyn", "Everett", "Faith", "Father", "Felicia", "Floyd",
+ "Francis", "Frank", "Fred", "Gabriel", "Gage", "Gail", "Gary",
+ "Gene", "George", "Gerald", "Gina", "Ginger", "Glen", "Glenn",
+ "Gloria", "Grace", "Greg", "Gregory", "Haley", "Hannah", "Harley",
+ "Harold", "Harry", "Heath", "Heather", "Heidi", "Helen", "Herbert",
+ "Holly", "Hope", "Howard", "Hunter", "Ian", "Isaac", "Jack",
+ "Jackie", "Jacob", "Jade", "Jake", "James", "Jamie", "Jan", "Jane",
+ "Janet", "Janice", "Jared", "Jasmine", "Jason", "Jay", "Jean",
+ "Jeannie", "Jeff", "Jeffery", "Jeffrey", "Jenna", "Jennifer",
+ "Jenny", "Jeremiah", "Jeremy", "Jerry", "Jesse", "Jessica",
+ "Jessie", "Jill", "Jim", "Jimmy", "Joann", "Joanne", "Jodi",
+ "Jody", "Joe", "Joel", "Joey", "John", "Johnathan", "Johnny",
+ "Jon", "Jonathan", "Jonathon", "Jordan", "Joseph", "Josh",
+ "Joshua", "Joyce", "Juanita", "Judy", "Julia", "Julie", "Justin",
+ "Kaitlyn", "Karen", "Katelyn", "Katherine", "Kathleen", "Kathryn",
+ "Kathy", "Katie", "Katrina", "Kay", "Kayla", "Kaylee", "Keith",
+ "Kelly", "Kelsey", "Ken", "Kendra", "Kenneth", "Kenny", "Kevin",
+ "Kim", "Kimberly", "Kris", "Krista", "Kristen", "Kristin",
+ "Kristina", "Kristy", "Kyle", "Kylie", "Lacey", "Laken", "Lance",
+ "Larry", "Laura", "Lawrence", "Leah", "Lee", "Leonard", "Leroy",
+ "Leslie", "Levi", "Lewis", "Linda", "Lindsay", "Lindsey", "Lisa",
+ "Lloyd", "Logan", "Lois", "Loretta", "Lori", "Louis", "Lynn",
+ "Madison", "Mandy", "Marcus", "Margaret", "Maria", "Mariah",
+ "Marie", "Marilyn", "Marion", "Mark", "Marlene", "Marsha",
+ "Martha", "Martin", "Marty", "Marvin", "Mary", "Mary ann", "Mason",
+ "Matt", "Matthew", "Max", "Megan", "Melanie", "Melinda", "Melissa",
+ "Melody", "Michael", "Michelle", "Mickey", "Mike", "Mindy",
+ "Miranda", "Misty", "Mitchell", "Molly", "Monica", "Morgan",
+ "Mother", "Myron", "Nancy", "Natasha", "Nathan", "Nicholas",
+ "Nick", "Nicole", "Nina", "Noah", "Norma", "Norman", "Olivia",
+ "Paige", "Pam", "Pamela", "Pat", "Patricia", "Patrick", "Patty",
+ "Paul", "Paula", "Peggy", "Penny", "Pete", "Phillip", "Phyllis",
+ "Rachael", "Rachel", "Ralph", "Randall", "Randi", "Randy", "Ray",
+ "Raymond", "Rebecca", "Regina", "Renee", "Rex", "Rhonda",
+ "Richard", "Rick", "Ricky", "Rita", "Rob", "Robbie", "Robert",
+ "Roberta", "Robin", "Rochelle", "Rocky", "Rod", "Rodney", "Roger",
+ "Ron", "Ronald", "Ronda", "Ronnie", "Rose", "Roxanne", "Roy",
+ "Russ", "Russell", "Rusty", "Ruth", "Ryan", "Sabrina", "Sally",
+ "Sam", "Samantha", "Samuel", "Sandra", "Sandy", "Sara", "Sarah",
+ "Savannah", "Scott", "Sean", "Seth", "Shanda", "Shane", "Shanna",
+ "Shannon", "Sharon", "Shaun", "Shawn", "Shawna", "Sheila",
+ "Shelly", "Sher", "Sherri", "Sherry", "Shirley", "Sierra",
+ "Skyler", "Stacey", "Stacy", "Stanley", "Stephanie", "Stephen",
+ "Steve", "Steven", "Sue", "Summer", "Susan", "Sydney", "Tabatha",
+ "Tabitha", "Tamara", "Tammy", "Tara", "Tasha", "Tashia", "Taylor",
+ "Ted", "Teresa", "Terri", "Terry", "Tessa", "Thelma", "Theresa",
+ "Thomas", "Tia", "Tiffany", "Tim", "Timmy", "Timothy", "Tina",
+ "Todd", "Tom", "Tommy", "Toni", "Tony", "Tonya", "Tracey",
+ "Tracie", "Tracy", "Travis", "Trent", "Trevor", "Trey", "Trisha",
+ "Tristan", "Troy", "Tyler", "Tyrone", "Unborn", "Valerie",
+ "Vanessa", "Vernon", "Veronica", "Vicki", "Vickie", "Vicky",
+ "Victor", "Victoria", "Vincent", "Virginia", "Vivian", "Walter",
+ "Wanda", "Wayne", "Wendy", "Wesley", "Whitney", "William",
+ "Willie", "Wyatt", "Zachary" };
+
+ public static String[] LASTNAMES = { "Abbott", "Acevedo", "Acosta",
+ "Adams", "Adkins", "Aguilar", "Aguirre", "Albert", "Alexander",
+ "Alford", "Allen", "Allison", "Alston", "Alvarado", "Alvarez",
+ "Anderson", "Andrews", "Anthony", "Armstrong", "Arnold", "Ashley",
+ "Atkins", "Atkinson", "Austin", "Avery", "Avila", "Ayala", "Ayers",
+ "Bailey", "Baird", "Baker", "Baldwin", "Ball", "Ballard", "Banks",
+ "Barber", "Smith", "Johnson", "Williams", "Jones", "Brown",
+ "Davis", "Miller", "Wilson", "Moore", "Taylor", "Thomas",
+ "Jackson", "Barker", "Barlow", "Barnes", "Barnett", "Barr",
+ "Barrera", "Barrett", "Barron", "Barry", "Bartlett", "Barton",
+ "Bass", "Bates", "Battle", "Bauer", "Baxter", "Beach", "Bean",
+ "Beard", "Beasley", "Beck", "Becker", "Bell", "Bender", "Benjamin",
+ "Bennett", "Benson", "Bentley", "Benton", "Berg", "Berger",
+ "Bernard", "Berry", "Best", "Bird", "Bishop", "Black", "Blackburn",
+ "Blackwell", "Blair", "Blake", "Blanchard", "Blankenship",
+ "Blevins", "Bolton", "Bond", "Bonner", "Booker", "Boone", "Booth",
+ "Bowen", "Bowers", "Bowman", "Boyd", "Boyer", "Boyle", "Bradford",
+ "Bradley", "Bradshaw", "Brady", "Branch", "Bray", "Brennan",
+ "Brewer", "Bridges", "Briggs", "Bright", "Britt", "Brock",
+ "Brooks", "Browning", "Bruce", "Bryan", "Bryant", "Buchanan",
+ "Buck", "Buckley", "Buckner", "Bullock", "Burch", "Burgess",
+ "Burke", "Burks", "Burnett", "Burns", "Burris", "Burt", "Burton",
+ "Bush", "Butler", "Byers", "Byrd", "Cabrera", "Cain", "Calderon",
+ "Caldwell", "Calhoun", "Callahan", "Camacho", "Cameron",
+ "Campbell", "Campos", "Cannon", "Cantrell", "Cantu", "Cardenas",
+ "Carey", "Carlson", "Carney", "Carpenter", "Carr", "Carrillo",
+ "Carroll", "Carson", "Carter", "Carver", "Case", "Casey", "Cash",
+ "Castaneda", "Castillo", "Castro", "Cervantes", "Chambers", "Chan",
+ "Chandler", "Chaney", "Chang", "Chapman", "Charles", "Chase",
+ "Chavez", "Chen", "Cherry", "Christensen", "Christian", "Church",
+ "Clark", "Clarke", "Clay", "Clayton", "Clements", "Clemons",
+ "Cleveland", "Cline", "Cobb", "Cochran", "Coffey", "Cohen", "Cole",
+ "Coleman", "Collier", "Collins", "Colon", "Combs", "Compton",
+ "Conley", "Conner", "Conrad", "Contreras", "Conway", "Cook",
+ "Cooke", "Cooley", "Cooper", "Copeland", "Cortez", "Cote",
+ "Cotton", "Cox", "Craft", "Craig", "Crane", "Crawford", "Crosby",
+ "Cross", "Cruz", "Cummings", "Cunningham", "Curry", "Curtis",
+ "Dale", "Dalton", "Daniel", "Daniels", "Daugherty", "Davenport",
+ "David", "Davidson", "Dawson", "Day", "Dean", "Decker", "Dejesus",
+ "Delacruz", "Delaney", "Deleon", "Delgado", "Dennis", "Diaz",
+ "Dickerson", "Dickinson", "Dillard", "Dillon", "Dixon", "Dodson",
+ "Dominguez", "Donaldson", "Donovan", "Dorsey", "Dotson", "Douglas",
+ "Downs", "Doyle", "Drake", "Dudley", "Duffy", "Duke", "Duncan",
+ "Dunlap", "Dunn", "Duran", "Durham", "Dyer", "Eaton", "Edwards",
+ "Elliott", "Ellis", "Ellison", "Emerson", "England", "English",
+ "Erickson", "Espinoza", "Estes", "Estrada", "Evans", "Everett",
+ "Ewing", "Farley", "Farmer", "Farrell", "Faulkner", "Ferguson",
+ "Fernandez", "Ferrell", "Fields", "Figueroa", "Finch", "Finley",
+ "Fischer", "Fisher", "Fitzgerald", "Fitzpatrick", "Fleming",
+ "Fletcher", "Flores", "Flowers", "Floyd", "Flynn", "Foley",
+ "Forbes", "Ford", "Foreman", "Foster", "Fowler", "Fox", "Francis",
+ "Franco", "Frank", "Franklin", "Franks", "Frazier", "Frederick",
+ "Freeman", "French", "Frost", "Fry", "Frye", "Fuentes", "Fuller",
+ "Fulton", "Gaines", "Gallagher", "Gallegos", "Galloway", "Gamble",
+ "Garcia", "Gardner", "Garner", "Garrett", "Garrison", "Garza",
+ "Gates", "Gay", "Gentry", "George", "Gibbs", "Gibson", "Gilbert",
+ "Giles", "Gill", "Gillespie", "Gilliam", "Gilmore", "Glass",
+ "Glenn", "Glover", "Goff", "Golden", "Gomez", "Gonzales",
+ "Gonzalez", "Good", "Goodman", "Goodwin", "Gordon", "Gould",
+ "Graham", "Grant", "Graves", "Gray", "Green", "Greene", "Greer",
+ "Gregory", "Griffin", "Griffith", "Grimes", "Gross", "Guerra",
+ "Guerrero", "Guthrie", "Gutierrez", "Guy", "Guzman", "Hahn",
+ "Hale", "Haley", "Hall", "Hamilton", "Hammond", "Hampton",
+ "Hancock", "Haney", "Hansen", "Hanson", "Hardin", "Harding",
+ "Hardy", "Harmon", "Harper", "Harris", "Harrington", "Harrison",
+ "Hart", "Hartman", "Harvey", "Hatfield", "Hawkins", "Hayden",
+ "Hayes", "Haynes", "Hays", "Head", "Heath", "Hebert", "Henderson",
+ "Hendricks", "Hendrix", "Henry", "Hensley", "Henson", "Herman",
+ "Hernandez", "Herrera", "Herring", "Hess", "Hester", "Hewitt",
+ "Hickman", "Hicks", "Higgins", "Hill", "Hines", "Hinton", "Hobbs",
+ "Hodge", "Hodges", "Hoffman", "Hogan", "Holcomb", "Holden",
+ "Holder", "Holland", "Holloway", "Holman", "Holmes", "Holt",
+ "Hood", "Hooper", "Hoover", "Hopkins", "Hopper", "Horn", "Horne",
+ "Horton", "House", "Houston", "Howard", "Howe", "Howell",
+ "Hubbard", "Huber", "Hudson", "Huff", "Huffman", "Hughes", "Hull",
+ "Humphrey", "Hunt", "Hunter", "Hurley", "Hurst", "Hutchinson",
+ "Hyde", "Ingram", "Irwin", "Jacobs", "Jacobson", "James", "Jarvis",
+ "Jefferson", "Jenkins", "Jennings", "Jensen", "Jimenez", "Johns",
+ "Johnston", "Jordan", "Joseph", "Joyce", "Joyner", "Juarez",
+ "Justice", "Kane", "Kaufman", "Keith", "Keller", "Kelley", "Kelly",
+ "Kemp", "Kennedy", "Kent", "Kerr", "Key", "Kidd", "Kim", "King",
+ "Kinney", "Kirby", "Kirk", "Kirkland", "Klein", "Kline", "Knapp",
+ "Knight", "Knowles", "Knox", "Koch", "Kramer", "Lamb", "Lambert",
+ "Lancaster", "Landry", "Lane", "Lang", "Langley", "Lara", "Larsen",
+ "Larson", "Lawrence", "Lawson", "Le", "Leach", "Leblanc", "Lee",
+ "Leon", "Leonard", "Lester", "Levine", "Levy", "Lewis", "Lindsay",
+ "Lindsey", "Little", "Livingston", "Lloyd", "Logan", "Long",
+ "Lopez", "Lott", "Love", "Lowe", "Lowery", "Lucas", "Luna",
+ "Lynch", "Lynn", "Lyons", "Macdonald", "Macias", "Mack", "Madden",
+ "Maddox", "Maldonado", "Malone", "Mann", "Manning", "Marks",
+ "Marquez", "Marsh", "Marshall", "Martin", "Martinez", "Mason",
+ "Massey", "Mathews", "Mathis", "Matthews", "Maxwell", "May",
+ "Mayer", "Maynard", "Mayo", "Mays", "McBride", "McCall",
+ "McCarthy", "McCarty", "McClain", "McClure", "McConnell",
+ "McCormick", "McCoy", "McCray", "McCullough", "McDaniel",
+ "McDonald", "McDowell", "McFadden", "McFarland", "McGee",
+ "McGowan", "McGuire", "McIntosh", "McIntyre", "McKay", "McKee",
+ "McKenzie", "McKinney", "McKnight", "McLaughlin", "McLean",
+ "McLeod", "McMahon", "McMillan", "McNeil", "McPherson", "Meadows",
+ "Medina", "Mejia", "Melendez", "Melton", "Mendez", "Mendoza",
+ "Mercado", "Mercer", "Merrill", "Merritt", "Meyer", "Meyers",
+ "Michael", "Middleton", "Miles", "Mills", "Miranda", "Mitchell",
+ "Molina", "Monroe", "Montgomery", "Montoya", "Moody", "Moon",
+ "Mooney", "Morales", "Moran", "Moreno", "Morgan", "Morin",
+ "Morris", "Morrison", "Morrow", "Morse", "Morton", "Moses",
+ "Mosley", "Moss", "Mueller", "Mullen", "Mullins", "Munoz",
+ "Murphy", "Murray", "Myers", "Nash", "Navarro", "Neal", "Nelson",
+ "Newman", "Newton", "Nguyen", "Nichols", "Nicholson", "Nielsen",
+ "Nieves", "Nixon", "Noble", "Noel", "Nolan", "Norman", "Norris",
+ "Norton", "Nunez", "Obrien", "Ochoa", "Oconnor", "Odom",
+ "Odonnell", "Oliver", "Olsen", "Olson", "O'neal", "O'neil",
+ "O'neill", "Orr", "Ortega", "Ortiz", "Osborn", "Osborne", "Owen",
+ "Owens", "Pace", "Pacheco", "Padilla", "Page", "Palmer", "Park",
+ "Parker", "Parks", "Parrish", "Parsons", "Pate", "Patel",
+ "Patrick", "Patterson", "Patton", "Paul", "Payne", "Pearson",
+ "Peck", "Pena", "Pennington", "Perez", "Perkins", "Perry",
+ "Peters", "Petersen", "Peterson", "Petty", "Phelps", "Phillips",
+ "Pickett", "Pierce", "Pittman", "Pitts", "Pollard", "Poole",
+ "Pope", "Porter", "Potter", "Potts", "Powell", "Powers", "Pratt",
+ "Preston", "Price", "Prince", "Pruitt", "Puckett", "Pugh", "Quinn",
+ "Ramirez", "Ramos", "Ramsey", "Randall", "Randolph", "Rasmussen",
+ "Ratliff", "Ray", "Raymond", "Reed", "Reese", "Reeves", "Reid",
+ "Reilly", "Reyes", "Reynolds", "Rhodes", "Rice", "Rich", "Richard",
+ "Richards", "Richardson", "Richmond", "Riddle", "Riggs", "Riley",
+ "Rios", "Rivas", "Rivera", "Rivers", "Roach", "Robbins",
+ "Roberson", "Roberts", "Robertson", "Robinson", "Robles", "Rocha",
+ "Rodgers", "Rodriguez", "Rodriquez", "Rogers", "Rojas", "Rollins",
+ "Roman", "Romero", "Rosa", "Rosales", "Rosario", "Rose", "Ross",
+ "Roth", "Rowe", "Rowland", "Roy", "Ruiz", "Rush", "Russell",
+ "Russo", "Rutledge", "Ryan", "Salas", "Salazar", "Salinas",
+ "Sampson", "Sanchez", "Sanders", "Sandoval", "Sanford", "Santana",
+ "Santiago", "Santos", "Sargent", "Saunders", "Savage", "Sawyer",
+ "Schmidt", "Schneider", "Schroeder", "Schultz", "Schwartz",
+ "Scott", "Sears", "Sellers", "Serrano", "Sexton", "Shaffer",
+ "Shannon", "Sharp", "Sharpe", "Shaw", "Shelton", "Shepard",
+ "Shepherd", "Sheppard", "Sherman", "Shields", "Short", "Silva",
+ "Simmons", "Simon", "Simpson", "Sims", "Singleton", "Skinner",
+ "Slater", "Sloan", "Small", "Snider", "Snow", "Snyder", "Solis",
+ "Solomon", "Sosa", "Soto", "Sparks", "Spears", "Spence", "Spencer",
+ "Stafford", "Stanley", "Stanton", "Stark", "Steele", "Stein",
+ "Stephens", "Stephenson", "Stevens", "Stevenson", "Stewart",
+ "Stokes", "Stone", "Stout", "Strickland", "Strong", "Stuart",
+ "Suarez", "Sullivan", "Summers", "Sutton", "Swanson", "Sweeney",
+ "Sweet", "Sykes", "Talley", "Tanner", "Tate", "Terrell", "Terry",
+ "Thompson", "Thornton", "Tillman", "Todd", "Torres", "Townsend",
+ "Tran", "Travis", "Trevino", "Trujillo", "Tucker", "Turner",
+ "Tyler", "Tyson", "Underwood", "Valdez", "Valencia", "Valentine",
+ "Valenzuela", "Vance", "Vang", "Vargas", "Vasquez", "Vaughan",
+ "Vaughn", "Vazquez", "Vega", "Velasquez", "Velazquez", "Velez",
+ "Van halen", "Vincent", "Vinson", "Wade", "Wagner", "Walker",
+ "Wall", "Wallace", "Waller", "Walls", "Walsh", "Walter", "Walters",
+ "Walton", "Ward", "Ware", "Warner", "Warren", "Washington",
+ "Waters", "Watkins", "Watson", "Watts", "Weaver", "Webb", "Weber",
+ "Webster", "Weeks", "Weiss", "Welch", "Wells", "West", "Wheeler",
+ "Whitaker", "White", "Whitehead", "Whitfield", "Whitley",
+ "Whitney", "Wiggins", "Wilcox", "Wilder", "Wiley", "Wilkerson",
+ "Wilkins", "Wilkinson", "William", "Williamson", "Willis",
+ "Winters", "Wise", "Witt", "Wolf", "Wolfe", "Wong", "Wood",
+ "Woodard", "Woods", "Woodward", "Wooten", "Workman", "Wright",
+ "Wyatt", "Wynn", "Yang", "Yates", "York", "Young", "Zamora",
+ "Zimmerman"
+ };
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/bigtop/blob/d3da8ceb/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/util/BigPetStoreConstants.java
----------------------------------------------------------------------
diff --git a/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/util/BigPetStoreConstants.java b/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/util/BigPetStoreConstants.java
new file mode 100755
index 0000000..29f7c67
--- /dev/null
+++ b/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/util/BigPetStoreConstants.java
@@ -0,0 +1,36 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Static final constants.
+ *
+ * It is useful to have the basic SQL here, as the Hive SQL can vary between
+ * Hive versions; if updated here, it will be updated everywhere.
+ */
+
+package org.apache.bigtop.bigpetstore.util;
+
+public class BigPetStoreConstants {
+
+    /**
+     * Identifiers of the outputs used by BigPetStore.
+     * These should also be recorded in the graphviz file arch.dot.
+     * The note on each constant is the original author's hint about what
+     * it relates to.
+     */
+    public enum OUTPUTS {
+        /** generator */
+        generated,
+        /** pig */
+        cleaned,
+        pig_ad_hoc_script,
+        /** hive view over data for mahout */
+        MAHOUT_CF_IN,
+        /** mahout cf results */
+        MAHOUT_CF_OUT,
+        /** crunch */
+        CUSTOMER_PAGE
+    }
+
+}
http://git-wip-us.apache.org/repos/asf/bigtop/blob/d3da8ceb/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/util/DeveloperTools.java
----------------------------------------------------------------------
diff --git a/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/util/DeveloperTools.java b/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/util/DeveloperTools.java
new file mode 100755
index 0000000..9c2d684
--- /dev/null
+++ b/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/util/DeveloperTools.java
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.bigtop.bigpetstore.util;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.permission.FsPermission;
+import org.apache.hadoop.mapreduce.Job;
+
+/**
+ * Dev utilities for testing arguments etc...
+ */
+public class DeveloperTools {
+
+    /**
+     * Reports, on standard output, the value supplied for each expected
+     * argument. Just some syntactic sugar for good arg error handling.
+     *
+     * For every name in {@code expected} the value at the same position in
+     * {@code args} is printed. At the first position that {@code args} does
+     * not provide (including the case where {@code args} is null) a
+     * diagnostic is printed and reporting stops. Missing arguments are only
+     * reported; no exception is thrown for them.
+     *
+     * @param args the arguments actually received; may be null or shorter
+     *             than {@code expected}
+     * @param expected names of the arguments expected, in positional order;
+     *                 must not be null
+     */
+    public static void validate(String[] args, String... expected) {
+        for (int i = 0; i < expected.length; i++) {
+            // Explicit check instead of catching Throwable: a missing
+            // argument is an ordinary condition, and unrelated errors must
+            // not be swallowed.
+            if (args == null || i >= args.length) {
+                System.out.println("Argument " + i + " not available.");
+                System.out.println("We expected " + expected.length + " arguments for this phase");
+                return;
+            }
+            System.out.println("VALUE OF " + expected[i] + " = " + args[i]);
+        }
+    }
+
+    /**
+     * Entry point that only obtains a logger and discards it.
+     *
+     * @param args ignored
+     */
+    public static void main(String[] args) throws Exception {
+        Log LOG = LogFactory.getLog(Job.class);
+    }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/bigtop/blob/d3da8ceb/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/util/NumericalIdUtils.java
----------------------------------------------------------------------
diff --git a/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/util/NumericalIdUtils.java b/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/util/NumericalIdUtils.java
new file mode 100644
index 0000000..9fa9455
--- /dev/null
+++ b/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/util/NumericalIdUtils.java
@@ -0,0 +1,50 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.bigtop.bigpetstore.util;
+
+import java.math.BigInteger;
+
+import org.apache.bigtop.bigpetstore.generator.TransactionIteratorFactory.STATE;
+
+/**
+ * User and Product IDs need numerical identifiers for recommender
+ * algorithms which attempt to interpolate new products.
+ *
+ * TODO: Delete this class. It's not necessarily required: we might just use
+ * HIVE HASH() as our standard for this.
+ */
+public class NumericalIdUtils {
+
+    /**
+     * Derives a numeric id from a name, optionally prefixed by a state.
+     *
+     * When {@code state} is non-null the hashed text is the state's enum
+     * name, an underscore, and {@code name}; otherwise it is {@code name}
+     * alone. The id is the {@code String.hashCode()} of that text widened
+     * to a long.
+     *
+     * @param state state used as a prefix, or null for no prefix
+     * @param name the raw name; must not be null when {@code state} is null
+     * @return the hash code of the combined text
+     */
+    public static long toId(STATE state, String name) {
+        if (state == null) {
+            return name.hashCode();
+        }
+        String prefixed = state.name() + "_" + name;
+        return prefixed.hashCode();
+    }
+
+    /**
+     * Derives a numeric id from a name with no state prefix.
+     *
+     * @param name the raw name; must not be null
+     * @return the hash code of {@code name}
+     */
+    public static long toId(String name) {
+        return toId((STATE) null, name);
+    }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/bigtop/blob/d3da8ceb/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/util/Pair.java
----------------------------------------------------------------------
diff --git a/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/util/Pair.java b/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/util/Pair.java
new file mode 100644
index 0000000..a96fa44
--- /dev/null
+++ b/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/util/Pair.java
@@ -0,0 +1,125 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.bigtop.bigpetstore.util;
+
+import org.apache.bigtop.bigpetstore.generator.TransactionIteratorFactory;
+
+import java.util.Comparator;
+
+@Deprecated
+public class Pair<S, T> implements Comparable<Pair<S, T>> {
+
+    private final S first;
+    private final T second;
+
+    /**
+     * Creates a pair.
+     *
+     * @param car the first element; may be null
+     * @param cdr the second element; may be null
+     */
+    public Pair(final S car, final T cdr) {
+        first = car;
+        second = cdr;
+    }
+
+    /** @return the first element */
+    public S getFirst() {
+        return first;
+    }
+
+    /** @return the second element */
+    public T getSecond() {
+        return second;
+    }
+
+    /**
+     * Two pairs are equal when both elements are equal; a null element only
+     * equals a null element.
+     */
+    @Override
+    public boolean equals(Object o) {
+        // instanceof is false for null, so this also covers o == null.
+        if (!(o instanceof Pair)) {
+            return false;
+        }
+        Pair<?, ?> that = (Pair<?, ?>) o;
+
+        if (second == null) {
+            if (that.second != null) {
+                return false;
+            }
+            return first == null ? that.first == null : first.equals(that.first);
+        }
+        if (first == null) {
+            return that.first == null && second.equals(that.second);
+        }
+        return first.equals(that.first) && second.equals(that.second);
+    }
+
+    /**
+     * Hash of the first element plus the hash of the second element shifted
+     * left by one; a null element contributes zero.
+     */
+    @Override
+    public int hashCode() {
+        int firstPart = (first == null) ? 0 : first.hashCode();
+        int secondPart = (second == null) ? 0 : (second.hashCode() << 1);
+        return firstPart + secondPart;
+    }
+
+    /**
+     * Orders by first element, then by second element. Both elements are
+     * cast to {@link Comparable}; a null argument sorts before this pair.
+     */
+    @Override
+    public int compareTo(Pair<S, T> p) {
+        if (p == null) {
+            return 1;
+        }
+
+        int byFirst = ((Comparable<S>) first).compareTo(p.first);
+        if (byFirst != 0) {
+            return byFirst;
+        }
+        // The second element is only cast and compared on a first-element tie.
+        return ((Comparable<T>) second).compareTo(p.second);
+    }
+
+    // TODO: Can this be made static? Same with SecondElemComparator?
+    /** Compares pairs by their first element only. */
+    public class FirstElemComparator implements Comparator<Pair<S, T>> {
+        public FirstElemComparator() {
+        }
+
+        public int compare(Pair<S, T> p1, Pair<S, T> p2) {
+            return ((Comparable<S>) p1.first).compareTo(p2.first);
+        }
+    }
+
+    /** Compares pairs by their second element only. */
+    public class SecondElemComparator implements Comparator<Pair<S, T>> {
+        public SecondElemComparator() {
+        }
+
+        public int compare(Pair<S, T> p1, Pair<S, T> p2) {
+            return ((Comparable<T>) p1.second).compareTo(p2.second);
+        }
+    }
+
+    /** Renders the pair as {@code (first, second)}, with null shown as "null". */
+    @Override
+    public String toString() {
+        return "(" + String.valueOf(first) + ", " + String.valueOf(second) + ")";
+    }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/bigtop/blob/d3da8ceb/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/util/PetStoreParseFunctions.java
----------------------------------------------------------------------
diff --git a/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/util/PetStoreParseFunctions.java b/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/util/PetStoreParseFunctions.java
new file mode 100755
index 0000000..7b6bede
--- /dev/null
+++ b/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/util/PetStoreParseFunctions.java
@@ -0,0 +1,55 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.bigtop.bigpetstore.util;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Parses a comma-separated line into a map keyed by a fixed list of column
+ * headers.
+ *
+ * TODO: This might be dead code.
+ */
+public class PetStoreParseFunctions {
+
+    String[] headers = { "code", "city", "country", "lat", "lon" };
+
+    /**
+     * Splits {@code line} on commas and maps each field to the header at
+     * the same position.
+     *
+     * Fields beyond the number of known headers are ignored rather than
+     * causing an {@link ArrayIndexOutOfBoundsException}. A line with fewer
+     * fields yields a map with only the leading headers. As with
+     * {@link String#split(String)}, trailing empty fields are dropped, so a
+     * line consisting only of commas yields an empty map.
+     *
+     * @param line the raw comma-separated line; must not be null
+     * @return a new map from header name to the field's string value
+     */
+    public Map<String, Object> parse(String line) {
+
+        Map<String, Object> resultMap = new HashMap<String, Object>();
+
+        String[] values = line.split(",");
+
+        // Never index past the known headers, even if the line has extras.
+        int columns = Math.min(values.length, headers.length);
+        for (int k = 0; k < columns; k++) {
+            resultMap.put(headers[k], values[k]);
+        }
+
+        return resultMap;
+    }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/bigtop/blob/d3da8ceb/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/util/StringUtils.java
----------------------------------------------------------------------
diff --git a/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/util/StringUtils.java b/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/util/StringUtils.java
new file mode 100644
index 0000000..02399bf
--- /dev/null
+++ b/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/util/StringUtils.java
@@ -0,0 +1,53 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.bigtop.bigpetstore.util;
+
+import java.util.ArrayList;
+
+/**
+ * Minimal string helpers borrowed from apache-commons-lang StringUtils;
+ * over time more elements might be added here.
+ * Keeping them local keeps dependencies on a cluster minor, which is
+ * sometimes easier than putting jars manually on the hadoop classpath or
+ * shipping them via DistributedCache.
+ */
+public class StringUtils {
+
+    /**
+     * Returns the part of {@code str} before the first occurrence of
+     * {@code separator}.
+     *
+     * @param str the string to cut; may be null
+     * @param separator the text to search for; may be null
+     * @return {@code str} unchanged when it is null or empty, when
+     *         {@code separator} is null, or when the separator does not
+     *         occur; otherwise the text preceding the separator (which is
+     *         the empty string for an empty separator)
+     */
+    public static String substringBefore(String str, String separator) {
+        // Null-safe like substringAfter below.
+        if (str == null || str.length() == 0 || separator == null) {
+            return str;
+        }
+        int pos = str.indexOf(separator);
+        if (pos == -1) {
+            return str;
+        }
+        return str.substring(0, pos);
+    }
+
+    /**
+     * Returns the part of {@code str} after the first occurrence of
+     * {@code separator}.
+     *
+     * @param str the string to cut; may be null
+     * @param separator the text to search for; may be null
+     * @return {@code str} unchanged when it is null or empty; the empty
+     *         string when {@code separator} is null or does not occur;
+     *         otherwise the text following the separator
+     */
+    public static String substringAfter(String str, String separator) {
+        if (str == null || str.length() == 0) {
+            return str;
+        }
+        if (separator == null) {
+            return "";
+        }
+        int pos = str.indexOf(separator);
+        if (pos == -1) {
+            return "";
+        }
+        return str.substring(pos + separator.length());
+    }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/bigtop/blob/d3da8ceb/bigtop-bigpetstore/src/main/resources/hive-log4j.properties
----------------------------------------------------------------------
diff --git a/bigtop-bigpetstore/src/main/resources/hive-log4j.properties b/bigtop-bigpetstore/src/main/resources/hive-log4j.properties
new file mode 100755
index 0000000..9236008
--- /dev/null
+++ b/bigtop-bigpetstore/src/main/resources/hive-log4j.properties
@@ -0,0 +1,84 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Define some default values that can be overridden by system properties
+hive.log.threshold=ERROR
+hive.root.logger=ERROR,DRFA
+hive.log.dir=/tmp/${user.name}
+hive.log.file=hive.log
+
+# Define the root logger to the system property "hive.root.logger".
+log4j.rootLogger=${hive.root.logger}, EventCounter, console
+
+# Logging Threshold
+log4j.threshold=${hive.log.threshold}
+
+#
+# Daily Rolling File Appender
+#
+# Use the PidDailyRollingFileAppender class instead if you want to use separate log files
+# for different CLI sessions.
+#
+# log4j.appender.DRFA=org.apache.hadoop.hive.ql.log.PidDailyRollingFileAppender
+
+log4j.appender.DRFA=org.apache.log4j.DailyRollingFileAppender
+
+log4j.appender.DRFA.File=${hive.log.dir}/${hive.log.file}
+
+# Rollover at midnight
+log4j.appender.DRFA.DatePattern=.yyyy-MM-dd
+
+# 30-day backup
+#log4j.appender.DRFA.MaxBackupIndex=30
+log4j.appender.DRFA.layout=org.apache.log4j.PatternLayout
+
+# Pattern format: Date LogLevel LoggerName LogMessage
+#log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n
+# Debugging Pattern format
+log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n
+
+
+#
+# console
+# Add "console" to rootlogger above if you want to use this
+#
+
+log4j.appender.console=org.apache.log4j.ConsoleAppender
+log4j.appender.console.target=System.err
+log4j.appender.console.layout=org.apache.log4j.PatternLayout
+log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n
+log4j.appender.console.encoding=UTF-8
+
+#custom logging levels
+#log4j.logger.xxx=DEBUG
+
+#
+# Event Counter Appender
+# Sends counts of logging messages at different severity levels to Hadoop Metrics.
+#
+log4j.appender.EventCounter=org.apache.hadoop.metrics.jvm.EventCounter
+
+
+log4j.category.DataNucleus=OFF
+log4j.category.Datastore=OFF
+log4j.category.Datastore.Schema=OFF
+log4j.category.JPOX.Datastore=OFF
+log4j.category.JPOX.Plugin=OFF
+log4j.category.JPOX.MetaData=OFF
+log4j.category.JPOX.Query=OFF
+log4j.category.JPOX.General=OFF
+log4j.category.JPOX.Enhancer=OFF
+
http://git-wip-us.apache.org/repos/asf/bigtop/blob/d3da8ceb/bigtop-bigpetstore/src/main/resources/hive-site.xml
----------------------------------------------------------------------
diff --git a/bigtop-bigpetstore/src/main/resources/hive-site.xml b/bigtop-bigpetstore/src/main/resources/hive-site.xml
new file mode 100644
index 0000000..dd96f32
--- /dev/null
+++ b/bigtop-bigpetstore/src/main/resources/hive-site.xml
@@ -0,0 +1,36 @@
+<?xml version="1.0"?>
+
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
+<configuration>
+
+<!-- Hive Configuration can either be stored in this file or in the hadoop configuration files -->
+<!-- that are implied by Hadoop setup variables. -->
+<!-- Aside from Hadoop setup variables - this file is provided as a convenience so that Hive -->
+<!-- users do not have to edit hadoop configuration files (that may be managed as a centralized -->
+<!-- resource). -->
+
+<!-- Hive Execution Parameters -->
+
+<property>
+ <name>javax.jdo.option.ConnectionURL</name>
+ <!-- value>jdbc:derby:;databaseName=/var/lib/hive/metastore/metastore_db;create=true</value -->
+ <value>jdbc:derby:;databaseName=/tmp/metastore/metastore_db;create=true</value>
+ <description>JDBC connect string for a JDBC metastore</description>
+</property>
+
+<property>
+ <name>hive.metastore.warehouse.dir</name>
+ <value>/tmp</value>
+ <description>Location of the default database for the warehouse</description>
+</property>
+
+
+<property>
+ <name>javax.jdo.option.ConnectionDriverName</name>
+ <value>org.apache.derby.jdbc.EmbeddedDriver</value>
+ <description>Driver class name for a JDBC metastore</description>
+</property>
+
+
+</configuration>
http://git-wip-us.apache.org/repos/asf/bigtop/blob/d3da8ceb/bigtop-bigpetstore/src/test/java/org/apache/bigtop/bigpetstore/docs/TestDocs.java
----------------------------------------------------------------------
diff --git a/bigtop-bigpetstore/src/test/java/org/apache/bigtop/bigpetstore/docs/TestDocs.java b/bigtop-bigpetstore/src/test/java/org/apache/bigtop/bigpetstore/docs/TestDocs.java
new file mode 100644
index 0000000..883bb55
--- /dev/null
+++ b/bigtop-bigpetstore/src/test/java/org/apache/bigtop/bigpetstore/docs/TestDocs.java
@@ -0,0 +1,46 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.bigtop.bigpetstore.docs;
+
+import java.io.File;
+
+import junit.framework.Assert;
+
+import org.apache.bigtop.bigpetstore.util.BigPetStoreConstants;
+import org.apache.bigtop.bigpetstore.util.BigPetStoreConstants.OUTPUTS;
+import org.apache.commons.io.FileUtils;
+import org.junit.Test;
+
+public class TestDocs {
+
+    /**
+     * Reads the graphviz file arch.dot from the working directory, echoes
+     * it, and checks that it mentions the names of the "generated" and
+     * "cleaned" outputs.
+     */
+    @Test
+    public void testGraphViz() throws Exception {
+        File dotFile = new File("arch.dot");
+        String graphviz = FileUtils.readFileToString(dotFile);
+        System.out.println(graphviz);
+
+        // Grep the constants out of the graphviz text, in the same order.
+        OUTPUTS[] documented = { OUTPUTS.generated, OUTPUTS.cleaned };
+        for (OUTPUTS output : documented) {
+            org.junit.Assert.assertTrue(graphviz.contains(output.name()));
+        }
+    }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/bigtop/blob/d3da8ceb/bigtop-bigpetstore/src/test/java/org/apache/bigtop/bigpetstore/generator/TestNumericalIdUtils.java
----------------------------------------------------------------------
diff --git a/bigtop-bigpetstore/src/test/java/org/apache/bigtop/bigpetstore/generator/TestNumericalIdUtils.java b/bigtop-bigpetstore/src/test/java/org/apache/bigtop/bigpetstore/generator/TestNumericalIdUtils.java
new file mode 100644
index 0000000..c68d471
--- /dev/null
+++ b/bigtop-bigpetstore/src/test/java/org/apache/bigtop/bigpetstore/generator/TestNumericalIdUtils.java
@@ -0,0 +1,36 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.bigtop.bigpetstore.generator;
+
+import junit.framework.Assert;
+
+import org.apache.bigtop.bigpetstore.generator.TransactionIteratorFactory.STATE;
+import org.apache.bigtop.bigpetstore.util.NumericalIdUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.junit.Test;
+
+public class TestNumericalIdUtils {
+
+    /**
+     * The same person name prefixed with two different state names must
+     * produce two different ids.
+     */
+    @Test
+    public void testName() {
+        final String person = "jay vyas";
+        long id = NumericalIdUtils.toId(STATE.OK.name() + "_" + person);
+        long id2 = NumericalIdUtils.toId(STATE.CO.name() + "_" + person);
+        System.out.println(id + " " + id2);
+        Assert.assertFalse(id == id2);
+    }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/bigtop/blob/d3da8ceb/bigtop-bigpetstore/src/test/java/org/apache/bigtop/bigpetstore/generator/TestPetStoreTransactionGeneratorJob.java
----------------------------------------------------------------------
diff --git a/bigtop-bigpetstore/src/test/java/org/apache/bigtop/bigpetstore/generator/TestPetStoreTransactionGeneratorJob.java b/bigtop-bigpetstore/src/test/java/org/apache/bigtop/bigpetstore/generator/TestPetStoreTransactionGeneratorJob.java
new file mode 100755
index 0000000..d1a60b3
--- /dev/null
+++ b/bigtop-bigpetstore/src/test/java/org/apache/bigtop/bigpetstore/generator/TestPetStoreTransactionGeneratorJob.java
@@ -0,0 +1,106 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.bigtop.bigpetstore.generator;
+
+import java.io.BufferedReader;
+import java.io.DataInputStream;
+import java.io.InputStreamReader;
+import java.lang.management.ManagementFactory;
+import java.util.Date;
+
+import junit.framework.Assert;
+
+import org.apache.bigtop.bigpetstore.generator.BPSGenerator;
+import org.apache.bigtop.bigpetstore.generator.BPSGenerator.props;
+import org.apache.bigtop.bigpetstore.generator.TransactionIteratorFactory.STATE;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.Job;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Runs the BigPetStore generator job against the local file system and
+ * inspects the file it writes.
+ *
+ * run this test with vm options -XX:MaxPermSize=256m -Xms512m -Xmx1024m
+ *
+ */
+public class TestPetStoreTransactionGeneratorJob {
+
+    final static Logger log = LoggerFactory
+            .getLogger(TestPetStoreTransactionGeneratorJob.class);
+
+    /**
+     * Generates a small number of records with {@link BPSGenerator}, reads
+     * the {@code part-r-00000} output file back line by line, and asserts
+     * that the expected number of records was written and that the CT and
+     * AZ states both appear.
+     *
+     * @throws Exception
+     *             if the job cannot be created or run, or the output cannot
+     *             be read
+     */
+    @Test
+    public void test() throws Exception {
+
+        System.out.println("memory : " + Runtime.getRuntime().freeMemory()
+                / 1000000);
+        int records = 20;
+
+        // Setup configuration with prop.
+        Configuration c = new Configuration();
+        c.setInt(props.bigpetstore_records.name(), records);
+
+        // Run the job. Fail right here if it did not succeed, rather than
+        // later with a confusing "file not found" on the output.
+        Path output = new Path("petstoredata/" + (new Date()).toString());
+        Job createInput = BPSGenerator.createJob(output, c);
+        createInput.submit();
+        System.out.println(createInput);
+        Assert.assertTrue("generator job did not complete successfully",
+                createInput.waitForCompletion(true));
+
+        FileSystem fs = FileSystem.getLocal(new Configuration());
+        Path part = new Path(output, "part-r-00000");
+
+        int recordsSeen = 0;
+        boolean CTseen = false;
+        boolean AZseen = false;
+
+        // Read the output file line by line; confirm that both CT and AZ
+        // are seen in the outputs.
+        DataInputStream f = fs.open(part);
+        BufferedReader br = new BufferedReader(new InputStreamReader(f));
+        try {
+            String s;
+            // readLine() returning null is the end-of-stream signal;
+            // ready() only says whether a read would block.
+            while ((s = br.readLine()) != null) {
+                System.out.println("===>" + s);
+                recordsSeen++;
+                if (s.contains(STATE.CT.name())) {
+                    CTseen = true;
+                }
+                if (s.contains(STATE.AZ.name())) {
+                    AZseen = true;
+                }
+            }
+        } finally {
+            // Closing the reader also closes the underlying stream.
+            br.close();
+        }
+
+        // records seen should = 20
+        Assert.assertEquals(records, recordsSeen);
+        // Assert that a couple of the states are seen (todo make it
+        // comprehensive for all states).
+        Assert.assertTrue(CTseen);
+        Assert.assertTrue(AZseen);
+        log.info("Created " + records + " , file was "
+                + fs.getFileStatus(part).getLen()
+                + " bytes.");
+    }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/bigtop/blob/d3da8ceb/bigtop-bigpetstore/src/test/resources/log4j.properties
----------------------------------------------------------------------
diff --git a/bigtop-bigpetstore/src/test/resources/log4j.properties b/bigtop-bigpetstore/src/test/resources/log4j.properties
new file mode 100644
index 0000000..1e33093
--- /dev/null
+++ b/bigtop-bigpetstore/src/test/resources/log4j.properties
@@ -0,0 +1,47 @@
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+hadoop.root.logger=INFO,console
+hadoop.log.dir=.
+hadoop.log.file=hadoop.log
+
+#
+# Job Summary Appender
+#
+# Use following logger to send summary to separate file defined by
+# hadoop.mapreduce.jobsummary.log.file rolled daily:
+# hadoop.mapreduce.jobsummary.logger=INFO,JSA
+#
+hadoop.mapreduce.jobsummary.logger=${hadoop.root.logger}
+hadoop.mapreduce.jobsummary.log.file=hadoop-mapreduce.jobsummary.log
+log4j.appender.console=org.apache.log4j.ConsoleAppender
+log4j.appender.EventCounter=org.apache.log4j.ConsoleAppender
+log4j.appender.EventCounter.layout=org.apache.log4j.PatternLayout
+# Define the root logger to the system property "hadoop.root.logger".
+log4j.rootLogger=${hadoop.root.logger}, EventCounter
+log4j.appender.console.layout=org.apache.log4j.PatternLayout
+# Logging Threshold
+log4j.threshold=ALL
+
+#
+# Daily Rolling File Appender
+#
+
+log4j.appender.DRFA=org.apache.log4j.DailyRollingFileAppender
+log4j.appender.DRFA.File=${hadoop.log.dir}/${hadoop.log.file}
+
+# Rollover at midnight
+log4j.appender.DRFA.DatePattern=.yyyy-MM-dd
+
+# 30-day backup
+#log4j.appender.DRFA.MaxBackupIndex=30
+log4j.appender.DRFA.layout=org.apache.log4j.PatternLayout
[2/2] git commit: BigPetStore - initial code drop - (lines commited
by others : nigelsavage~200, mattfenwick~200, michaelmcune~5, anushshetty~10,
jeffvance~10)
Posted by ma...@apache.org.
BigPetStore - initial code drop - (lines commited by others : nigelsavage~200,mattfenwick~200,michaelmcune~5,anushshetty~10,jeffvance~10)
Project: http://git-wip-us.apache.org/repos/asf/bigtop/repo
Commit: http://git-wip-us.apache.org/repos/asf/bigtop/commit/d3da8ceb
Tree: http://git-wip-us.apache.org/repos/asf/bigtop/tree/d3da8ceb
Diff: http://git-wip-us.apache.org/repos/asf/bigtop/diff/d3da8ceb
Branch: refs/heads/master
Commit: d3da8ceb165ea8692a5432224ef4b116476498be
Parents: 3298063
Author: jayunit100 <ja...@gmail.com>
Authored: Mon Apr 14 21:28:45 2014 -0400
Committer: Sean Mackrory <ma...@apache.org>
Committed: Tue Apr 15 18:29:53 2014 -0600
----------------------------------------------------------------------
bigtop-bigpetstore/BPS_analytics.pig | 77 ++
bigtop-bigpetstore/README.md | 140 ++++
bigtop-bigpetstore/arch.dot | 44 +
bigtop-bigpetstore/pom.xml | 797 +++++++++++++++++++
bigtop-bigpetstore/setuphive.sh | 22 +
.../bigtop/bigpetstore/BigPetStoreHiveIT.java | 108 +++
.../bigtop/bigpetstore/BigPetStoreMahoutIT.java | 88 ++
.../bigtop/bigpetstore/BigPetStorePigIT.java | 165 ++++
.../org/apache/bigtop/bigpetstore/ITUtils.java | 145 ++++
.../bigpetstore/clustering/BPSRecommnder.java | 83 ++
.../contract/PetStoreStatistics.java | 34 +
.../bigtop/bigpetstore/etl/CrunchETL.java | 142 ++++
.../bigtop/bigpetstore/etl/HiveViewCreator.java | 157 ++++
.../apache/bigtop/bigpetstore/etl/LineItem.java | 112 +++
.../bigtop/bigpetstore/etl/PigCSVCleaner.java | 171 ++++
.../bigpetstore/generator/BPSGenerator.java | 116 +++
...GeneratePetStoreTransactionsInputFormat.java | 134 ++++
.../generator/PetStoreTransaction.java | 32 +
.../PetStoreTransactionInputSplit.java | 67 ++
.../generator/TransactionIteratorFactory.java | 468 +++++++++++
.../bigpetstore/util/BigPetStoreConstants.java | 36 +
.../bigtop/bigpetstore/util/DeveloperTools.java | 58 ++
.../bigpetstore/util/NumericalIdUtils.java | 50 ++
.../apache/bigtop/bigpetstore/util/Pair.java | 125 +++
.../util/PetStoreParseFunctions.java | 55 ++
.../bigtop/bigpetstore/util/StringUtils.java | 53 ++
.../src/main/resources/hive-log4j.properties | 84 ++
.../src/main/resources/hive-site.xml | 36 +
.../bigtop/bigpetstore/docs/TestDocs.java | 46 ++
.../generator/TestNumericalIdUtils.java | 36 +
.../TestPetStoreTransactionGeneratorJob.java | 106 +++
.../src/test/resources/log4j.properties | 47 ++
32 files changed, 3834 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/bigtop/blob/d3da8ceb/bigtop-bigpetstore/BPS_analytics.pig
----------------------------------------------------------------------
diff --git a/bigtop-bigpetstore/BPS_analytics.pig b/bigtop-bigpetstore/BPS_analytics.pig
new file mode 100755
index 0000000..44ed541
--- /dev/null
+++ b/bigtop-bigpetstore/BPS_analytics.pig
@@ -0,0 +1,77 @@
+----------------------------------------------------------------------------
+-- Licensed to the Apache Software Foundation (ASF) under one or more
+-- contributor license agreements. See the NOTICE file distributed with
+-- this work for additional information regarding copyright ownership.
+-- The ASF licenses this file to You under the Apache License, Version 2.0
+-- (the "License"); you may not use this file except in compliance with
+-- the License. You may obtain a copy of the License at
+-- http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing, software
+-- distributed under the License is distributed on an "AS IS" BASIS,
+-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+-- See the License for the specific language governing permissions and
+-- limitations under the License.
+-----------------------------------------------------------------------------
+
+-- This is the analytics script that BigPetStore uses as an example for
+-- demos of how to do ad-hoc analytics on the cleaned transaction data.
+-- It is used in conjunction with the big pet store web app, soon to be
+-- added to Apache Bigtop (as of 4/12/2014, the
+-- corresponding web app that consumes this script's output is
+-- in jayunit100.github.io/bigpetstore).
+
+-- invoke with two arguments, the input file , and the output file. -input /bps/gen -output /bps/analytics
+
+-- FYI...
+-- If you run into errors, you can see them in
+-- ./target/failsafe-reports/TEST-org.bigtop.bigpetstore.integration.BigPetStorePigIT.xml
+
+-- First, we load data in from a file, as tuples.
+-- In Pig, relations are like tables in a relational database,
+-- so each relation is just a bunch of tuples.
+-- In this case csvdata will be a relation,
+-- where each tuple is a single petstore transaction.
+csvdata =
+ LOAD '$input' using PigStorage()
+ AS (
+ dump:chararray,
+ state:chararray,
+ transaction:int,
+ fname:chararray,
+ lname:chararray,
+ date:chararray,
+ price:float,
+ product:chararray);
+
+-- RESULT:
+-- (BigPetStore,storeCode_AK,1,jay,guy,Thu Dec 18 12:17:10 EST 1969,10.5,dog-food)
+-- ...
+
+-- Okay! Now lets group our data so we can do some stats.
+-- lets create a new relation,
+-- where each tuple will contain all transactions for a product in a state.
+
+state_product = group csvdata by ( state, product ) ;
+
+-- RESULT
+-- ((storeCode_AK,dog-food) , {(BigPetStore,storeCode_AK,1,jay,guy,Thu Dec 18 12:17:10 EST 1969,10.5,dog-food)}) --
+-- ...
+
+
+-- Okay now lets make some summary stats so that the boss man can
+-- decide which products are hottest in which states.
+
+-- Note that for the "groups", we tease out each individual field here for formatting with
+-- the BigPetStore visualization app.
+summary1 = FOREACH state_product generate STRSPLIT(group.state,'_').$1 as sp, group.product, COUNT($1);
+
+
+-- Okay, the stats look like this. Lets clean them up.
+-- (storeCode_AK,cat-food) 2530
+-- (storeCode_AK,dog-food) 2540
+-- (storeCode_AK,fuzzy-collar) 2495
+
+dump summary1;
+
+store summary1 into '$output';
http://git-wip-us.apache.org/repos/asf/bigtop/blob/d3da8ceb/bigtop-bigpetstore/README.md
----------------------------------------------------------------------
diff --git a/bigtop-bigpetstore/README.md b/bigtop-bigpetstore/README.md
new file mode 100644
index 0000000..95245a8
--- /dev/null
+++ b/bigtop-bigpetstore/README.md
@@ -0,0 +1,140 @@
+(See accompanying source code for licensing information)
+
+BigPetStore
+============
+
+test mvn deploy1
+
+Apache Bigtop/Hadoop Ecosystem Demo
+-----------------------------------
+This software is created to demonstrate Apache Bigtop for processing
+big data sets.
+
+Architecture
+------------
+The application consists of the following modules
+
+* generator: generates raw data on the dfs
+* clustering: Apache Mahout demo code for processing the data using Itembased Collaborative Filtering
+* Pig: demo code for processing the data using Apache Pig
+* Hive: demo code for processing the data using Apache Hive
+* Crunch: demo code for processing the data using Apache Crunch
+
+Build Instructions
+------------------
+
+* BUILD THE JAR
+
+ "mvn clean package" will build the bigpetstore jar
+
+* Run Integration tests with
+
+ * Pig profile: mvn clean verify -P pig
+ * Crunch profile: mvn clean verify -P crunch
+ * Hive profile:
+ * First, see and run the setuphive.sh script. Read it and try to
+ understand what it does.
+
+ * mvn clean verify -P hive
+
+For Eclipse Users
+-----------------
+
+1) run "mvn eclipse:eclipse" to create an IDE loadable project.
+
+2) open .classpath and add
+ `<classpathentry kind="src" path="src/integration/java" including="**/*.java"/>`
+
+3) import the project into eclipse
+
+
+High level summary
+------------------
+
+
+The bigpetstore project exemplifies the hadoop ecosystem for newcomers, and also for benchmarking and
+comparing functional space of tools.
+
+The end goal is to run many different implementations of each phase
+using different tools, thus exemplifying overlap of tools in the hadoop ecosystem, and allowing people to benchmark/compare tools
+using a common framework and easily understood use case
+
+
+How it works (To Do)
+--------------------
+
+* Phase 1: Generating pet store data:
+
+The first step is to generate a raw data set. This is done by the "GeneratePetStoreTransactionsInputFormat":
+
+The first MapReduce job in the pipeline runs a simple job which takes this input format and forwards
+its output. The result is a list of "transactions". Each transaction is a tuple of the format
+
+ *{state,name,date,price,product}.*
+
+* Phase 2: Processing the data
+
+The next phase of the application processes the data to create basic aggregations.
+For example with both pig and hive these could easily include
+
+ *Number of transactions by state* or
+ *Most valuable customer by state* or
+ *Most popular items by state*
+
+
+* Phase 3: Clustering the states by all fields
+
+ Now, say we want to cluster the states, so as to put different states into different buying categories
+ for our marketing team to deal with differently.
+
+* Phase 4: Visualizing the Data in D3.
+
+ - try it [on the gh-pages branch](http://jayunit100.github.io/bigpetstore/)
+
+Running on a hadoop cluster
+---------------------------
+
+wget s3://bigpetstore/bigpetstore.jar
+
+hadoop jar bigpetstore.jar org.apache.bigtop.bigpetstore.generator.BPSGenerator 1000000 bigpetstore/gen
+
+hadoop jar bigpetstore.jar org.apache.bigtop.bigpetstore.etl.PigCSVCleaner bigpetstore/gen/ bigpetstore/pig/ custom_pigscript.pig
+... (will add more steps as we add more phases to the workflow) ...
+
+
+Example of running in EMR
+--------------------------
+- Put the jar in s3. Right now there is a copy of it at the url below.
+
+- Download the elastic-mapreduce ruby shell script.
+create your "credentials.json" file.
+
+Now run this to generate 1,000,000 pet store transactions:
+
+./elastic-mapreduce --create --jar s3://bigpetstore/bigpetstore.jar \
+--main-class org.apache.bigtop.bigpetstore.generator.BPSGenerator \
+--num-instances 10 \
+--arg 1000000 \
+--arg s3://bigpetstore/data/generated \
+--hadoop-version "2.2.0" \
+--master-instance-type m1.medium \
+--slave-instance-type m1.medium
+
+...Now lets clean the data with pig...
+
+Replace the above "main-class", and "--arg" options with
+--main-class org.apache.bigtop.bigpetstore.etl.PigCSVCleaner
+--arg s3://bigpetstore/data/generated
+--arg s3://bigpetstore/data/pig_out
+(optional, you can send a script referencing the cleaned $input path to do some
+custom analytics; see the BPS_analytics.pig script and companion
+http://jayunit100.github.io/bigpetstore as an example).
+--arg s3://path_to_custom_analytics_script.pig
+
+(note about pig: We support custom pig scripts. For EMR, custom pig scripts will need to point to a
+local path, so you'll have to put that script on the machine as part
+of EMR setup w/ a custom script).
+
+...
+
+And so on.
http://git-wip-us.apache.org/repos/asf/bigtop/blob/d3da8ceb/bigtop-bigpetstore/arch.dot
----------------------------------------------------------------------
diff --git a/bigtop-bigpetstore/arch.dot b/bigtop-bigpetstore/arch.dot
new file mode 100644
index 0000000..4eb8ac4
--- /dev/null
+++ b/bigtop-bigpetstore/arch.dot
@@ -0,0 +1,44 @@
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+digraph bigpetstore {
+
+ // Data flow of the BigPetStore pipeline. Record labels are split into
+ // fields with '|'; edge labels give the command that runs that step.
+
+ node [shape=record];
+
+ PIG_ANALYTICS [label="PIG_ANALYTICS|Unstructured-unsupported-pigscripts| pig_ad_hoc(0-n)"];
+
+ CUSTOMER_PAGE [label="CUSTOMER_PAGE|json|CUSTOMER_PAGE/part*"];
+ DIRTY_CSV [label="DIRTY_CSV|fname lname -prod , price ,prod,..|generated/part*"];
+ CSV [label="CSV|fname,lname,prod,price,date,xcoord,ycoord,...|cleaned/part*"];
+ MAHOUT_VIEW_INPUT [label="MAHOUT_VIEW | (hashed name) 10001, (hashed purchases) 203 | <hive_warehouse>/mahout_cf_in/part*" ];
+ MAHOUT_CF [label="MAHOUT_CF | (hashed name) 10001, (hashed product) 201, .6 | mahout_cf_out/part*" ];
+
+ Generate -> DIRTY_CSV [label="hadoop jar bigpetstore.jar org.apache.bigtop.bigpetstore.generator.BPSGenerator 100 bps/generated/"] ;
+ DIRTY_CSV -> pig [label=""];
+
+ pig -> CSV [label="hadoop jar bigpetstore.jar org.apache.bigtop.bigpetstore.etl.PigCSVCleaner bps/generated/ bps/cleaned/"];
+ pig -> PIG_ANALYTICS [label="same as CSV job, but add your scripts to end... p1.pig p2.pig ..."];
+ PIG_ANALYTICS -> CSV;
+ PROD_HASH -> hive [label="hive hash udf"];
+ USER_HASH -> hive [label="hive hash udf"];
+
+ CSV -> hive ;
+ hive -> MAHOUT_VIEW_INPUT [label="hadoop jar bigpetstore.jar org.apache.bigtop.bigpetstore.etl.HiveViewCreator bps/pig_out mahout_cf_in"];
+ MAHOUT_VIEW_INPUT -> mahout_collab_filter_recomender -> MAHOUT_CF;
+ MAHOUT_CF -> crunch ;
+ CSV -> crunch ;
+ crunch -> CUSTOMER_PAGE [label="high performance joining"];
+
+}
http://git-wip-us.apache.org/repos/asf/bigtop/blob/d3da8ceb/bigtop-bigpetstore/pom.xml
----------------------------------------------------------------------
diff --git a/bigtop-bigpetstore/pom.xml b/bigtop-bigpetstore/pom.xml
new file mode 100644
index 0000000..0bc226e
--- /dev/null
+++ b/bigtop-bigpetstore/pom.xml
@@ -0,0 +1,797 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.bigtop</groupId>
+ <artifactId>bigtop</artifactId>
+ <version>0.8.0-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+
+ <artifactId>BigPetStore</artifactId>
+
+ <properties>
+ <!-- Encodings and dependency versions referenced as ${...} elsewhere in this POM. -->
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
+ <slf4j.version>1.7.5</slf4j.version>
+ <guava.version>15.0</guava.version>
+ <hadoop.version>2.2.0</hadoop.version>
+ <hive.version>0.12.0</hive.version>
+ <datanucleus.version>3.2.2</datanucleus.version>
+ <datanucleus.jpa.version>3.2.1</datanucleus.jpa.version>
+ <bonecp.version>0.8.0.RELEASE</bonecp.version>
+ <derby.version>10.10.1.1</derby.version>
+ </properties>
+
+ <dependencies>
+
+ <dependency>
+ <groupId>org.kohsuke</groupId>
+ <artifactId>graphviz-api</artifactId>
+ <version>1.0</version>
+ </dependency>
+
+ <!-- CRUNCH : These are repeated in the profile and necessary for compilation
+ even without the profile -->
+ <dependency>
+ <groupId>org.apache.crunch</groupId>
+ <artifactId>crunch-core</artifactId>
+ <version>0.9.0-hadoop2</version>
+ </dependency>
+
+ <!-- misc deps -->
+ <dependency>
+ <groupId>com.jolbox</groupId>
+ <artifactId>bonecp</artifactId>
+ <version>${bonecp.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.derby</groupId>
+ <artifactId>derby</artifactId>
+ <version>${derby.version}</version>
+ </dependency>
+ <!-- <dependency> <groupId>org.apache.commons</groupId> <artifactId>commons-lang3</artifactId>
+ <version>3.1</version> </dependency> -->
+
+ <dependency>
+ <groupId>com.google.guava</groupId>
+ <artifactId>guava</artifactId>
+ <!-- Use the shared property so this stays in step with the profiles. -->
+ <version>${guava.version}</version>
+ </dependency>
+
+ <!--
+ We keep this at top level so that mvn eclipse:eclipse creates a nice
+ tidy project, but its a little messy. later we'll create a profile for
+ eclipse and move this (and other deps) into profiles as needed.
+ Important: Remove this dependency when running hive integration tests...
+ -->
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-client</artifactId>
+ <version>${hadoop.version}</version>
+ </dependency>
+ <!-- mahout deps : may need to turn these on/off when testing mahout locally-->
+
+ <dependency> <groupId>org.apache.mahout</groupId> <artifactId>mahout-core</artifactId>
+ <version>0.9</version> <exclusions> </exclusions> </dependency>
+ <!-- pig deps -->
+ <dependency>
+ <groupId>org.apache.pig</groupId>
+ <artifactId>pig</artifactId>
+ <classifier>h2</classifier>
+ <version>0.12.0</version>
+ <scope>provided</scope>
+ </dependency>
+
+ <!--logging -->
+
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-api</artifactId>
+ <version>${slf4j.version}</version>
+ </dependency>
+
+ <!-- SL4J Binding provided at runtime -->
+ <dependency>
+ <groupId>log4j</groupId>
+ <artifactId>log4j</artifactId>
+ <version>1.2.12</version>
+ <scope>provided</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-log4j12</artifactId>
+ <version>${slf4j.version}</version>
+ <scope>provided</scope>
+ </dependency>
+
+ <!-- hive -->
+ <dependency>
+ <groupId>org.apache.hive</groupId>
+ <artifactId>hive-common</artifactId>
+ <version>${hive.version}</version>
+ <scope>provided</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hive</groupId>
+ <artifactId>hive-serde</artifactId>
+ <version>${hive.version}</version>
+ <scope>provided</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hive</groupId>
+ <artifactId>hive-jdbc</artifactId>
+ <version>${hive.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hive</groupId>
+ <artifactId>hive-contrib</artifactId>
+ <version>${hive.version}</version>
+ </dependency>
+
+ <!-- datanucleus -->
+ <dependency>
+ <groupId>org.datanucleus</groupId>
+ <artifactId>datanucleus-core</artifactId>
+ <version>${datanucleus.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.datanucleus</groupId>
+ <artifactId>datanucleus-rdbms</artifactId>
+ <version>${datanucleus.jpa.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.datanucleus</groupId>
+ <artifactId>datanucleus-api-jdo</artifactId>
+ <version>${datanucleus.jpa.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.datanucleus</groupId>
+ <artifactId>datanucleus-accessplatform-jdo-rdbms</artifactId>
+ <version>${datanucleus.jpa.version}</version>
+ <type>pom</type>
+ </dependency>
+
+ <!-- Unit test artifacts -->
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <version>4.11</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.hamcrest</groupId>
+ <artifactId>hamcrest-all</artifactId>
+ <version>1.3</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.mrunit</groupId>
+ <artifactId>mrunit</artifactId>
+ <version>1.0.0</version>
+ <classifier>hadoop2</classifier>
+ </dependency>
+
+ </dependencies>
+
+ <build>
+ <extensions>
+ <extension>
+ <groupId>org.springframework.build.aws</groupId>
+ <artifactId>org.springframework.build.aws.maven</artifactId>
+ <version>3.0.0.RELEASE</version>
+ </extension>
+ </extensions>
+ <finalName>bigpetstore-${project.version}</finalName>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-release-plugin</artifactId>
+ <version>2.5</version>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-eclipse-plugin</artifactId>
+ <configuration>
+ <downloadSources>true</downloadSources>
+ <downloadJavadocs>true</downloadJavadocs>
+ </configuration>
+ </plugin>
+
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-compiler-plugin</artifactId>
+ <version>2.3.2</version>
+ <configuration>
+ <source>1.6</source>
+ <target>1.6</target>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-jar-plugin</artifactId>
+ <version>2.4</version>
+ <configuration>
+ <outputDirectory>${basedir}/target</outputDirectory>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-surefire-plugin</artifactId>
+ <configuration>
+ <excludes>
+ <exclude>**/*TestPig.java</exclude>
+ <exclude>**/*TestHiveEmbedded.java</exclude>
+ <exclude>**/*TestCrunch.java</exclude>
+ </excludes>
+ </configuration>
+ </plugin>
+ </plugins>
+ </build>
+
+ <profiles>
+ <profile>
+ <id>pig</id>
+ <activation>
+ <activeByDefault>false</activeByDefault>
+ </activation>
+ <properties>
+ <skip.unit.tests>false</skip.unit.tests>
+ </properties>
+ <dependencies>
+ <!-- misc -->
+ <dependency>
+ <groupId>org.apache.commons</groupId>
+ <artifactId>commons-lang3</artifactId>
+ <version>3.1</version>
+ </dependency>
+ <dependency>
+ <groupId>joda-time</groupId>
+ <artifactId>joda-time</artifactId>
+ <version>2.3</version>
+ </dependency>
+ <dependency>
+ <groupId>com.google.guava</groupId>
+ <artifactId>guava</artifactId>
+ <version>${guava.version}</version>
+ </dependency>
+
+ <!-- pig -->
+ <dependency>
+ <groupId>org.apache.pig</groupId>
+ <artifactId>pig</artifactId>
+ <classifier>h2</classifier>
+ <version>0.12.0</version>
+ <scope>provided</scope>
+ </dependency>
+
+ <!-- hadoop -->
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-client</artifactId>
+ <version>${hadoop.version}</version>
+ </dependency>
+ <!-- <dependency> <groupId>org.apache.mrunit</groupId> <artifactId>mrunit</artifactId>
+ <version>1.0.0</version> <classifier>hadoop2</classifier> </dependency> -->
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-surefire-plugin</artifactId>
+ <configuration>
+
+ <excludes>
+ <exclude>**/*TestPig.java</exclude>
+ <exclude>**/*TestHiveEmbedded.java</exclude>
+ <exclude>**/*TestCrunch.java</exclude>
+ <exclude>**/*TestPetStoreTransactionGeneratorJob.java</exclude>
+ </excludes>
+
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.codehaus.mojo</groupId>
+ <artifactId>build-helper-maven-plugin</artifactId>
+ <version>1.5</version>
+ <executions>
+ <execution>
+ <id>add-test-source</id>
+ <phase>generate-test-sources</phase>
+ <goals>
+ <goal>add-test-source</goal>
+ </goals>
+ <configuration>
+ <sources>
+ <source>src/integration/java</source>
+ </sources>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-failsafe-plugin</artifactId>
+ <version>2.12</version>
+
+ <configuration>
+ <argLine>-Xmx1g</argLine>
+ <excludes>
+ <exclude>**/*BigPetStoreMahoutIT.java</exclude>
+ <exclude>**/*BigPetStoreHiveIT.java</exclude>
+ <exclude>**/*BigPetStoreCrunchIT.java</exclude>
+ </excludes>
+ </configuration>
+ <executions>
+ <!-- States that both integration-test and verify goals of the Failsafe
+ Maven plugin are executed. -->
+ <execution>
+ <id>integration-tests</id>
+ <goals>
+ <goal>integration-test</goal>
+ <goal>verify</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
+ </profile>
+
+ <profile>
+ <id>hive</id>
+ <activation>
+ <activeByDefault>false</activeByDefault>
+ </activation>
+ <properties>
+ <!-- Versions used by the hive profile's dependencies. -->
+ <hive.version>0.12.0</hive.version>
+ <datanucleus.version>3.2.2</datanucleus.version>
+ <datanucleus.jpa.version>3.2.1</datanucleus.jpa.version>
+ <bonecp.version>0.8.0.RELEASE</bonecp.version>
+ <derby.version>10.10.1.1</derby.version>
+ <skip.unit.tests>false</skip.unit.tests>
+ </properties>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-surefire-plugin</artifactId>
+ <configuration>
+
+ <excludes>
+ <exclude>**/*TestPig.java</exclude>
+ <exclude>**/*TestHiveEmbedded.java</exclude>
+ <exclude>**/*TestCrunch.java</exclude>
+ <exclude>**/*TestPetStoreTransactionGeneratorJob.java</exclude>
+ </excludes>
+
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.codehaus.mojo</groupId>
+ <artifactId>build-helper-maven-plugin</artifactId>
+ <version>1.5</version>
+ <executions>
+ <execution>
+ <id>add-test-source</id>
+ <phase>generate-test-sources</phase>
+ <goals>
+ <goal>add-test-source</goal>
+ </goals>
+ <configuration>
+ <sources>
+ <source>src/integration/java</source>
+ </sources>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-failsafe-plugin</artifactId>
+ <version>2.12</version>
+ <configuration>
+ <excludes>
+ <exclude>**/*BigPetStoreMahoutIT.java</exclude>
+ <exclude>**/*BigPetStorePigIT.java</exclude>
+ <exclude>**/*BigPetStoreCrunchIT.java</exclude>
+ </excludes>
+ </configuration>
+ <executions>
+ <!-- States that both integration-test and verify goals of the Failsafe
+ Maven plugin are executed. -->
+ <execution>
+ <id>integration-tests</id>
+ <goals>
+ <goal>integration-test</goal>
+ <goal>verify</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
+
+
+ <dependencies>
+ <!-- misc -->
+ <dependency>
+ <groupId>org.apache.commons</groupId>
+ <artifactId>commons-lang3</artifactId>
+ <version>3.1</version>
+ </dependency>
+
+ <dependency>
+ <groupId>com.google.guava</groupId>
+ <artifactId>guava</artifactId>
+ <version>${guava.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.derby</groupId>
+ <artifactId>derby</artifactId>
+ <version>${derby.version}</version>
+ </dependency>
+
+
+ <dependency>
+ <groupId>org.datanucleus</groupId>
+ <artifactId>datanucleus-core</artifactId>
+ <version>${datanucleus.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.datanucleus</groupId>
+ <artifactId>datanucleus-rdbms</artifactId>
+ <version>${datanucleus.jpa.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.datanucleus</groupId>
+ <artifactId>datanucleus-api-jdo</artifactId>
+ <version>${datanucleus.jpa.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.datanucleus</groupId>
+ <artifactId>datanucleus-accessplatform-jdo-rdbms</artifactId>
+ <version>${datanucleus.jpa.version}</version>
+ <type>pom</type>
+ </dependency>
+
+ <!-- hadoop -->
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-common</artifactId>
+ <version>${hadoop.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-mapreduce-client-app</artifactId>
+ <version>2.3.0</version>
+ </dependency>
+ <!-- hive -->
+ <dependency>
+ <groupId>org.apache.hive</groupId>
+ <artifactId>hive-common</artifactId>
+ <version>${hive.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hive</groupId>
+ <artifactId>hive-serde</artifactId>
+ <version>${hive.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.hive</groupId>
+ <artifactId>hive-jdbc</artifactId>
+ <version>${hive.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hive</groupId>
+ <artifactId>hive-contrib</artifactId>
+ <version>${hive.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>com.jolbox</groupId>
+ <artifactId>bonecp</artifactId>
+ <version>${bonecp.version}</version>
+ </dependency>
+
+ <!-- logging -->
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-api</artifactId>
+ <version>${slf4j.version}</version>
+ </dependency>
+
+ <!-- SLF4J binding provided at runtime -->
+ <dependency>
+ <groupId>log4j</groupId>
+ <artifactId>log4j</artifactId>
+ <version>1.2.12</version>
+ <scope>provided</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-log4j12</artifactId>
+ <version>${slf4j.version}</version>
+ <scope>provided</scope>
+ </dependency>
+
+ <!-- Unit test artifacts -->
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <version>4.11</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.hamcrest</groupId>
+ <artifactId>hamcrest-all</artifactId>
+ <version>1.3</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.mrunit</groupId>
+ <artifactId>mrunit</artifactId>
+ <version>1.0.0</version>
+ <classifier>hadoop2</classifier>
+ </dependency>
+
+ </dependencies>
+ </profile>
+ <profile>
+ <id>crunch</id>
+ <activation>
+ <activeByDefault>false</activeByDefault>
+ </activation>
+ <properties>
+ <skip.unit.tests>true</skip.unit.tests>
+ </properties>
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-surefire-plugin</artifactId>
+ <configuration>
+ <excludes>
+ <exclude>**/*TestPig.java</exclude>
+ <exclude>**/*TestHiveEmbedded.java</exclude>
+ <exclude>**/*TestCrunch.java</exclude>
+ <exclude>**/*TestPetStoreTransactionGeneratorJob.java</exclude>
+ </excludes>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.codehaus.mojo</groupId>
+ <artifactId>build-helper-maven-plugin</artifactId>
+ <version>1.5</version>
+ <executions>
+ <execution>
+ <id>add-test-source</id>
+ <phase>generate-test-sources</phase>
+ <goals>
+ <goal>add-test-source</goal>
+ </goals>
+ <configuration>
+ <sources>
+ <source>src/integration/java</source>
+ </sources>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-failsafe-plugin</artifactId>
+ <version>2.12</version>
+ <configuration>
+ <excludes>
+ <exclude>**/*BigPetStorePigIT.java</exclude>
+ <exclude>**/*BigPetStoreHiveIT.java</exclude>
+ <exclude>**/*BigPetStoreMahoutIT.java</exclude>
+ </excludes>
+ </configuration>
+ <executions>
+ <!-- States that both integration-test and verify goals of the Failsafe
+ Maven plugin are executed. -->
+ <execution>
+ <id>integration-tests</id>
+ <goals>
+ <goal>integration-test</goal>
+ <goal>verify</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
+
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.crunch</groupId>
+ <artifactId>crunch-core</artifactId>
+ <version>0.9.0-hadoop2</version>
+ </dependency>
+ <dependency>
+ <groupId>com.google.protobuf</groupId>
+ <artifactId>protobuf-java</artifactId>
+ <version>2.5.0</version>
+ </dependency>
+ </dependencies>
+ </profile>
+
+ <profile>
+ <id>mahout</id>
+ <activation>
+ <activeByDefault>false</activeByDefault>
+ </activation>
+ <properties>
+ <skip.unit.tests>true</skip.unit.tests>
+ </properties>
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-surefire-plugin</artifactId>
+ <configuration>
+ <excludes>
+ <exclude>**/*TestPig.java</exclude>
+ <exclude>**/*TestHiveEmbedded.java</exclude>
+ <exclude>**/*TestCrunch.java</exclude>
+ <exclude>**/*TestPetStoreTransactionGeneratorJob.java</exclude>
+ </excludes>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.codehaus.mojo</groupId>
+ <artifactId>build-helper-maven-plugin</artifactId>
+ <version>1.5</version>
+ <executions>
+ <execution>
+ <id>add-test-source</id>
+ <phase>generate-test-sources</phase>
+ <goals>
+ <goal>add-test-source</goal>
+ </goals>
+ <configuration>
+ <sources>
+ <source>src/integration/java</source>
+ </sources>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-failsafe-plugin</artifactId>
+ <version>2.12</version>
+ <configuration>
+ <excludes>
+ <exclude>**/*BigPetStorePigIT.java</exclude>
+ <exclude>**/*BigPetStoreCrunchIT.java</exclude>
+ <exclude>**/*BigPetStoreHiveIT.java</exclude>
+ </excludes>
+ </configuration>
+ <executions>
+ <!-- States that both integration-test and verify goals of the Failsafe
+ Maven plugin are executed. -->
+ <execution>
+ <id>integration-tests</id>
+ <goals>
+ <goal>integration-test</goal>
+ <goal>verify</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
+
+ <dependencies>
+
+ <dependency>
+ <groupId>commons-logging</groupId>
+ <artifactId>commons-logging</artifactId>
+ <version>1.1.3</version>
+ </dependency>
+
+ <!--
+ For local testing, this profile uses a custom "bigpetstore"
+ build of mahout-core that was compiled against Hadoop 2.2.0.
+ The standard Apache mahout-core artifact can be substituted
+ here instead, but it has not been verified to work with this
+ Hadoop version.
+ -->
+ <dependency>
+ <groupId>bigpetstore</groupId>
+ <artifactId>mahout-core</artifactId>
+ <version>1.0-SNAPSHOT</version>
+ <exclusions>
+ </exclusions>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.mahout</groupId>
+ <artifactId>mahout-math</artifactId>
+ <version>0.9</version>
+ </dependency>
+
+
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-api</artifactId>
+ <version>LATEST</version>
+
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.commons</groupId>
+ <artifactId>commons-lang3</artifactId>
+ <version>LATEST</version>
+ </dependency>
+
+ <dependency>
+ <groupId>com.thoughtworks.xstream</groupId>
+ <artifactId>xstream</artifactId>
+ <version>LATEST</version>
+
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-core</artifactId>
+ <version>LATEST</version>
+
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-analyzers-common</artifactId>
+ <version>LATEST</version>
+
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.mahout.commons</groupId>
+ <artifactId>commons-cli</artifactId>
+ <version>LATEST</version>
+
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.commons</groupId>
+ <artifactId>commons-math3</artifactId>
+ <version>LATEST</version>
+ </dependency>
+
+
+ <dependency>
+ <groupId>org.apache.solr</groupId>
+ <artifactId>solr-commons-csv</artifactId>
+ <version>3.5.0</version>
+ </dependency>
+
+ </dependencies>
+
+
+
+ </profile>
+
+ </profiles>
+
+</project>
http://git-wip-us.apache.org/repos/asf/bigtop/blob/d3da8ceb/bigtop-bigpetstore/setuphive.sh
----------------------------------------------------------------------
diff --git a/bigtop-bigpetstore/setuphive.sh b/bigtop-bigpetstore/setuphive.sh
new file mode 100755
index 0000000..8dff6dd
--- /dev/null
+++ b/bigtop-bigpetstore/setuphive.sh
@@ -0,0 +1,22 @@
+### THIS SCRIPT SETS UP HIVE AND HADOOP TARBALLS FOR YOU ###
+HIVE_TARBALL="http://archive.apache.org/dist/hive/hive-0.12.0/hive-0.12.0.tar.gz"
+HADOOP_TARBALL="https://archive.apache.org/dist/hadoop/core/hadoop-1.2.1/hadoop-1.2.1.tar.gz"
+wget $HIVE_TARBALL
+wget $HADOOP_TARBALL
+
+
+# REMEBER SO WE CAN CD BACK AT END
+mydir=`pwd`
+
+## HADOOP SETUP
+mkdir -p /opt/bigpetstore
+cd /opt/bigpetstore
+tar -xvf hadoop-1.2.1.tar.gz
+export HADOOP_HOME=`pwd`/hadoop-1.2.1
+
+## HIVE SETUP
+tar -xvf hive-0.12.0.tar.gz
+cp /opt/hive-0.12.0/lib/hive*.jar $HADOOP_HOME/lib
+
+## CD BACK TO ORIGINAL DIR
+cd $mydir
http://git-wip-us.apache.org/repos/asf/bigtop/blob/d3da8ceb/bigtop-bigpetstore/src/integration/java/org/apache/bigtop/bigpetstore/BigPetStoreHiveIT.java
----------------------------------------------------------------------
diff --git a/bigtop-bigpetstore/src/integration/java/org/apache/bigtop/bigpetstore/BigPetStoreHiveIT.java b/bigtop-bigpetstore/src/integration/java/org/apache/bigtop/bigpetstore/BigPetStoreHiveIT.java
new file mode 100644
index 0000000..c3646a4
--- /dev/null
+++ b/bigtop-bigpetstore/src/integration/java/org/apache/bigtop/bigpetstore/BigPetStoreHiveIT.java
@@ -0,0 +1,108 @@
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.bigtop.bigpetstore;
+
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.InputStreamReader;
+import java.nio.charset.Charset;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.bigtop.bigpetstore.ITUtils;
+import org.apache.bigtop.bigpetstore.etl.HiveViewCreator;
+import org.apache.bigtop.bigpetstore.etl.PigCSVCleaner;
+import org.apache.bigtop.bigpetstore.generator.BPSGenerator;
+import org.apache.bigtop.bigpetstore.util.BigPetStoreConstants;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.pig.ExecType;
+import org.json.JSONException;
+import org.json.JSONObject;
+
+import com.google.common.base.Function;
+import com.google.common.io.Files;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Integration test for the Hive step of the BigPetStore pipeline.
+ *
+ * Run this after the {@link BigPetStorePigIT} test: the test below hands
+ * BPS_TEST_PIG_CLEANED to HiveViewCreator as its input path.
+ */
+public class BigPetStoreHiveIT extends ITUtils {
+    final static Logger log = LoggerFactory.getLogger(BigPetStoreHiveIT.class);
+
+    /**
+     * Runs the shared {@link ITUtils#setup()} and then removes
+     * BPS_TEST_MAHOUT_IN if it is left over from a previous run.
+     */
+    @Before
+    public void setupTest() throws Throwable {
+        super.setup();
+        try {
+            // Explicit recursive delete; the one-argument overload is deprecated.
+            FileSystem.get(new Configuration()).delete(BPS_TEST_MAHOUT_IN, true);
+        } catch (Exception e) {
+            System.out.println("didnt need to delete hive output.");
+            // not necessarily an error
+        }
+    }
+
+    /**
+     * Runs HiveViewCreator from BPS_TEST_PIG_CLEANED to BPS_TEST_MAHOUT_IN and
+     * checks that the first three comma-separated columns of every output
+     * line parse as longs.
+     */
+    @Test
+    public void testPetStorePipeline() throws Exception {
+        new HiveViewCreator().run(
+                new String[] {
+                        BPS_TEST_PIG_CLEANED.toString(),
+                        BPS_TEST_MAHOUT_IN.toString() });
+
+        assertOutput(BPS_TEST_MAHOUT_IN, new Function<String, Boolean>() {
+            public Boolean apply(String x) {
+                System.out.println("Verifying " + x);
+                String[] cols = x.split(",");
+                // A missing or non-numeric column throws here, failing the test.
+                Long.parseLong(cols[0].trim());
+                Long.parseLong(cols[1].trim());
+                Long.parseLong(cols[2].trim());
+                return true;
+            }
+        });
+    }
+
+    /**
+     * Lists the files under {@code base}, then reads {@code base/000000_0}
+     * from the local file system line by line and asserts that the validator
+     * accepts each line.
+     *
+     * @param base      directory containing the output file
+     * @param validator applied to every line; must return true for the line to pass
+     * @throws Exception if the file cannot be listed, opened or read
+     */
+    public static void assertOutput(Path base,
+            Function<String, Boolean> validator) throws Exception {
+        FileSystem fs = FileSystem.getLocal(new Configuration());
+
+        // print out all the files.
+        for (FileStatus stat : fs.listStatus(base)) {
+            System.out.println(stat.getPath() + " " + stat.getLen());
+        }
+
+        Path p = new Path(base, "000000_0");
+        BufferedReader r = new BufferedReader(new InputStreamReader(fs.open(p)));
+        try {
+            // Read until end of stream; readLine() returns null at EOF.
+            String line;
+            while ((line = r.readLine()) != null) {
+                log.info("line:" + line);
+                System.out.println("line:" + line);
+                Assert.assertTrue("validationg line : " + line,
+                        validator.apply(line));
+            }
+        } finally {
+            // Always release the underlying file handle, even if validation fails.
+            r.close();
+        }
+    }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/bigtop/blob/d3da8ceb/bigtop-bigpetstore/src/integration/java/org/apache/bigtop/bigpetstore/BigPetStoreMahoutIT.java
----------------------------------------------------------------------
diff --git a/bigtop-bigpetstore/src/integration/java/org/apache/bigtop/bigpetstore/BigPetStoreMahoutIT.java b/bigtop-bigpetstore/src/integration/java/org/apache/bigtop/bigpetstore/BigPetStoreMahoutIT.java
new file mode 100644
index 0000000..5e6f69c
--- /dev/null
+++ b/bigtop-bigpetstore/src/integration/java/org/apache/bigtop/bigpetstore/BigPetStoreMahoutIT.java
@@ -0,0 +1,88 @@
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.bigtop.bigpetstore;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+
+import org.apache.bigtop.bigpetstore.clustering.BPSRecommnder;
+import org.apache.bigtop.bigpetstore.etl.HiveViewCreator;
+import org.apache.bigtop.bigpetstore.util.BigPetStoreConstants;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.base.Function;
+
+public class BigPetStoreMahoutIT extends ITUtils{
+
+ final static Logger log = LoggerFactory.getLogger(BigPetStoreHiveIT.class);
+
+ @Before
+ public void setupTest() throws Throwable {
+ super.setup();
+ try {
+ FileSystem.get(new Configuration()).delete(super.BPS_TEST_MAHOUT_OUT);
+ }
+ catch (Exception e) {
+ System.out.println("didnt need to delete mahout output.");
+ }
+ }
+
+ @Test
+ public void testPetStorePipeline() throws Exception {
+ new BPSRecommnder().run(
+ new String[]{
+ BPS_TEST_MAHOUT_IN.toString(),
+ BPS_TEST_MAHOUT_OUT.toString()});
+
+ assertOutput(BPS_TEST_MAHOUT_OUT, new Function<String, Boolean>() {
+ public Boolean apply(String x) {
+ System.out.println("Verifying "+x);
+ return true;
+ }
+ });
+ }
+
+ public static void assertOutput(Path base,
+ Function<String, Boolean> validator) throws Exception {
+ FileSystem fs = FileSystem.getLocal(new Configuration());
+
+ FileStatus[] files = fs.listStatus(base);
+ // print out all the files.
+ for (FileStatus stat : files) {
+ System.out.println(stat.getPath() + " " + stat.getLen());
+ }
+
+ Path p = new Path(base, "part-r-00000");
+ BufferedReader r = new BufferedReader(new InputStreamReader(fs.open(p)));
+
+ // line:{"product":"big chew toy","count":3}
+ while (r.ready()) {
+ String line = r.readLine();
+ log.info("line:" + line);
+ System.out.println("line:" + line);
+ Assert.assertTrue("validationg line : " + line,
+ validator.apply(line));
+ }
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/bigtop/blob/d3da8ceb/bigtop-bigpetstore/src/integration/java/org/apache/bigtop/bigpetstore/BigPetStorePigIT.java
----------------------------------------------------------------------
diff --git a/bigtop-bigpetstore/src/integration/java/org/apache/bigtop/bigpetstore/BigPetStorePigIT.java b/bigtop-bigpetstore/src/integration/java/org/apache/bigtop/bigpetstore/BigPetStorePigIT.java
new file mode 100644
index 0000000..db766de
--- /dev/null
+++ b/bigtop-bigpetstore/src/integration/java/org/apache/bigtop/bigpetstore/BigPetStorePigIT.java
@@ -0,0 +1,165 @@
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.bigtop.bigpetstore;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.InputStreamReader;
+import java.util.Map;
+import java.util.Map.Entry;
+
+import junit.framework.Assert;
+
+import org.apache.bigtop.bigpetstore.etl.PigCSVCleaner;
+import org.apache.bigtop.bigpetstore.util.BigPetStoreConstants;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.pig.ExecType;
+import org.junit.Before;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.base.Function;
+import com.google.common.collect.ImmutableMap;
+import com.google.common.io.Files;
+
+/**
+* This is the main integration test for pig.
+* Like all BPS integration tests, it is designed
+* to simulate exactly what will happen on the
+* actual cluster, except with a small amount of records.
+*
+* In addition to cleaning the dataset, it also runs the BPS_analytics.pig
+* script which BigPetStore ships with.
+*/
+public class BigPetStorePigIT extends ITUtils{
+
+ final static Logger log = LoggerFactory.getLogger(BigPetStorePigIT.class);
+
+ /**
+ * An extra unsupported code path that we have so
+ * people can do ad hoc analytics on pig data after it is
+ * cleaned.
+ */
+ public static final Path BPS_TEST_PIG_COUNT_PRODUCTS = fs.makeQualified(
+ new Path("bps_integration_",
+ BigPetStoreConstants.OUTPUTS.pig_ad_hoc_script.name()+"0"));
+
+ static final File PIG_SCRIPT = new File("BPS_analytics.pig");
+
+ static {
+ if(PIG_SCRIPT.exists()) {
+
+ }
+ else
+ throw new RuntimeException("Couldnt find pig script at " + PIG_SCRIPT.getAbsolutePath());
+ }
+
+ @Before
+ public void setupTest() throws Throwable {
+ super.setup();
+ try{
+ FileSystem.get(new Configuration()).delete(BPS_TEST_PIG_CLEANED);
+ FileSystem.get(new Configuration()).delete(BPS_TEST_PIG_COUNT_PRODUCTS);
+ }
+ catch(Exception e){
+ System.out.println("didnt need to delete pig output.");
+ //not necessarily an error
+ }
+ }
+
+ static Map<Path,Function<String,Boolean>> TESTS = ImmutableMap.of(
+ /**
+ * Test of the main output
+ */
+ BPS_TEST_PIG_CLEANED,
+ new Function<String, Boolean>(){
+ public Boolean apply(String x){
+ //System.out.println("Verified...");
+ return true;
+ }
+ },
+ //Example of how to count products
+ //after doing basic pig data cleanup
+ BPS_TEST_PIG_COUNT_PRODUCTS,
+ new Function<String, Boolean>(){
+ //Jeff'
+ public Boolean apply(String x){
+ return true;
+ }
+ });
+
+ /**
+ * The "core" task reformats data to TSV. lets test that first.
+ */
+ @Test
+ public void testPetStoreCorePipeline() throws Exception {
+ runPig(
+ BPS_TEST_GENERATED,
+ BPS_TEST_PIG_CLEANED,
+ PIG_SCRIPT);
+ for(Entry<Path,Function<String,Boolean>> e : TESTS.entrySet()) {
+ assertOutput(e.getKey(),e.getValue());
+ }
+ }
+
+ public static void assertOutput(Path base,Function<String, Boolean> validator) throws Exception{
+ FileSystem fs = FileSystem.getLocal(new Configuration());
+
+ FileStatus[] files=fs.listStatus(base);
+ //print out all the files.
+ for(FileStatus stat : files){
+ System.out.println(stat.getPath() +" " + stat.getLen());
+ }
+
+ /**
+ * Support map OR reduce outputs
+ */
+ Path partm = new Path(base,"part-m-00000");
+ Path partr = new Path(base,"part-r-00000");
+ Path p = fs.exists(partm)?partm:partr;
+
+ /**
+ * Now we read through the file and validate
+ * its contents.
+ */
+ BufferedReader r =
+ new BufferedReader(
+ new InputStreamReader(fs.open(p)));
+
+ //line:{"product":"big chew toy","count":3}
+ while(r.ready()){
+ String line = r.readLine();
+ log.info("line:"+line);
+ //System.out.println("line:"+line);
+ Assert.assertTrue("validationg line : " + line, validator.apply(line));
+ }
+ }
+
+ Map pigResult;
+
+ private void runPig(Path input, Path output, File pigscript) throws Exception {
+
+ new PigCSVCleaner(
+ input,
+ output,
+ ExecType.LOCAL,
+ pigscript);
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/bigtop/blob/d3da8ceb/bigtop-bigpetstore/src/integration/java/org/apache/bigtop/bigpetstore/ITUtils.java
----------------------------------------------------------------------
diff --git a/bigtop-bigpetstore/src/integration/java/org/apache/bigtop/bigpetstore/ITUtils.java b/bigtop-bigpetstore/src/integration/java/org/apache/bigtop/bigpetstore/ITUtils.java
new file mode 100644
index 0000000..e93d9ce
--- /dev/null
+++ b/bigtop-bigpetstore/src/integration/java/org/apache/bigtop/bigpetstore/ITUtils.java
@@ -0,0 +1,145 @@
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.bigtop.bigpetstore;
+
+import java.net.InetAddress;
+import java.nio.charset.Charset;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.bigtop.bigpetstore.generator.BPSGenerator;
+import org.apache.bigtop.bigpetstore.util.BigPetStoreConstants;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.Job;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.io.Files;
+
+public class ITUtils {
+
+ static final Logger log = LoggerFactory.getLogger(ITUtils.class);
+
+ static FileSystem fs;
+ static{
+ try{
+ fs=FileSystem.getLocal(new Configuration());
+ }
+ catch(Throwable e)
+ {
+ String cpath = (String) System.getProperties().get("java.class.path");
+ String msg="";
+ for(String cp : cpath.split(":")) {
+ if(cp.contains("hadoop")) {
+ msg+=cp.replaceAll("hadoop", "**HADOOP**")+"\n";
+ }
+ }
+ throw new RuntimeException("Major error: Probably issue. " +
+ "Check hadoop version? "+ e.getMessage() +" .... check these classpath elements:"
+ +msg);
+ }
+ }
+ public static final Path BPS_TEST_GENERATED = fs.makeQualified(
+ new Path("bps_integration_",BigPetStoreConstants.OUTPUTS.generated.name())) ;
+
+ public static final Path BPS_TEST_PIG_CLEANED = fs.makeQualified(
+ new Path("bps_integration_",BigPetStoreConstants.OUTPUTS.cleaned.name()));
+
+ public static final Path BPS_TEST_MAHOUT_IN = fs.makeQualified(
+ new Path("bps_integration_",BigPetStoreConstants.OUTPUTS.MAHOUT_CF_IN.name()));
+
+ public static final Path BPS_TEST_MAHOUT_OUT = fs.makeQualified(
+ new Path("bps_integration_",BigPetStoreConstants.OUTPUTS.MAHOUT_CF_OUT.name()));
+
+ public static void main(String[] args){
+
+ }
+ //public static final Path CRUNCH_OUT = new Path("bps_integration_",BigPetStoreConstants.OUTPUT_3).makeQualified(fs);
+
+ /**
+ * Some simple checks to make sure that unit tests in local FS.
+ * these arent designed to be run against a distribtued system.
+ */
+ public static void checkConf(Configuration conf) throws Exception {
+ if(conf.get("mapreduce.jobtracker.address")==null) {
+ log.warn("Missing mapreduce.jobtracker.address???????!!!! " +
+ "This can be the case in hive tests which use special " +
+ "configurations, but we should fix it sometime.");
+ return;
+ }
+ if(! conf.get("mapreduce.jobtracker.address").equals("local")) {
+ throw new RuntimeException("ERROR: bad conf : " + "mapreduce.jobtracker.address");
+ }
+ if(! conf.get("fs.AbstractFileSystem.file.impl").contains("Local")) {
+ throw new RuntimeException("ERROR: bad conf : " + "mapreduce.jobtracker.address");
+ }
+ try {
+ InetAddress addr = java.net.InetAddress.getLocalHost();
+ System.out.println("Localhost = hn=" + addr.getHostName() +" / ha="+addr.getHostAddress());
+ }
+ catch (Throwable e) {
+ throw new RuntimeException(
+ " ERROR : Hadoop wont work at all on this machine yet"+
+ "...I can't get / resolve localhost ! Check java version/ " +
+ "/etc/hosts / DNS or other networking related issues on your box" +
+ e.getMessage());
+ }
+ }
+
+
+ /**
+ * Creates a generated input data set in
+ *
+ * test_data_directory/generated.
+ * i.e.
+ * test_data_directory/generated/part-r-00000
+ */
+ public static void setup() throws Throwable{
+ int records = 10;
+ /**
+ * Setup configuration with prop.
+ */
+ Configuration conf = new Configuration();
+
+ //debugging for jeff and others in local fs
+ //that wont build
+ checkConf(conf);
+
+ conf.setInt(BPSGenerator.props.bigpetstore_records.name(), records);
+
+ /**
+ * Only create if doesnt exist already.....
+ */
+ if(FileSystem.getLocal(conf).exists(BPS_TEST_GENERATED)){
+ return;
+ }
+
+ /**
+ * Create the data set.
+ */
+ Job createInput= BPSGenerator.createJob(BPS_TEST_GENERATED, conf);
+ createInput.waitForCompletion(true);
+
+ Path outputfile = new Path(BPS_TEST_GENERATED,"part-r-00000");
+ List<String> lines = Files.readLines(FileSystem.getLocal(conf).pathToFile(outputfile), Charset.defaultCharset());
+ log.info("output : " + FileSystem.getLocal(conf).pathToFile(outputfile));
+ for(String l : lines){
+ System.out.println(l);
+ }
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/bigtop/blob/d3da8ceb/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/clustering/BPSRecommnder.java
----------------------------------------------------------------------
diff --git a/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/clustering/BPSRecommnder.java b/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/clustering/BPSRecommnder.java
new file mode 100644
index 0000000..748578a
--- /dev/null
+++ b/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/clustering/BPSRecommnder.java
@@ -0,0 +1,83 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.bigtop.bigpetstore.clustering;
+
+import org.apache.bigtop.bigpetstore.util.DeveloperTools;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.cf.taste.hadoop.item.RecommenderJob;
+import org.apache.mahout.cf.taste.hadoop.preparation.PreparePreferenceMatrixJob;
+import org.apache.pig.builtin.LOG;
+
+/**
+ * Runs Mahout's collaborative-filtering RecommenderJob as a Hadoop Tool.
+ *
+ * The input set is the
+ *
+ * userid,productid,weight
+ *
+ * rows.
+ *
+ * NOTE(review): this class was originally described as "user based", but the
+ * imported RecommenderJob comes from the ...taste.hadoop.item package, so it
+ * is presumably item based - confirm.
+ */
+public class BPSRecommnder implements Tool {
+
+    // Stored for the Tool contract; run() below does not read it.
+    Configuration c;
+
+    @Override
+    public void setConf(Configuration conf) {
+        c = conf;
+    }
+
+    @Override
+    public Configuration getConf() {
+        return c;
+    }
+
+    /**
+     * Runs the recommender job.
+     *
+     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
+     * @return the exit code returned by RecommenderJob
+     * @throws Exception if argument validation or the job itself fails
+     */
+    @Override
+    public int run(String[] args) throws Exception {
+        DeveloperTools.validate(args, "input path", "output path");
+
+        System.out.println("Runnning recommender against : " + args[0] + " -> " + args[1]);
+
+        RecommenderJob recommenderJob = new RecommenderJob();
+        /*
+         * Disabled alternative, kept for reference:
+         *
+         * int x = ToolRunner.run(getConf(), new BPSPreparePreferenceMatrixJob(), new String[]{
+         *     "--input", args[0],
+         *     "--output", args[1],
+         *     "--tempDir", "/tmp",
+         * });
+         * System.out.println("RETURN = " + x);
+         */
+
+        int ret = recommenderJob.run(new String[] {
+                "--input", args[0],
+                "--output", args[1],
+                "--usersFile", "/tmp/users.txt",
+                // Timestamped temp dir so repeated runs do not share a directory.
+                "--tempDir", "/tmp/mahout_" + System.currentTimeMillis(),
+                "--similarityClassname", "SIMILARITY_PEARSON_CORRELATION",
+                "--threshold", ".00000000001",
+                "--numRecommendations", "4",
+                //"--encodeLongsAsInts",
+                //Boolean.FALSE.toString(),
+                //"--itemBased", Boolean.FALSE.toString()
+        });
+
+        System.out.println("Exit of recommender: " + ret);
+        return ret;
+    }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/bigtop/blob/d3da8ceb/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/contract/PetStoreStatistics.java
----------------------------------------------------------------------
diff --git a/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/contract/PetStoreStatistics.java b/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/contract/PetStoreStatistics.java
new file mode 100755
index 0000000..ed618a8
--- /dev/null
+++ b/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/contract/PetStoreStatistics.java
@@ -0,0 +1,34 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.bigtop.bigpetstore.contract;
+
+import java.util.Map;
+
+/**
+ * Contract for the web site: every ETL tool creates an object of this type
+ * to expose its summary statistics.
+ */
+public abstract class PetStoreStatistics {
+
+    /**
+     * @return a numeric value for each product key
+     * @throws Exception if the implementation cannot compute the statistic
+     */
+    public abstract Map<String, ? extends Number> numberOfProductsByProduct() throws Exception;
+
+    /**
+     * @return a numeric value for each state key
+     * @throws Exception if the implementation cannot compute the statistic
+     */
+    public abstract Map<String, ? extends Number> numberOfTransactionsByState() throws Exception;
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/bigtop/blob/d3da8ceb/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/etl/CrunchETL.java
----------------------------------------------------------------------
diff --git a/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/etl/CrunchETL.java b/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/etl/CrunchETL.java
new file mode 100755
index 0000000..f6f459c
--- /dev/null
+++ b/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/etl/CrunchETL.java
@@ -0,0 +1,142 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.bigtop.bigpetstore.etl;
+
+import java.util.Map;
+
+import org.apache.bigtop.bigpetstore.contract.PetStoreStatistics;
+import org.apache.crunch.FilterFn;
+import org.apache.crunch.MapFn;
+import org.apache.crunch.PCollection;
+import org.apache.crunch.PTable;
+import org.apache.crunch.Pair;
+import org.apache.crunch.Pipeline;
+import org.apache.crunch.impl.mem.MemPipeline;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.io.From;
+import org.apache.crunch.types.avro.Avros;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+/**
+ * Crunch implementation of the {@link PetStoreStatistics} contract.
+ *
+ * The constructor parses the generated text records into {@link LineItem}s;
+ * the statistics methods then count those items by a key.
+ */
+public class CrunchETL extends PetStoreStatistics {
+
+    /** Keys a line item by its product description. */
+    public static MapFn<LineItem, String> COUNT_BY_PRODUCT = new MapFn<LineItem, String>() {
+        @Override
+        public String map(LineItem lineItem) {
+            return lineItem.getDescription();
+        }
+    };
+
+    /**
+     * Keys a line item by its store code (for example {@code storeCode_AK}),
+     * which is the field that carries the state. This used to key on the
+     * product description, which made the "by state" statistic a duplicate
+     * of the "by product" one.
+     */
+    public static MapFn<LineItem, String> COUNT_BY_STATE = new MapFn<LineItem, String>() {
+        @Override
+        public String map(LineItem lineItem) {
+            return lineItem.getStoreCode();
+        }
+    };
+
+    /**
+     * Parses one comma separated record into a {@link LineItem}. Only the
+     * fields needed by the statistics are populated.
+     */
+    public static MapFn<String, LineItem> ETL = new MapFn<String, LineItem>() {
+        @Override
+        public LineItem map(String input) {
+            String[] fields = input.split(",");
+            LineItem li = new LineItem();
+            // NOTE(review): appName is still filled from fields[1] as before;
+            // fields[0] looks like the real application name - confirm.
+            li.setAppName(fields[1]);
+            // The second comma separated field is the store code, see the
+            // sample records in main().
+            li.setStoreCode(fields[1]);
+            li.setFirstName(fields[3]);
+            // ...
+            li.setDescription(fields[fields.length - 1]);
+            return li;
+        }
+    };
+
+    /** The parsed records that every statistic is computed from. */
+    PCollection<LineItem> lineItems;
+
+    /**
+     * Reads {@code part-r-00000} below {@code input} with the in-memory
+     * pipeline and parses it into line items.
+     *
+     * @param input  directory holding the generated {@code part-r-00000} file
+     * @param output currently unused
+     * @throws Exception if reading the input fails
+     */
+    public CrunchETL(Path input, Path output) throws Exception {
+        Pipeline pipeline = MemPipeline.getInstance();
+        PCollection<String> lines = pipeline.read(From.textFile(new Path(input,
+                "part-r-00000")));
+        System.out.println("crunch : " + lines.getName() + " "
+                + lines.getSize());
+        lineItems = lines.parallelDo(ETL, Avros.reflects(LineItem.class));
+    }
+
+    /**
+     * @return the number of line items per store code
+     */
+    @Override
+    public Map<String, ? extends Number> numberOfTransactionsByState()
+            throws Exception {
+        PTable<String, Long> counts = lineItems.parallelDo(COUNT_BY_STATE,
+                Avros.strings()).count();
+        Map<String, Long> m = counts.materializeToMap();
+
+        System.out.println("Crunch::: " + m);
+        return m;
+    }
+
+    /**
+     * @return the number of line items per product description
+     */
+    @Override
+    public Map<String, ? extends Number> numberOfProductsByProduct()
+            throws Exception {
+        PTable<String, Long> counts = lineItems.parallelDo(COUNT_BY_PRODUCT,
+                Avros.strings()).count();
+        Map<String, Long> m = counts.materializeToMap();
+        //CrunchETL. System.out.println("Crunch::: " + m);
+        return m;
+    }
+
+    /**
+     * Ad-hoc experiment that parses a hard coded generated file with the
+     * MapReduce pipeline and prints every parsed item. Marked as failing by
+     * the original author.
+     */
+    public static void main(String... args) throws Exception {
+        /**
+         * PCollection<String> lines = MemPipeline .collectionOf(
+         * "BigPetStore,storeCode_AK,1 lindsay,franco,Sat Jan 10 00:11:10 EST 1970,10.5,dog-food"
+         * "BigPetStore,storeCode_AZ,1 tom,giles,Sun Dec 28 23:08:45 EST 1969,10.5,dog-food"
+         * "BigPetStore,storeCode_CA,1 brandon,ewing,Mon Dec 08 20:23:57 EST 1969,16.5,organic-dog-food"
+         * "BigPetStore,storeCode_CA,2 angie,coleman,Thu Dec 11 07:00:31 EST 1969,10.5,dog-food"
+         * "BigPetStore,storeCode_CA,3 angie,coleman,Tue Jan 20 06:24:23 EST 1970,7.5,cat-food"
+         * "BigPetStore,storeCode_CO,1 sharon,trevino,Mon Jan 12 07:52:10 EST 1970,30.1,antelope snacks"
+         * "BigPetStore,storeCode_CT,1 kevin,fitzpatrick,Wed Dec 10 05:24:13 EST 1969,10.5,dog-food"
+         * "BigPetStore,storeCode_NY,1 dale,holden,Mon Jan 12 23:02:13 EST 1970,19.75,fish-food"
+         * "BigPetStore,storeCode_NY,2 dale,holden,Tue Dec 30 12:29:52 EST 1969,10.5,dog-food"
+         * "BigPetStore,storeCode_OK,1 donnie,tucker,Sun Jan 18 04:50:26 EST 1970,7.5,cat-food"
+         * );
+         **/
+        // FAILS
+        Pipeline pipeline = new MRPipeline(CrunchETL.class);
+
+        PCollection<String> lines = pipeline.read(From.textFile(new Path(
+                "/tmp/BigPetStore1388719888255/generated/part-r-00000")));
+
+        PCollection<LineItem> lineItems = lines.parallelDo(
+                new MapFn<String, LineItem>() {
+                    @Override
+                    public LineItem map(String input) {
+
+                        System.out.println("proc1 " + input);
+                        String[] fields = input.split(",");
+                        LineItem li = new LineItem();
+                        li.setAppName("" + fields[1]);
+                        li.setFirstName("" + fields[3]);
+                        li.setDescription("" + fields[fields.length - 1]);
+                        return li;
+                    }
+                }, Avros.reflects(LineItem.class));
+
+        for (LineItem i : lineItems.materialize()) {
+            System.out.println(i);
+        }
+    }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/bigtop/blob/d3da8ceb/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/etl/HiveViewCreator.java
----------------------------------------------------------------------
diff --git a/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/etl/HiveViewCreator.java b/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/etl/HiveViewCreator.java
new file mode 100755
index 0000000..4fabb6f
--- /dev/null
+++ b/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/etl/HiveViewCreator.java
@@ -0,0 +1,157 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.bigtop.bigpetstore.etl;
+
+import java.sql.Connection;
+import java.sql.DriverManager;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.sql.Statement;
+
+import org.apache.bigtop.bigpetstore.util.BigPetStoreConstants;
+import org.apache.bigtop.bigpetstore.util.NumericalIdUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.parse.HiveParser_IdentifiersParser.booleanValue_return;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.util.Tool;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ *
+ * Hive View creator is designed to read from Pig's cleaned output.
+ * The basic strategy is:
+ *
+ * 1) store pig output as a hive table
+ * 2) use "select .. as" to select a subset
+ *
+ * Note on running locally:
+ *
+ * 1) Local mode requires a hive and hadoop tarball, with HIVE_HOME and
+ * HADOOP_HOME pointing to it. 2) In HADOOP_HOME, you will need to cp the
+ * HIVE_HOME/lib/hive-serde*jar file into HADOOP_HOME/lib.
+ *
+ * Then, the below queries will run.
+ *
+ * The reason for this is that the hive SerDe stuff is used in the MapReduce
+ * phase of things, so those utils need to be available to hadoop itself. That
+ * is because the regex input/output is processed by the mappers.
+ *
+ */
+public class HiveViewCreator implements Tool {
+
+    public static final String HIVE_JDBC_DRIVER = "org.apache.hive.jdbc.HiveDriver";
+    public static final String HIVE_JDBC_EMBEDDED_CONNECTION = "jdbc:hive2://";
+
+    final static Logger log = LoggerFactory.getLogger(HiveViewCreator.class);
+
+    static {
+        // Fail fast when the hive exec driver is not on the classpath.
+        try {
+            Class.forName("org.apache.hadoop.hive.ql.exec.mr.ExecDriver");
+            System.out.println("found exec driver !!!!!!!!!!!!!!!!");
+        } catch (Throwable t) {
+            throw new RuntimeException(t);
+        }
+    }
+
+    Configuration conf;
+
+    @Override
+    public void setConf(Configuration conf) {
+        this.conf = conf;
+    }
+
+    @Override
+    public Configuration getConf() {
+        return conf;
+    }
+
+    /**
+     * Input args:
+     * Cleaned data files from pig (tsv)
+     * Output table (desired path to mahout input data set)
+     *
+     * The arguments are checked before any statement is executed, and the
+     * JDBC statement and connection are always closed before returning.
+     *
+     * @return 0 when all statements were executed
+     * @throws IllegalArgumentException if fewer than two arguments are given
+     * @throws Exception if the driver cannot be loaded or a statement fails
+     */
+    @Override
+    public int run(String[] args) throws Exception {
+        if (args == null || args.length < 2) {
+            throw new IllegalArgumentException(
+                    "Expected 2 arguments: <cleaned pig output path> <output table>");
+        }
+
+        Connection con = null;
+        Statement stmt = null;
+        try {
+            con = openConnection();
+            stmt = con.createStatement();
+            createMahoutInput(stmt, args);
+            return 0;
+        } finally {
+            closeQuietly(stmt, con);
+        }
+    }
+
+    /**
+     * Exposes the cleaned pig output as an external table, derives the
+     * mahout input table from it and writes that table to a directory next
+     * to the input path.
+     */
+    private void createMahoutInput(Statement stmt, String[] args) throws SQLException {
+        stmt.execute("DROP TABLE IF EXISTS " + BigPetStoreConstants.OUTPUTS.MAHOUT_CF_IN.name());
+        System.out.println("input data " + args[0]);
+        System.out.println("output table " + args[1]);
+
+        Path inTablePath = new Path(args[0]);
+        // time-stamped so that repeated runs do not collide on the table name
+        String inTableName = "cleaned" + System.currentTimeMillis();
+        String outTableName = BigPetStoreConstants.OUTPUTS.MAHOUT_CF_IN.name();
+
+        Path outTablePath = new Path(inTablePath.getParent(), outTableName);
+
+        final String create = "CREATE EXTERNAL TABLE "+inTableName+" ("
+                + " dump STRING,"
+                + " state STRING,"
+                + " trans_id STRING,"
+                + " lname STRING,"
+                + " fname STRING,"
+                + " date STRING,"
+                + " price STRING,"
+                + " product STRING"
+                + ") ROW FORMAT "
+                + "DELIMITED FIELDS TERMINATED BY '\t' "
+                + "LINES TERMINATED BY '\n' "
+                + "STORED AS TEXTFILE "
+                + "LOCATION '"+inTablePath+"'";
+        boolean res = stmt.execute(create);
+        System.out.println("Execute return code : " +res);
+        //will change once we add hashes into pig ETL clean
+        String create2 =
+                "create table "+outTableName+" as "+
+                "select hash(concat(state,fname,lname)),',',hash(product),',',1 "
+                + "from "+inTableName;
+
+        System.out.println("CREATE = " + create2 );
+        System.out.println("OUT PATH = " + outTablePath);
+        stmt.execute(create2);
+
+        String finalOutput = String.format(
+                "INSERT OVERWRITE DIRECTORY '%s' SELECT * FROM %s",outTablePath, outTableName) ;
+
+        stmt.execute(finalOutput);
+        System.out.println("FINAL OUTPUT STORED : " + outTablePath);
+    }
+
+    /** Loads the hive JDBC driver and opens an embedded connection. */
+    private Connection openConnection() throws ClassNotFoundException,
+            SQLException {
+        Class.forName(HIVE_JDBC_DRIVER);
+        Connection con = DriverManager.getConnection(
+                HIVE_JDBC_EMBEDDED_CONNECTION, "", "");
+        System.out.println("hive con = " + con.getClass().getName());
+        return con;
+    }
+
+    /**
+     * Closes the statement and then the connection. Failures are logged
+     * instead of thrown so that they cannot hide an exception raised by the
+     * actual work.
+     */
+    private static void closeQuietly(Statement stmt, Connection con) {
+        if (stmt != null) {
+            try {
+                stmt.close();
+            } catch (SQLException e) {
+                log.warn("Could not close hive statement", e);
+            }
+        }
+        if (con != null) {
+            try {
+                con.close();
+            } catch (SQLException e) {
+                log.warn("Could not close hive connection", e);
+            }
+        }
+    }
+
+    public static void main(String[] args) throws Exception {
+        new HiveViewCreator()
+            .run(args);
+    }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/bigtop/blob/d3da8ceb/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/etl/LineItem.java
----------------------------------------------------------------------
diff --git a/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/etl/LineItem.java b/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/etl/LineItem.java
new file mode 100755
index 0000000..87e5d0d
--- /dev/null
+++ b/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/etl/LineItem.java
@@ -0,0 +1,112 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.bigtop.bigpetstore.etl;
+
+import java.io.Serializable;
+
+/**
+ * One parsed transaction record of the BigPetStore data set.
+ *
+ * Instances may be only partially populated: the no-arg constructor leaves
+ * every field null and callers set just the fields they need.
+ */
+public class LineItem implements Serializable{
+
+    String appName;
+    String storeCode;
+    Integer lineId;
+    String firstName;
+    String lastName;
+    String timestamp;
+    Double price;
+    String description;
+
+    /** Creates an empty item; every field is null until set. */
+    public LineItem(){
+        super();
+    }
+
+    /** Creates a fully populated item. */
+    public LineItem(String appName, String storeCode, Integer lineId, String firstName, String lastName, String timestamp, Double price, String description){
+        super();
+        this.appName=appName;
+        this.storeCode=storeCode;
+        this.lineId=lineId;
+        this.firstName=firstName;
+        this.lastName=lastName;
+        this.timestamp=timestamp;
+        this.price=price;
+        this.description=description;
+    }
+
+    public String getAppName(){
+        return appName;
+    }
+
+    public void setAppName(String appName){
+        this.appName=appName;
+    }
+
+    public String getStoreCode(){
+        return storeCode;
+    }
+
+    public void setStoreCode(String storeCode){
+        this.storeCode=storeCode;
+    }
+
+    /**
+     * @return the line id, or 0 when it was never set. The field is a
+     *         nullable Integer, so unboxing it directly would throw a
+     *         NullPointerException on a partially populated item.
+     */
+    public int getLineId(){
+        return lineId == null ? 0 : lineId.intValue();
+    }
+
+    public void setLineId(int lineId){
+        this.lineId=lineId;
+    }
+
+    public String getFirstName(){
+        return firstName;
+    }
+
+    public void setFirstName(String firstName){
+        this.firstName=firstName;
+    }
+
+    public String getLastName(){
+        return lastName;
+    }
+
+    public void setLastName(String lastName){
+        this.lastName=lastName;
+    }
+
+    public String getTimestamp(){
+        return timestamp;
+    }
+
+    public void setTimestamp(String timestamp){
+        this.timestamp=timestamp;
+    }
+
+    /**
+     * @return the price, or 0.0 when it was never set. The field is a
+     *         nullable Double, so unboxing it directly would throw a
+     *         NullPointerException on a partially populated item.
+     */
+    public double getPrice(){
+        return price == null ? 0.0 : price.doubleValue();
+    }
+
+    public void setPrice(double price){
+        this.price=price;
+    }
+
+    public String getDescription(){
+        return description;
+    }
+
+    public void setDescription(String description){
+        this.description=description;
+    }
+
+    /** Lists every field so that printed items are readable. */
+    @Override
+    public String toString(){
+        return "LineItem[appName=" + appName
+            + ", storeCode=" + storeCode
+            + ", lineId=" + lineId
+            + ", firstName=" + firstName
+            + ", lastName=" + lastName
+            + ", timestamp=" + timestamp
+            + ", price=" + price
+            + ", description=" + description + "]";
+    }
+
+    // other constructors, parsers, etc.
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/bigtop/blob/d3da8ceb/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/etl/PigCSVCleaner.java
----------------------------------------------------------------------
diff --git a/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/etl/PigCSVCleaner.java b/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/etl/PigCSVCleaner.java
new file mode 100644
index 0000000..01ddd6e
--- /dev/null
+++ b/bigtop-bigpetstore/src/main/java/org/apache/bigtop/bigpetstore/etl/PigCSVCleaner.java
@@ -0,0 +1,171 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.bigtop.bigpetstore.etl;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.bigtop.bigpetstore.util.BigPetStoreConstants;
+import org.apache.bigtop.bigpetstore.util.DeveloperTools;
+import org.apache.bigtop.bigpetstore.util.NumericalIdUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.pig.ExecType;
+import org.apache.pig.PigServer;
+
+/**
+ * This class operates by ETL'ing the dataset into pig.
+ * The pigServer is persisted through the life of the class, so that the
+ * intermediate data sets created in the constructor can be reused.
+ */
+public class PigCSVCleaner {
+
+    PigServer pigServer;
+
+    /**
+     * Cleans the generated data set with pig and then runs the optional
+     * ad-hoc scripts.
+     *
+     * @param inputPath  generated data; must exist
+     * @param outputPath destination of the cleaned data; must not exist yet
+     * @param ex         pig execution type the server is started with
+     * @param scripts    additional pig scripts, each registered with
+     *                   "input" and "output" parameters
+     * @throws RuntimeException if the input is missing or the output exists
+     * @throws Exception whatever the file system or pig throws
+     */
+    public PigCSVCleaner(Path inputPath, Path outputPath, ExecType ex, File... scripts)
+            throws Exception {
+
+        FileSystem fs = FileSystem.get(inputPath.toUri(), new Configuration());
+
+        if(! fs.exists(inputPath)){
+            throw new RuntimeException("INPUT path DOES NOT exist : " + inputPath);
+        }
+
+        if(fs.exists(outputPath)){
+            throw new RuntimeException("OUTPUT already exists : " + outputPath);
+        }
+        // run pig with the execution type chosen by the caller
+        pigServer = new PigServer(ex);
+
+        /**
+         * First, split the tabs up.
+         *
+         * BigPetStore,storeCode_OK,2 yang,jay,Mon Dec 15 23:33:49 EST
+         * 1969,69.56,flea collar
+         *
+         * ("BigPetStore,storeCode_OK,2",
+         * "yang,jay,Mon Dec 15 23:33:49 EST 1969,69.56,flea collar")
+         *
+         * BigPetStore,storeCode_AK,1 amanda,fitzgerald,Sat Dec 20 09:44:25 EET
+         * 1969,7.5,cat-food
+         */
+        // Literal replace: replaceAll would interpret '$' and '\' in the
+        // path as regex replacement syntax and corrupt or reject the query.
+        pigServer.registerQuery("csvdata = LOAD '<i>' AS (ID,DETAILS);"
+                .replace("<i>", inputPath.toString()));
+
+        /**
+         * Now, we want to split the two tab delimited fields into uniform
+         * fields of comma separated values. To do this, we 1) Internally split
+         * the FIRST and SECOND fields by commas "a,b,c" --> (a,b,c) 2) FLATTEN
+         * the FIRST and SECOND fields. (d,e) (a,b,c) -> d e a b c
+         */
+        pigServer
+                .registerQuery(
+                        "id_details = FOREACH csvdata GENERATE "
+                        + "FLATTEN" + "(STRSPLIT(ID,',',3)) AS " +
+                        "(drop, code, transaction) ,"
+
+                        + "FLATTEN" + "(STRSPLIT(DETAILS,',',5)) AS " +
+                        "(lname, fname, date, price," +
+                        "product:chararray);");
+
+        pigServer.store("id_details", outputPath.toString());
+
+        /**
+         * Now we run scripts... this is where you can add some
+         * arbitrary analytics.
+         *
+         * We add "input" and "output" parameters so that each
+         * script can read them and use them if they want.
+         *
+         * Otherwise, just hardcode your inputs into your pig scripts.
+         */
+        int i = 0;
+        for(File script : scripts) {
+            Map<String,String> parameters = new HashMap<String,String>();
+            parameters.put("input",
+                    outputPath.toString());
+
+            // every script gets its own numbered output directory next to
+            // the cleaned data
+            Path dir = outputPath.getParent();
+            Path adHocOut=
+                    new Path(
+                            dir,
+                            BigPetStoreConstants.OUTPUTS.pig_ad_hoc_script.name()+(i++));
+            System.out.println("Setting default output to " + adHocOut);
+            parameters.put("output", adHocOut.toString());
+
+            pigServer.registerScript(script.getAbsolutePath(), parameters);
+        }
+    }
+
+    /**
+     * Turns the arguments from {@code startIndex} onwards into script files.
+     *
+     * @throws RuntimeException if one of the files does not exist
+     */
+    private static File[] files(String[] args,int startIndex) {
+        List<File> files = new ArrayList<File>();
+        for(int i = startIndex ; i < args.length ; i++) {
+            File f = new File(args[i]);
+            if(! f.exists()) {
+                throw new RuntimeException("Pig script arg " + i+ " " + f.getAbsolutePath() + " not found. ");
+            }
+            files.add(f);
+        }
+        System.out.println(
+                "Ad-hoc analytics:"+
+                "Added " + files.size() + " pig scripts to post process. "+
+                "Each one will be given $input and $output arguments.");
+        return files.toArray(new File[files.size()]);
+    }
+
+    /**
+     * Command line entry point: args[0] is the generated data directory,
+     * args[1] the pig output directory, any further arguments are pig
+     * scripts to run afterwards.
+     */
+    public static void main(final String[] args) throws Exception {
+        System.out.println("Starting pig etl " + args.length);
+
+        Configuration c = new Configuration();
+        int res = ToolRunner.run(
+                c,
+
+                new Tool() {
+                    Configuration conf;
+                    @Override
+                    public void setConf(Configuration conf) {
+                        this.conf=conf;
+                    }
+
+                    @Override
+                    public Configuration getConf() {
+                        return this.conf;
+                    }
+
+                    @Override
+                    public int run(String[] args) throws Exception {
+                        DeveloperTools.validate(
+                                args,
+                                "generated data directory",
+                                "pig output directory");
+                        new PigCSVCleaner(
+                                new Path(args[0]),
+                                new Path(args[1]),
+                                ExecType.MAPREDUCE,
+                                files(args,2));
+                        return 0;
+                    }
+                }, args);
+        System.exit(res);
+    }
+}
\ No newline at end of file