You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pig.apache.org by ol...@apache.org on 2008/11/14 00:08:22 UTC
svn commit: r713854 [3/3] - in /hadoop/pig/branches/types/tutorial: ./ data/
scripts/ src/ src/org/ src/org/apache/ src/org/apache/pig/
src/org/apache/pig/tutorial/
Propchange: hadoop/pig/branches/types/tutorial/data/excite-small.log
------------------------------------------------------------------------------
svn:executable = *
Added: hadoop/pig/branches/types/tutorial/data/excite.log.bz2
URL: http://svn.apache.org/viewvc/hadoop/pig/branches/types/tutorial/data/excite.log.bz2?rev=713854&view=auto
==============================================================================
Binary file - no diff available.
Propchange: hadoop/pig/branches/types/tutorial/data/excite.log.bz2
------------------------------------------------------------------------------
svn:executable = *
Propchange: hadoop/pig/branches/types/tutorial/data/excite.log.bz2
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: hadoop/pig/branches/types/tutorial/scripts/script1-hadoop.pig
URL: http://svn.apache.org/viewvc/hadoop/pig/branches/types/tutorial/scripts/script1-hadoop.pig?rev=713854&view=auto
==============================================================================
--- hadoop/pig/branches/types/tutorial/scripts/script1-hadoop.pig (added)
+++ hadoop/pig/branches/types/tutorial/scripts/script1-hadoop.pig Thu Nov 13 15:08:21 2008
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+-- Query Phrase Popularity (Hadoop cluster)
+
+-- This script processes a search query log file from the Excite search engine and finds search phrases that occur with particularly high frequency during certain times of the day.
+
+
+-- Register the tutorial JAR file so that the included UDFs can be called in the script.
+REGISTER ./tutorial.jar;
+
+-- Use the PigStorage function to load the excite log file into the raw bag as an array of records.
+-- Input: (user,time,query)
+raw = LOAD 'excite.log.bz2' USING PigStorage('\t') AS (user, time, query);
+
+
+-- Call the NonURLDetector UDF to remove records if the query field is empty or a URL.
+clean1 = FILTER raw BY org.apache.pig.tutorial.NonURLDetector(query);
+
+-- Call the ToLower UDF to change the query field to lowercase.
+clean2 = FOREACH clean1 GENERATE user, time, org.apache.pig.tutorial.ToLower(query) as query;
+
+-- Because the log file only contains queries for a single day, we are only interested in the hour.
+-- The excite query log timestamp format is YYMMDDHHMMSS.
+-- Call the ExtractHour UDF to extract the hour (HH) from the time field.
+houred = FOREACH clean2 GENERATE user, org.apache.pig.tutorial.ExtractHour(time) as hour, query;
+
+-- Call the NGramGenerator UDF to compose the n-grams of the query.
+ngramed1 = FOREACH houred GENERATE user, hour, flatten(org.apache.pig.tutorial.NGramGenerator(query)) as ngram;
+
+-- Use the DISTINCT command to get the unique n-grams for all records.
+ngramed2 = DISTINCT ngramed1;
+
+-- Use the GROUP command to group records by n-gram and hour.
+hour_frequency1 = GROUP ngramed2 BY (ngram, hour);
+
+-- Use the COUNT function to get the count (occurrences) of each n-gram.
+hour_frequency2 = FOREACH hour_frequency1 GENERATE flatten($0), COUNT($1) as count;
+
+-- Use the GROUP command to group records by n-gram only.
+-- Each group now corresponds to a distinct n-gram and has the count for each hour.
+uniq_frequency1 = GROUP hour_frequency2 BY group::ngram;
+
+-- For each group, identify the hour in which this n-gram is used with a particularly high frequency.
+-- Call the ScoreGenerator UDF to calculate a "popularity" score for the n-gram.
+uniq_frequency2 = FOREACH uniq_frequency1 GENERATE flatten($0), flatten(org.apache.pig.tutorial.ScoreGenerator($1));
+
+-- Use the FOREACH-GENERATE command to assign names to the fields.
+uniq_frequency3 = FOREACH uniq_frequency2 GENERATE $1 as hour, $0 as ngram, $2 as score, $3 as count, $4 as mean;
+
+-- Use the FILTER command to remove all records with a score less than or equal to 2.0.
+filtered_uniq_frequency = FILTER uniq_frequency3 BY score > 2.0;
+
+-- Use the ORDER command to sort the remaining records by hour and score.
+ordered_uniq_frequency = ORDER filtered_uniq_frequency BY hour, score;
+
+-- Use the PigStorage function to store the results.
+-- Output: (hour, n-gram, score, count, average_counts_among_all_hours)
+STORE ordered_uniq_frequency INTO 'script1-hadoop-results' USING PigStorage();
Propchange: hadoop/pig/branches/types/tutorial/scripts/script1-hadoop.pig
------------------------------------------------------------------------------
svn:executable = *
Added: hadoop/pig/branches/types/tutorial/scripts/script1-local.pig
URL: http://svn.apache.org/viewvc/hadoop/pig/branches/types/tutorial/scripts/script1-local.pig?rev=713854&view=auto
==============================================================================
--- hadoop/pig/branches/types/tutorial/scripts/script1-local.pig (added)
+++ hadoop/pig/branches/types/tutorial/scripts/script1-local.pig Thu Nov 13 15:08:21 2008
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+-- Query Phrase Popularity (local mode)
+
+-- This script processes a search query log file from the Excite search engine and finds search phrases that occur with particularly high frequency during certain times of the day.
+
+-- Register the tutorial JAR file so that the included UDFs can be called in the script.
+REGISTER ./tutorial.jar;
+
+-- Use the PigStorage function to load the excite log file into the raw bag as an array of records.
+-- Input: (user,time,query)
+raw = LOAD 'excite-small.log' USING PigStorage('\t') AS (user, time, query);
+
+-- Call the NonURLDetector UDF to remove records if the query field is empty or a URL.
+clean1 = FILTER raw BY org.apache.pig.tutorial.NonURLDetector(query);
+
+-- Call the ToLower UDF to change the query field to lowercase.
+clean2 = FOREACH clean1 GENERATE user, time, org.apache.pig.tutorial.ToLower(query) as query;
+
+-- Because the log file only contains queries for a single day, we are only interested in the hour.
+-- The excite query log timestamp format is YYMMDDHHMMSS.
+-- Call the ExtractHour UDF to extract the hour (HH) from the time field.
+houred = FOREACH clean2 GENERATE user, org.apache.pig.tutorial.ExtractHour(time) as hour, query;
+
+-- Call the NGramGenerator UDF to compose the n-grams of the query.
+ngramed1 = FOREACH houred GENERATE user, hour, flatten(org.apache.pig.tutorial.NGramGenerator(query)) as ngram;
+
+-- Use the DISTINCT command to get the unique n-grams for all records.
+ngramed2 = DISTINCT ngramed1;
+
+-- Use the GROUP command to group records by n-gram and hour.
+hour_frequency1 = GROUP ngramed2 BY (ngram, hour);
+
+-- Use the COUNT function to get the count (occurrences) of each n-gram.
+hour_frequency2 = FOREACH hour_frequency1 GENERATE flatten($0), COUNT($1) as count;
+
+-- Use the GROUP command to group records by n-gram only.
+-- Each group now corresponds to a distinct n-gram and has the count for each hour.
+uniq_frequency1 = GROUP hour_frequency2 BY group::ngram;
+
+-- For each group, identify the hour in which this n-gram is used with a particularly high frequency.
+-- Call the ScoreGenerator UDF to calculate a "popularity" score for the n-gram.
+uniq_frequency2 = FOREACH uniq_frequency1 GENERATE flatten($0), flatten(org.apache.pig.tutorial.ScoreGenerator($1));
+
+-- Use the FOREACH-GENERATE command to assign names to the fields.
+uniq_frequency3 = FOREACH uniq_frequency2 GENERATE $1 as hour, $0 as ngram, $2 as score, $3 as count, $4 as mean;
+
+-- Use the FILTER command to remove all records with a score less than or equal to 2.0.
+filtered_uniq_frequency = FILTER uniq_frequency3 BY score > 2.0;
+
+-- Use the ORDER command to sort the remaining records by hour and score.
+ordered_uniq_frequency = ORDER filtered_uniq_frequency BY hour, score;
+
+-- Use the PigStorage function to store the results.
+-- Output: (hour, n-gram, score, count, average_counts_among_all_hours)
+STORE ordered_uniq_frequency INTO 'script1-local-results.txt' USING PigStorage();
Propchange: hadoop/pig/branches/types/tutorial/scripts/script1-local.pig
------------------------------------------------------------------------------
svn:executable = *
Added: hadoop/pig/branches/types/tutorial/scripts/script2-hadoop.pig
URL: http://svn.apache.org/viewvc/hadoop/pig/branches/types/tutorial/scripts/script2-hadoop.pig?rev=713854&view=auto
==============================================================================
--- hadoop/pig/branches/types/tutorial/scripts/script2-hadoop.pig (added)
+++ hadoop/pig/branches/types/tutorial/scripts/script2-hadoop.pig Thu Nov 13 15:08:21 2008
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+-- Temporal Query Phrase Popularity (Hadoop cluster)
+
+-- This script processes a search query log file from the Excite search engine and compares the frequency of occurrence of search phrases across two time periods separated by twelve hours.
+
+-- Register the tutorial JAR file so that the included UDFs can be called in the script.
+REGISTER ./tutorial.jar;
+
+-- Use the PigStorage function to load the excite log file into the raw bag as an array of records.
+-- Input: (user,time,query)
+raw = LOAD 'excite.log.bz2' USING PigStorage('\t') AS (user, time, query);
+
+-- Call the NonURLDetector UDF to remove records if the query field is empty or a URL.
+clean1 = FILTER raw BY org.apache.pig.tutorial.NonURLDetector(query);
+
+-- Call the ToLower UDF to change the query field to lowercase.
+clean2 = FOREACH clean1 GENERATE user, time, org.apache.pig.tutorial.ToLower(query) as query;
+
+-- Because the log file only contains queries for a single day, we are only interested in the hour.
+-- The excite query log timestamp format is YYMMDDHHMMSS.
+-- Call the ExtractHour UDF to extract the hour (HH) from the time field.
+houred = FOREACH clean2 GENERATE user, org.apache.pig.tutorial.ExtractHour(time) as hour, query;
+
+-- Call the NGramGenerator UDF to compose the n-grams of the query.
+ngramed1 = FOREACH houred GENERATE user, hour, flatten(org.apache.pig.tutorial.NGramGenerator(query)) as ngram;
+
+-- Use the DISTINCT command to get the unique n-grams for all records.
+ngramed2 = DISTINCT ngramed1;
+
+-- Use the GROUP command to group records by n-gram and hour.
+hour_frequency1 = GROUP ngramed2 BY (ngram, hour);
+
+-- Use the COUNT function to get the count (occurrences) of each n-gram.
+hour_frequency2 = FOREACH hour_frequency1 GENERATE flatten($0), COUNT($1) as count;
+
+-- Use the FOREACH-GENERATE command to assign names to the fields.
+hour_frequency3 = FOREACH hour_frequency2 GENERATE $0 as ngram, $1 as hour, $2 as count;
+
+-- Use the FILTER command to get the n-grams for hour 00.
+hour00 = FILTER hour_frequency2 BY hour eq '00';
+
+-- Use the FILTER command to get the n-grams for hour 12.
+hour12 = FILTER hour_frequency3 BY hour eq '12';
+
+-- Use the JOIN command to get the n-grams that appear in both hours.
+same = JOIN hour00 BY $0, hour12 BY $0;
+
+-- Use the FOREACH-GENERATE command to record their frequency.
+same1 = FOREACH same GENERATE hour00::group::ngram as ngram, $2 as count00, $5 as count12;
+
+-- Use the PigStorage function to store the results.
+-- Output: (n-gram, count_at_hour_00, count_at_hour_12)
+STORE same1 INTO 'script2-hadoop-results' USING PigStorage();
Propchange: hadoop/pig/branches/types/tutorial/scripts/script2-hadoop.pig
------------------------------------------------------------------------------
svn:executable = *
Added: hadoop/pig/branches/types/tutorial/scripts/script2-local.pig
URL: http://svn.apache.org/viewvc/hadoop/pig/branches/types/tutorial/scripts/script2-local.pig?rev=713854&view=auto
==============================================================================
--- hadoop/pig/branches/types/tutorial/scripts/script2-local.pig (added)
+++ hadoop/pig/branches/types/tutorial/scripts/script2-local.pig Thu Nov 13 15:08:21 2008
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+-- Temporal Query Phrase Popularity (local mode)
+
+-- This script processes a search query log file from the Excite search engine and compares the frequency of occurrence of search phrases across two time periods separated by twelve hours.
+
+-- Register the tutorial JAR file so that the included UDFs can be called in the script.
+REGISTER ./tutorial.jar;
+
+-- Use the PigStorage function to load the excite log file into the raw bag as an array of records.
+-- Input: (user,time,query)
+raw = LOAD 'excite-small.log' USING PigStorage('\t') AS (user, time, query);
+
+-- Call the NonURLDetector UDF to remove records if the query field is empty or a URL.
+clean1 = FILTER raw BY org.apache.pig.tutorial.NonURLDetector(query);
+
+-- Call the ToLower UDF to change the query field to lowercase.
+clean2 = FOREACH clean1 GENERATE user, time, org.apache.pig.tutorial.ToLower(query) as query;
+
+-- Because the log file only contains queries for a single day, we are only interested in the hour.
+-- The excite query log timestamp format is YYMMDDHHMMSS.
+-- Call the ExtractHour UDF to extract the hour (HH) from the time field.
+houred = FOREACH clean2 GENERATE user, org.apache.pig.tutorial.ExtractHour(time) as hour, query;
+
+-- Call the NGramGenerator UDF to compose the n-grams of the query.
+ngramed1 = FOREACH houred GENERATE user, hour, flatten(org.apache.pig.tutorial.NGramGenerator(query)) as ngram;
+
+-- Use the DISTINCT command to get the unique n-grams for all records.
+ngramed2 = DISTINCT ngramed1;
+
+-- Use the GROUP command to group records by n-gram and hour.
+hour_frequency1 = GROUP ngramed2 BY (ngram, hour);
+
+-- Use the COUNT function to get the count (occurrences) of each n-gram.
+hour_frequency2 = FOREACH hour_frequency1 GENERATE flatten($0), COUNT($1) as count;
+
+-- Use the FOREACH-GENERATE command to assign names to the fields.
+hour_frequency3 = FOREACH hour_frequency2 GENERATE $0 as ngram, $1 as hour, $2 as count;
+
+-- Use the FILTER command to get the n-grams for hour 00.
+hour00 = FILTER hour_frequency2 BY hour eq '00';
+
+-- Use the FILTER command to get the n-grams for hour 12.
+hour12 = FILTER hour_frequency3 BY hour eq '12';
+
+-- Use the JOIN command to get the n-grams that appear in both hours.
+same = JOIN hour00 BY $0, hour12 BY $0;
+
+-- Use the FOREACH-GENERATE command to record their frequency.
+same1 = FOREACH same GENERATE hour00::group::ngram as ngram, $2 as count00, $5 as count12;
+
+-- Use the PigStorage function to store the results.
+-- Output: (n-gram, count_at_hour_00, count_at_hour_12)
+STORE same1 INTO 'script2-local-results.txt' USING PigStorage();
Propchange: hadoop/pig/branches/types/tutorial/scripts/script2-local.pig
------------------------------------------------------------------------------
svn:executable = *
Added: hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/ExtractHour.java
URL: http://svn.apache.org/viewvc/hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/ExtractHour.java?rev=713854&view=auto
==============================================================================
--- hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/ExtractHour.java (added)
+++ hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/ExtractHour.java Thu Nov 13 15:08:21 2008
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pig.tutorial;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.ArrayList;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.FuncSpec;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.data.DataType;
+import org.apache.pig.impl.logicalLayer.schema.Schema;
+import org.apache.pig.impl.logicalLayer.FrontendException;
+
+/**
+ * The excite query log timestamp format is YYMMDDHHMMSS
+ * This function extracts the hour, HH
+ */
+public class ExtractHour extends EvalFunc<String> {
+ public String exec(Tuple input) throws IOException {
+ if (input == null || input.size() == 0)
+ return null;
+ try{
+ String timestamp = (String)input.get(0);
+ return timestamp.substring(6, 8);
+ }catch(Exception e){
+ System.err.println("ExtractHour: failed to proces input; error - " + e.getMessage());
+ return null;
+ }
+ }
+
+ @Override
+ /**
+ * This method gives a name to the column.
+ * @param input - schema of the input data
+ * @return schema of the output data
+ */
+ public Schema outputSchema(Schema input) {
+ return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(), input), DataType.CHARARRAY));
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.pig.EvalFunc#getArgToFuncMapping()
+ * This is needed to make sure that both bytearrays and chararrays can be passed as arguments
+ */
+ @Override
+ public List<FuncSpec> getArgToFuncMapping() throws FrontendException {
+ List<FuncSpec> funcList = new ArrayList<FuncSpec>();
+ funcList.add(new FuncSpec(this.getClass().getName(), new Schema(new Schema.FieldSchema(null, DataType.CHARARRAY))));
+
+ return funcList;
+ }
+}
Added: hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/NGramGenerator.java
URL: http://svn.apache.org/viewvc/hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/NGramGenerator.java?rev=713854&view=auto
==============================================================================
--- hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/NGramGenerator.java (added)
+++ hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/NGramGenerator.java Thu Nov 13 15:08:21 2008
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pig.tutorial;
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.List;
+import java.util.ArrayList;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.FuncSpec;
+import org.apache.pig.data.DataBag;
+import org.apache.pig.data.DefaultBagFactory;
+import org.apache.pig.data.DefaultTupleFactory;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.impl.logicalLayer.schema.Schema;
+import org.apache.pig.data.DataType;
+import org.apache.pig.impl.logicalLayer.FrontendException;
+
+/**
+ * This function divides a search query string into words and extracts
+ * n-grams with up to _ngramSizeLimit length.
+ * Example 1: if query = "a real nice query" and _ngramSizeLimit = 2,
+ * the query is split into: a, real, nice, query, a real, real nice, nice query
+ * Example 2: if record = (u1, h1, pig hadoop) and _ngramSizeLimit = 2,
+ * the record is split into: (u1, h1, pig), (u1, h1, hadoop), (u1, h1, pig hadoop)
+ */
+public class NGramGenerator extends EvalFunc<DataBag> {
+
+ private static final int _ngramSizeLimit = 2;
+
+ public DataBag exec(Tuple input) throws IOException {
+ if (input == null || input.size() == 0)
+ return null;
+ try{
+ DataBag output = DefaultBagFactory.getInstance().newDefaultBag();
+ String query = (String)input.get(0);
+ String[] words = TutorialUtil.splitToWords(query);
+ Set<String> ngrams = new HashSet<String>();
+ TutorialUtil.makeNGram(words, ngrams, _ngramSizeLimit);
+ for (String ngram : ngrams) {
+ Tuple t = DefaultTupleFactory.getInstance().newTuple(1);
+ t.set(0, ngram);
+ output.add(t);
+ }
+ return output;
+ }catch(Exception e){
+ System.err.println("NGramGenerator: failed to process input; error - " + e.getMessage());
+ return null;
+ }
+ }
+
+ @Override
+ /**
+ * This method gives a name to the column.
+ * @param input - schema of the input data
+ * @return schema of the output data
+ */
+ public Schema outputSchema(Schema input) {
+ Schema bagSchema = new Schema();
+ bagSchema.add(new Schema.FieldSchema("ngram", DataType.CHARARRAY));
+ try{
+ return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(), input),
+ bagSchema, DataType.BAG));
+ }catch (FrontendException e){
+ return null;
+ }
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.pig.EvalFunc#getArgToFuncMapping()
+ * This is needed to make sure that both bytearrays and chararrays can be passed as arguments
+ */
+ @Override
+ public List<FuncSpec> getArgToFuncMapping() throws FrontendException {
+ List<FuncSpec> funcList = new ArrayList<FuncSpec>();
+ funcList.add(new FuncSpec(this.getClass().getName(), new Schema(new Schema.FieldSchema(null, DataType.CHARARRAY))));
+
+ return funcList;
+ }
+
+}
Added: hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/NonURLDetector.java
URL: http://svn.apache.org/viewvc/hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/NonURLDetector.java?rev=713854&view=auto
==============================================================================
--- hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/NonURLDetector.java (added)
+++ hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/NonURLDetector.java Thu Nov 13 15:08:21 2008
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pig.tutorial;
+
+import java.io.IOException;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.List;
+import java.util.ArrayList;
+
+import org.apache.pig.FilterFunc;
+import org.apache.pig.FuncSpec;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.impl.logicalLayer.schema.Schema;
+import org.apache.pig.data.DataType;
+import org.apache.pig.impl.logicalLayer.FrontendException;
+
+/**
+ * This function removes search queries that are URLs (as defined by _urlPattern).
+ * This function also removes empty queries.
+ */
+public class NonURLDetector extends FilterFunc {
+
+ private Pattern _urlPattern = Pattern.compile("^[\"]?(http[:|;])|(https[:|;])|(www\\.)");
+
+ public Boolean exec(Tuple arg0) throws IOException {
+ if (arg0 == null || arg0.size() == 0)
+ return false;
+
+ String query;
+ try{
+ query = (String)arg0.get(0);
+ if(query == null)
+ return false;
+ query = query.trim();
+ }catch(Exception e){
+ System.err.println("NonURLDetector: failed to process input; error - " + e.getMessage());
+ return false;
+ }
+
+ if (query.equals("")) {
+ return false;
+ }
+ Matcher m = _urlPattern.matcher(query);
+ if (m.find()) {
+ return false;
+ }
+ return true;
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.pig.EvalFunc#getArgToFuncMapping()
+ * This is needed to make sure that both bytearrays and chararrays can be passed as arguments
+ */
+ @Override
+ public List<FuncSpec> getArgToFuncMapping() throws FrontendException {
+ List<FuncSpec> funcList = new ArrayList<FuncSpec>();
+ funcList.add(new FuncSpec(this.getClass().getName(), new Schema(new Schema.FieldSchema(null, DataType.CHARARRAY))));
+
+ return funcList;
+ }
+
+}
Added: hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/ScoreGenerator.java
URL: http://svn.apache.org/viewvc/hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/ScoreGenerator.java?rev=713854&view=auto
==============================================================================
--- hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/ScoreGenerator.java (added)
+++ hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/ScoreGenerator.java Thu Nov 13 15:08:21 2008
@@ -0,0 +1,139 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pig.tutorial;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.data.DataBag;
+import org.apache.pig.data.DefaultBagFactory;
+import org.apache.pig.data.DefaultTupleFactory;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.data.DataType;
+import org.apache.pig.impl.logicalLayer.schema.Schema;
+import org.apache.pig.impl.logicalLayer.FrontendException;
+
+/**
+ * For each n-gram, we have a set of (hour, count) pairs.
+ *
+ * This function reads the set and retains those hours with an
+ * above-mean count, and calculates the score of each retained hour as the
+ * multiplier of the count of the hour over the standard deviation.
+ *
+ * A score greater than 1.0 indicates the frequency of this n-gram
+ * in this particular hour is at least one standard deviation away
+ * from the average frequency among all hours
+ */
+
+public class ScoreGenerator extends EvalFunc<DataBag> {
+
+ private static double computeMean(List<Long> counts) {
+ int numCounts = counts.size();
+
+ // compute mean
+ double mean = 0.0;
+ for (Long count : counts) {
+ mean += ((double) count) / ((double) numCounts);
+ }
+
+ return mean;
+ }
+
+ private static double computeSD(List<Long> counts, double mean) {
+ int numCounts = counts.size();
+
+ // compute deviation
+ double deviation = 0.0;
+ for (Long count : counts) {
+ double d = ((double) count) - mean;
+ deviation += d * d / ((double) numCounts);
+ }
+
+ return Math.sqrt(deviation);
+ }
+
+ public DataBag exec(Tuple input) throws IOException {
+ if (input == null || input.size() == 0)
+ return null;
+ try{
+ DataBag output = DefaultBagFactory.getInstance().newDefaultBag();
+ DataBag in = (DataBag)input.get(0);
+
+ Map<String, Long> pairs = new HashMap<String, Long>();
+ List<Long> counts = new ArrayList<Long> ();
+
+ Iterator<Tuple> it = in.iterator();
+ while (it.hasNext()) {
+ Tuple t = it.next();
+ String hour = (String)t.get(1);
+ Long count = (Long)t.get(2);
+ pairs.put(hour, count);
+ counts.add(count);
+ }
+
+ double mean = computeMean(counts);
+ double standardDeviation = computeSD(counts, mean);
+
+ Iterator<String> it2 = pairs.keySet().iterator();
+ while (it2.hasNext()) {
+ String hour = it2.next();
+ Long count = pairs.get(hour);
+ if ( count > mean ) {
+ Tuple t = DefaultTupleFactory.getInstance().newTuple(4);
+ t.set(0, hour);
+ t.set(1, ((double) count - mean) / standardDeviation ); // the score
+ t.set(2, count);
+ t.set(3, mean);
+ output.add(t);
+ }
+ }
+ return output;
+ }catch (Exception e){
+ System.err.println("ScoreGenerator: failed to process input; error - " + e.getMessage());
+ return null;
+ }
+ }
+
+ @Override
+ /**
+ * This method gives a name to the column.
+ * @param input - schema of the input data
+ * @return schema of the output data
+ */
+ public Schema outputSchema(Schema input) {
+ Schema bagSchema = new Schema();
+ bagSchema.add(new Schema.FieldSchema("hour", DataType.CHARARRAY));
+ bagSchema.add(new Schema.FieldSchema("score", DataType.DOUBLE));
+ bagSchema.add(new Schema.FieldSchema("count", DataType.LONG));
+ bagSchema.add(new Schema.FieldSchema("mean", DataType.DOUBLE));
+ //TODO
+ //Here the schema of the bag is the schema of the tuple inside the bag
+ //We need to change this so that the bag has the tuple and the tuple has the schema
+ try{
+ return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(), input), bagSchema, DataType.BAG));
+ }catch (FrontendException e){
+ return null;
+ }
+ }
+
+}
Added: hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/ToLower.java
URL: http://svn.apache.org/viewvc/hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/ToLower.java?rev=713854&view=auto
==============================================================================
--- hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/ToLower.java (added)
+++ hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/ToLower.java Thu Nov 13 15:08:21 2008
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pig.tutorial;
+
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

import org.apache.pig.EvalFunc;
import org.apache.pig.FuncSpec;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
+
+/**
+ * This function converts the input into lowercase and
+ * removes leading and trailing white spaces.
+ */
+public class ToLower extends EvalFunc<String> {
+ public String exec(Tuple input) throws IOException {
+ if(input == null || input.size() == 0)
+ return null;
+ try{
+ String query = (String)input.get(0);
+ return query.toLowerCase().trim();
+ }catch(Exception e){
+ System.err.println("ToLower: failed to process input; error - " + e.getMessage());
+ return null;
+ }
+ }
+
+ @Override
+ /**
+ * This method gives a name to the column.
+ * @param input - schema of the input data
+ * @return schema of the output data
+ */
+ public Schema outputSchema(Schema input) {
+ return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(), input), DataType.CHARARRAY));
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.pig.EvalFunc#getArgToFuncMapping()
+ * This is needed to make sure that both bytearrays and chararrays can be passed as arguments
+ */
+ @Override
+ public List<FuncSpec> getArgToFuncMapping() throws FrontendException {
+ List<FuncSpec> funcList = new ArrayList<FuncSpec>();
+ funcList.add(new FuncSpec(this.getClass().getName(), new Schema(new Schema.FieldSchema(null, DataType.CHARARRAY))));
+
+ return funcList;
+ }
+
+}
Added: hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/TutorialTest.java
URL: http://svn.apache.org/viewvc/hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/TutorialTest.java?rev=713854&view=auto
==============================================================================
--- hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/TutorialTest.java (added)
+++ hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/TutorialTest.java Thu Nov 13 15:08:21 2008
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pig.tutorial;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.FilterFunc;
+import org.apache.pig.data.DataBag;
+import org.apache.pig.data.DefaultBagFactory;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.data.DefaultTupleFactory;
+
+public class TutorialTest {
+
+ private static Tuple[] getTuples(String[] queries) {
+ Tuple[] tuples = new Tuple[queries.length];
+ for (int i = 0; i < tuples.length; i++) {
+ tuples[i] = DefaultTupleFactory.getInstance().newTuple(1);
+ try{tuples[i].set(0, queries[i]);}catch(Exception e){}
+ }
+ return tuples;
+ }
+
+ public static String[] testDataAtomEvals(EvalFunc<String> eval, Tuple[] tuples) {
+ List<String> res = new ArrayList<String>();
+ try {
+ for (Tuple t : tuples) {
+ String output = eval.exec(t);
+ System.out.println("Converted: " + t + " to (" + output + ")");
+ res.add(output);
+ }
+ } catch (IOException e) {
+ e.printStackTrace();
+ System.exit(1);
+ }
+
+ System.out.println("===");
+ return res.toArray(new String[res.size()]);
+ }
+
+ public static DataBag[] testDataBagEvals(EvalFunc<DataBag> eval, Tuple[] tuples) {
+ List<DataBag> res = new ArrayList<DataBag>();
+ try {
+ for (Tuple t : tuples) {
+ DataBag output = eval.exec(t);
+ System.out.println("Converted: " + t + " to (" + output + ")");
+ res.add(output);
+ }
+ } catch (IOException e) {
+ e.printStackTrace();
+ System.exit(1);
+ }
+
+ System.out.println("===");
+ return res.toArray(new DataBag[res.size()]);
+ }
+
+ public static String[] testFilters (FilterFunc filter, Tuple[] tuples) {
+ List<String> res = new ArrayList<String>();
+ try {
+ for (Tuple t : tuples) {
+ if (filter.exec(t)) {
+ System.out.println("accepted: " + t);
+ res.add((String)t.get(0));
+ } else {
+ System.out.println("rejected: " + t);
+ }
+ }
+ } catch (Exception e) {
+ e.printStackTrace();
+ System.exit(1);
+ }
+
+ System.out.println("===");
+ return res.toArray(new String[res.size()]);
+ }
+
+ public static void main(String[] args) {
+ String[] queries = {
+ "http://www.yahoo.com/",
+ "\"http://www.yahoo.com/\"",
+ " http;//www.yahoo.com/ ",
+ "https://www.yahoo.com/",
+ "www.yahoo.com/",
+ "\"www.yahoo.com/\"",
+ "a real nice query ",
+ "an UPPER CASE query",
+ " ",
+ " nude picture",
+ " +XXX",
+ "\" +porno \"",
+ };
+
+ NonURLDetector filter1 = new NonURLDetector();
+ String[] q1 = testFilters(filter1, getTuples(queries));
+
+ ToLower eval1 = new ToLower();
+ String[] q2 = testDataAtomEvals(eval1, getTuples(q1));
+
+ String[] timestamps = {
+ "970916072134",
+ "970916072311",
+ "970916123431",
+ };
+
+ ExtractHour eval2 = new ExtractHour();
+ testDataAtomEvals(eval2, getTuples(timestamps));
+
+ DataBag bag = DefaultBagFactory.getInstance().newDefaultBag();
+
+ Tuple t1 = DefaultTupleFactory.getInstance().newTuple(3);
+ try{
+ t1.set(0, "word");
+ t1.set(1, "02");
+ t1.set(2, 2);
+ }catch(Exception e){}
+ bag.add(t1);
+
+ Tuple t2 = DefaultTupleFactory.getInstance().newTuple(3);
+ try{
+ t2.set(0, "word");
+ t2.set(1, "05");
+ t2.set(2, 2);
+ }catch(Exception e){}
+ bag.add(t2);
+
+ Tuple t3 = DefaultTupleFactory.getInstance().newTuple(3);
+ try{
+ t3.set(0, "word");
+ t3.set(1, "04");
+ t3.set(2, 3);
+ }catch(Exception e){}
+ bag.add(t3);
+
+ Tuple t4 = DefaultTupleFactory.getInstance().newTuple(3);
+ try{
+ t4.set(0, "word");
+ t4.set(1, "06");
+ t4.set(2, 4);
+ }catch(Exception e){}
+ bag.add(t4);
+
+ Tuple[] t = new Tuple[1];
+ t[0] = DefaultTupleFactory.getInstance().newTuple(1);
+ try{
+ t[0].set(0, bag);
+ }catch(Exception e){}
+
+ ScoreGenerator eval4 = new ScoreGenerator();
+ testDataBagEvals(eval4, t);
+ }
+}
Added: hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/TutorialUtil.java
URL: http://svn.apache.org/viewvc/hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/TutorialUtil.java?rev=713854&view=auto
==============================================================================
--- hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/TutorialUtil.java (added)
+++ hadoop/pig/branches/types/tutorial/src/org/apache/pig/tutorial/TutorialUtil.java Thu Nov 13 15:08:21 2008
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pig.tutorial;
+
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Set;
+
public class TutorialUtil {

    /**
     * Splits a search query into its non-empty words.  Boundaries are any
     * non-word characters (the regex \W), so punctuation is discarded.
     *
     * @param query the raw query string
     * @return the words of the query, in order, with empty fragments removed
     */
    protected static String[] splitToWords(String query) {
        String[] pieces = query.split("\\W");
        List<String> words = new LinkedList<String>();
        for (int i = 0; i < pieces.length; i++) {
            if (pieces[i].length() > 0) {
                words.add(pieces[i]);
            }
        }
        return words.toArray(new String[words.size()]);
    }

    /**
     * Adds to the given set every word-level n-gram of the input, for each
     * length from {@code size} down to 1 (handled by the tail recursion).
     * Each n-gram is the space-joined run of consecutive words.
     *
     * @param words the word sequence
     * @param ngrams receives the generated n-grams
     * @param size the maximum n-gram length
     */
    protected static void makeNGram(String[] words, Set<String> ngrams, int size) {
        int last = words.length - size;
        for (int start = 0; start <= last; start++) {
            StringBuilder gram = new StringBuilder();
            for (int k = 0; k < size; k++) {
                gram.append(words[start + k]).append(" ");
            }
            // Drop the trailing separator appended by the loop above.
            gram.deleteCharAt(gram.length() - 1);
            ngrams.add(gram.toString());
        }
        if (size > 1) {
            makeNGram(words, ngrams, size - 1);
        }
    }

}