You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datafu.apache.org by mh...@apache.org on 2014/01/31 05:17:34 UTC
[1/4] git commit: DATAFU-8: Port OpenNLP to DataFu
Updated Branches:
refs/heads/master 9a4264504 -> d8cec6a5c
DATAFU-8: Port OpenNLP to DataFu
https://issues.apache.org/jira/browse/DATAFU-8
Signed-off-by: Matt Hayes <mh...@linkedin.com>
Project: http://git-wip-us.apache.org/repos/asf/incubator-datafu/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-datafu/commit/99e46e2c
Tree: http://git-wip-us.apache.org/repos/asf/incubator-datafu/tree/99e46e2c
Diff: http://git-wip-us.apache.org/repos/asf/incubator-datafu/diff/99e46e2c
Branch: refs/heads/master
Commit: 99e46e2cd80be6d7cecf56308b17a707d61d6660
Parents: 41a0c2c
Author: Russell Jurney <ru...@gmail.com>
Authored: Wed Jan 29 15:04:41 2014 -0800
Committer: Matt Hayes <mh...@linkedin.com>
Committed: Wed Jan 29 15:04:57 2014 -0800
----------------------------------------------------------------------
.gitignore | 4 +
.travis.yml | 1 +
build.xml | 16 +-
ivy.xml | 7 +-
ivy/libraries.properties | 2 +
.../datafu/pig/text/opennlp/CachedFile.java | 41 ++++
src/java/datafu/pig/text/opennlp/POSTag.java | 177 +++++++++++++++++
.../datafu/pig/text/opennlp/SentenceDetect.java | 122 ++++++++++++
.../datafu/pig/text/opennlp/TokenizeME.java | 126 ++++++++++++
.../datafu/pig/text/opennlp/TokenizeSimple.java | 101 ++++++++++
.../pig/text/opennlp/TokenizeWhitespace.java | 102 ++++++++++
test/pig/datafu/test/pig/text/NLPTests.java | 195 +++++++++++++++++++
12 files changed, 889 insertions(+), 5 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/99e46e2c/.gitignore
----------------------------------------------------------------------
diff --git a/.gitignore b/.gitignore
index fd6a420..81e9ae8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,3 +15,7 @@ pigunit-input-overriden.txt
*.asc
/bin/
.ant-targets-build.xml
+/data/
+*.iml
+.idea
+.DS_Store
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/99e46e2c/.travis.yml
----------------------------------------------------------------------
diff --git a/.travis.yml b/.travis.yml
index 8fd4bc7..669b97a 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -12,5 +12,6 @@ env:
- TESTFOLDER=stats
- TESTFOLDER=urls
- TESTFOLDER=util
+ - TESTFOLDER=text
jdk:
- openjdk6
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/99e46e2c/build.xml
----------------------------------------------------------------------
diff --git a/build.xml b/build.xml
index 8be0cbf..e04bc4e 100644
--- a/build.xml
+++ b/build.xml
@@ -27,6 +27,7 @@
<property name="test.dir" value="${basedir}/test" />
<property name="pigtestsrc.dir" value="${test.dir}/pig" />
<property name="dist.dir" value="${basedir}/dist" />
+ <property name="data.dir" value="${basedir}/data" />
<property name="tools.dir" value="${basedir}/tools" />
<property name="lib.dir" value="${basedir}/lib" />
<property name="common.lib.dir" value="${lib.dir}/common" />
@@ -49,6 +50,7 @@
<property name="commons-math.jar" value="commons-math-jar-${commons-math.version}.jar" />
<property name="guava.jar" value="guava-jar-${guava.version}.jar" />
<property name="stream.jar" value="stream-jar-${stream.version}.jar" />
+ <property name="opennlp.jar" value="opennlp-tools-bundle-${opennlp.version}.jar" />
<!-- Java configuration -->
<property name="targetJavaVersion" value="1.5" />
@@ -109,6 +111,13 @@
<mkdir dir="${ivy.jar.dir}"/>
<get src="${maven.jar.repo.url}" dest="${maven.jar}" usetimestamp="true"/>
</target>
+
+ <target name="opennlp-model-download" description="Download OpenNLP models">
+ <mkdir dir="${data.dir}"/>
+ <get src="http://opennlp.sourceforge.net/models-1.5/en-pos-maxent.bin" dest="${data.dir}/en-pos-maxent.bin" usetimestamp="true"/>
+ <get src="http://opennlp.sourceforge.net/models-1.5/en-sent.bin" dest="${data.dir}/en-sent.bin" usetimestamp="true"/>
+ <get src="http://opennlp.sourceforge.net/models-1.5/en-token.bin" dest="${data.dir}/en-token.bin" usetimestamp="true"/>
+ </target>
<target name="maven-taskdef" depends="maven-ant-tasks-jar-download">
<path id="mvn-ant-task.classpath" path="${maven.jar}"/>
@@ -346,7 +355,7 @@
<delete file="${dist.dir}/${final.name}-orig.jar" />
<move file="${dist.dir}/${final.name}.jar" tofile="${dist.dir}/${final.name}-orig.jar" />
<java jar="${tools.dir}/autojar.jar" fork="true">
- <arg line="-baeq -o ${dist.dir}/${final.name}.jar -c ${packaged.lib.dir}/${fastutil.jar}:${packaged.lib.dir}/${commons-math.jar}:${packaged.lib.dir}/${stream.jar}:${packaged.lib.dir}/${guava.jar} ${dist.dir}/${final.name}-orig.jar" />
+ <arg line="-baeq -o ${dist.dir}/${final.name}.jar -c ${packaged.lib.dir}/*.jar ${dist.dir}/${final.name}-orig.jar" />
</java>
<delete file="${dist.dir}/${final.name}-orig.jar" />
@@ -360,6 +369,7 @@
<rule pattern="org.apache.commons.math.**" result="datafu.org.apache.commons.math.@1"/>
<rule pattern="com.clearspring.analytics.**" result="datafu.com.clearspring.analytics.@1"/>
<rule pattern="com.google.common.**" result="datafu.com.google.common.@1"/>
+ <rule pattern="opennlp.**" result="datafu.opennlp.@1"/>
</jarjar>
<delete file="${dist.dir}/${final.name}-orig.jar" />
</target>
@@ -388,7 +398,7 @@
</jarjar>
</target>
- <target name="test" depends="build-pig-tests, jar" description="Runs the pig tests">
+ <target name="test" depends="build-pig-tests, jar, opennlp-model-download" description="Runs the pig tests">
<taskdef resource="testngtasks" classpath="${tools.lib.dir}/testng-jar-${testng.version}.jar"/>
<testng classpathref="run-tests-classpath" methods="${test.methods}"
outputDir="${report.dir}" verbose="2" haltonfailure="true" haltonskipped="true">
@@ -398,7 +408,7 @@
</testng>
</target>
- <target name="test-instrumented" depends="build-pig-tests, jar-instrumented" description="Runs the tests with instrumented JARs">
+ <target name="test-instrumented" depends="build-pig-tests, jar-instrumented, opennlp-model-download" description="Runs the tests with instrumented JARs">
<taskdef resource="testngtasks" classpath="${tools.lib.dir}/testng-jar-${testng.version}.jar"/>
<testng classpathref="instrumented-test-classpath" methods="${test.methods}"
outputDir="${report.dir}" haltonfailure="true" haltonskipped="true">
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/99e46e2c/ivy.xml
----------------------------------------------------------------------
diff --git a/ivy.xml b/ivy.xml
index 28a99fe..f1f90fa 100644
--- a/ivy.xml
+++ b/ivy.xml
@@ -19,10 +19,13 @@
<!-- don't include fastutil, as we will include it below and don't want it in the common directory -->
<exclude org="it.unimi.dsi" name="fastutil" />
</dependency>
- <dependency org="com.google.guava" name="guava" rev="${guava.version}" conf="packaged->default"/>
+ <dependency org="com.google.guava" name="guava" rev="${guava.version}" conf="packaged->default"/>
+ <dependency org="org.apache.opennlp" name="opennlp-tools" rev="${opennlp.version}" conf="packaged->default"/>
+ <dependency org="org.apache.opennlp" name="opennlp-uima" rev="${opennlp.version}" conf="packaged->default"/>
+ <dependency org="org.apache.opennlp" name="opennlp-maxent" rev="${opennlp-maxent.version}" conf="packaged->default"/>
<!-- hadoop and pig dependencies required for building but which are not included in the pom because
- we don't want to require a specific version -->
+we don't want to require a specific version -->
<dependency org="org.apache.pig" name="pig" rev="${pig.version}" conf="hadoop->default"/>
<dependency org="org.apache.hadoop" name="hadoop-core" rev="${hadoop.version}" conf="hadoop->default"/>
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/99e46e2c/ivy/libraries.properties
----------------------------------------------------------------------
diff --git a/ivy/libraries.properties b/ivy/libraries.properties
index d296101..55a217e 100644
--- a/ivy/libraries.properties
+++ b/ivy/libraries.properties
@@ -14,3 +14,5 @@ pig.version=0.11.1
testng.version=6.2
tools.version=1.4.2
wagon-http.version=1.0-beta-2
+opennlp.version=1.5.3
+opennlp-maxent.version=3.0.3
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/99e46e2c/src/java/datafu/pig/text/opennlp/CachedFile.java
----------------------------------------------------------------------
diff --git a/src/java/datafu/pig/text/opennlp/CachedFile.java b/src/java/datafu/pig/text/opennlp/CachedFile.java
new file mode 100644
index 0000000..5832c81
--- /dev/null
+++ b/src/java/datafu/pig/text/opennlp/CachedFile.java
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package datafu.pig.text.opennlp;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
public class CachedFile {

    /**
     * Resolves the path of a model file that may be available either through a
     * distributed-cache symlink or directly on the local filesystem.
     * <p>
     * The raw file name is preferred over the symlink; this helps with testing,
     * as the distributed cache doesn't appear to work with PigUnit.
     *
     * @param modelLink the distributed-cache symlink name
     * @param modelFile the raw model file path
     * @return whichever of the two paths exists, preferring {@code modelFile}
     * @throws IOException if neither path exists
     */
    public static String getFileName(String modelLink, String modelFile) throws IOException {
        String loadFile = modelFile;
        if (!new File(loadFile).exists()) {
            if (new File(modelLink).exists()) {
                loadFile = modelLink;
            } else {
                // Argument order matches the placeholders: the symlink is
                // modelLink and the raw file is modelFile.
                throw new IOException(String.format(
                        "Could not load model, neither symlink %s nor file %s exist",
                        modelLink, modelFile));
            }
        }
        return loadFile;
    }
}
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/99e46e2c/src/java/datafu/pig/text/opennlp/POSTag.java
----------------------------------------------------------------------
diff --git a/src/java/datafu/pig/text/opennlp/POSTag.java b/src/java/datafu/pig/text/opennlp/POSTag.java
new file mode 100644
index 0000000..fb17c63
--- /dev/null
+++ b/src/java/datafu/pig/text/opennlp/POSTag.java
@@ -0,0 +1,177 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package datafu.pig.text.opennlp;
+
+import java.io.*;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import opennlp.tools.postag.POSModel;
+import opennlp.tools.postag.POSTaggerME;
+import org.apache.pig.EvalFunc;
+import org.apache.pig.data.*;
+import org.apache.pig.impl.logicalLayer.FrontendException;
+import org.apache.pig.impl.logicalLayer.schema.Schema;
+
+/**
+ * The OpenNLP POSTag UDF tags bags of sequential words with parts of speech and confidence levels using the OpenNLP
+ * toolset, and specifically the POSTaggerME class.
+ * <p>
+ * Example:
+ * <pre>
+ * {@code
+ * define TokenizeME datafu.pig.text.opennlp.TokenizeME('data/en-token.bin');
+ * define POSTag datafu.pig.text.opennlp.POSTag('data/en-pos-maxent.bin');
+ *
+ * -- input:
+ * -- (Appetizers during happy hour range from low to high.)
+ * input = LOAD 'input' AS (text:chararray);
+ * --
+ * -- ({(Appetizers),(during),(happy),(hour),(range),(from),(low),(to),(high),(.)})
+ * tokenized = FOREACH input GENERATE TokenizeME(text) AS tokens;
+ * --
+ * -- output:
+ * -- Tuple schema is: (word, tag, confidence)
+ * -- ({(Appetizers,NNP,0.3619277937390988),(during,IN,0.7945543860326094),(happy,JJ,0.9888504792754391),
+ * -- (hour,NN,0.9427455123502427),(range,NN,0.7335527963654751),(from,IN,0.9911576465589752),(low,JJ,0.9652034031895174),
+ * -- (to,IN,0.7005347487371849),(high,JJ,0.8227771746247106),(.,.,0.9900983495480891)})
+ * output = FOREACH tokenized GENERATE POSTag(tokens) AS tagged;
+ * }
+ * </pre>
+ */
+public class POSTag extends EvalFunc<DataBag>
+{
+ private POSTaggerME tagger = null;
+ private static final String MODEL_FILE = "pos";
+ private TupleFactory tf = TupleFactory.getInstance();
+ private BagFactory bf = BagFactory.getInstance();
+ private String modelPath;
+
+ public POSTag(String modelPath) {
+ this.modelPath = modelPath;
+ }
+
+ @Override
+ public List<String> getCacheFiles() {
+ List<String> list = new ArrayList<String>(1);
+ list.add(this.modelPath + "#" + MODEL_FILE);
+ return list;
+ }
+
+ // Enable multiple languages by specifying the model path. See http://text.sourceforge.net/models-1.5/
+ public DataBag exec(Tuple input) throws IOException
+ {
+ DataBag inputBag = null;
+
+ if(input.size() != 1) {
+ throw new IOException();
+ }
+
+ inputBag = (DataBag)input.get(0);
+ DataBag outBag = bf.newDefaultBag();
+ if(this.tagger == null) {
+ String loadFile = CachedFile.getFileName(MODEL_FILE, this.modelPath);
+ InputStream modelIn = new FileInputStream(loadFile);
+ InputStream buffer = new BufferedInputStream(modelIn);
+ POSModel model = new POSModel(buffer);
+ this.tagger = new POSTaggerME(model);
+ }
+
+ // Form an inputString array thing for tagger to act on
+ int bagLength = (int)inputBag.size();
+ String[] words = new String[bagLength];
+
+ Iterator<Tuple> itr = inputBag.iterator();
+ int i = 0;
+ while(itr.hasNext()) {
+ words[i] = (String)itr.next().get(0);
+ i++;
+ }
+
+ // Compute tags and their probabilities
+ String tags[] = this.tagger.tag(words);
+ double probs[] = this.tagger.probs();
+
+ // Build output bag of 3-tuples
+ for(int j = 0; j < tags.length; j++) {
+ Tuple newTuple = tf.newTuple(3);
+ newTuple.set(0, words[j]);
+ newTuple.set(1, tags[j]);
+ newTuple.set(2, probs[j]);
+ outBag.add(newTuple);
+ }
+
+ return outBag;
+ }
+
+ @Override
+ public Schema outputSchema(Schema input)
+ {
+ try
+ {
+ Schema.FieldSchema inputFieldSchema = input.getField(0);
+
+ if (inputFieldSchema.type != DataType.BAG)
+ {
+ throw new RuntimeException("Expected a BAG as input");
+ }
+
+ Schema inputBagSchema = inputFieldSchema.schema;
+
+ if(inputBagSchema == null) {
+ return null;
+ }
+
+ if (inputBagSchema.getField(0).type != DataType.TUPLE)
+ {
+ throw new RuntimeException(String.format("Expected input bag to contain a TUPLE, but instead found %s",
+ DataType.findTypeName(inputBagSchema.getField(0).type)));
+ }
+
+ Schema inputTupleSchema = inputBagSchema.getField(0).schema;
+
+ if (inputTupleSchema.size() != 1)
+ {
+ throw new RuntimeException("Expected one field for the token data");
+ }
+
+ if (inputTupleSchema.getField(0).type != DataType.CHARARRAY)
+ {
+ throw new RuntimeException(String.format("Expected source to be a CHARARRAY, but instead found %s",
+ DataType.findTypeName(inputTupleSchema.getField(0).type)));
+ }
+
+ Schema tupleSchema = new Schema();
+ tupleSchema.add(new Schema.FieldSchema("token",DataType.CHARARRAY));
+ tupleSchema.add(new Schema.FieldSchema("tag",DataType.CHARARRAY));
+ tupleSchema.add(new Schema.FieldSchema("probability",DataType.DOUBLE));
+
+ return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass()
+ .getName()
+ .toLowerCase(), input),
+ tupleSchema,
+ DataType.BAG));
+ }
+ catch (FrontendException e)
+ {
+ throw new RuntimeException(e);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/99e46e2c/src/java/datafu/pig/text/opennlp/SentenceDetect.java
----------------------------------------------------------------------
diff --git a/src/java/datafu/pig/text/opennlp/SentenceDetect.java b/src/java/datafu/pig/text/opennlp/SentenceDetect.java
new file mode 100644
index 0000000..50537fd
--- /dev/null
+++ b/src/java/datafu/pig/text/opennlp/SentenceDetect.java
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package datafu.pig.text.opennlp;
+
+import java.io.*;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.sentdetect.SentenceDetectorME;
+import opennlp.tools.sentdetect.SentenceModel;
+import org.apache.pig.EvalFunc;
+import org.apache.pig.data.*;
+import org.apache.pig.impl.logicalLayer.FrontendException;
+import org.apache.pig.impl.logicalLayer.schema.Schema;
+
+/**
+ * The OpenNLP SentenceDectectors segment an input paragraph into sentences.
+ * <p>
+ * Example:
+ * <pre>
+ * {@code
+ * define SentenceDetect datafu.pig.text.SentenceDetect('data/en-sent.bin');
+ *
+ * -- input:
+ * -- ("I believe the Masons have infiltrated the Apache PMC. I believe laser beams control cat brains.")
+ * infoo = LOAD 'input' AS (text:chararray);
+
+ * -- output:
+ * -- ({(I believe the Masons have infiltrated the Apache PMC.)(I believe laser beams control cat brains.)})
+ * outfoo = FOREACH input GENERATE SentenceDetect(text) as sentences;
+ * }
+ * </pre>
+ */
+public class SentenceDetect extends EvalFunc<DataBag>
+{
+ private SentenceDetectorME sdetector = null;
+ private static final String MODEL_FILE = "sentences";
+ private TupleFactory tf = TupleFactory.getInstance();
+ private BagFactory bf = BagFactory.getInstance();
+ private String modelPath = null;
+
+ public SentenceDetect(String modelPath) {
+ this.modelPath = modelPath;
+ }
+
+ @Override
+ public List<String> getCacheFiles() {
+ List<String> list = new ArrayList<String>(1);
+ list.add(this.modelPath + "#" + MODEL_FILE);
+ return list;
+ }
+
+ // Enable multiple languages by specifying the model path. See http://text.sourceforge.net/models-1.5/
+ public DataBag exec(Tuple input) throws IOException
+ {
+ if(input.size() != 1) {
+ throw new IOException();
+ }
+
+ String inputString = input.get(0).toString();
+ if(inputString == null || inputString == "") {
+ return null;
+ }
+ DataBag outBag = bf.newDefaultBag();
+ if(sdetector == null) {
+ String loadFile = CachedFile.getFileName(MODEL_FILE, this.modelPath);
+ InputStream is = new FileInputStream(modelPath);
+ InputStream buffer = new BufferedInputStream(is);
+ SentenceModel model = new SentenceModel(buffer);
+ this.sdetector = new SentenceDetectorME(model);
+ }
+ String sentences[] = this.sdetector.sentDetect(inputString);
+ for(String sentence : sentences) {
+ Tuple outTuple = tf.newTuple(sentence);
+ outBag.add(outTuple);
+ }
+ return outBag;
+ }
+
+ @Override
+ public Schema outputSchema(Schema input)
+ {
+ try
+ {
+ Schema.FieldSchema inputFieldSchema = input.getField(0);
+
+ if (inputFieldSchema.type != DataType.CHARARRAY)
+ {
+ throw new RuntimeException("Expected a CHARARRAY as input, but got a " + inputFieldSchema.toString());
+ }
+
+ Schema tupleSchema = new Schema();
+ tupleSchema.add(new Schema.FieldSchema("sentence",DataType.CHARARRAY));
+
+ return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass()
+ .getName()
+ .toLowerCase(), input),
+ tupleSchema,
+ DataType.BAG));
+ }
+ catch (FrontendException e)
+ {
+ throw new RuntimeException(e);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/99e46e2c/src/java/datafu/pig/text/opennlp/TokenizeME.java
----------------------------------------------------------------------
diff --git a/src/java/datafu/pig/text/opennlp/TokenizeME.java b/src/java/datafu/pig/text/opennlp/TokenizeME.java
new file mode 100644
index 0000000..f1f4257
--- /dev/null
+++ b/src/java/datafu/pig/text/opennlp/TokenizeME.java
@@ -0,0 +1,126 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package datafu.pig.text.opennlp;
+
+import java.io.*;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.tokenize.TokenizerME;
+import opennlp.tools.tokenize.TokenizerModel;
+import org.apache.pig.EvalFunc;
+import org.apache.pig.data.*;
+import org.apache.pig.impl.logicalLayer.FrontendException;
+import org.apache.pig.impl.logicalLayer.schema.Schema;
+
+/**
+ * The OpenNLP Tokenizers segment an input character sequence into tokens using the OpenNLP TokenizeME class, which is
+ * a probabilistic, 'maximum entropy' classifier.
+ * <p>
+ * Example:
+ * <pre>
+ * {@code
+ * define TokenizeME datafu.pig.text.opennlp.TokenizeME('data/en-token.bin');
+ *
+ * -- input:
+ * -- ("I believe the Masons have infiltrated the Apache PMC.")
+ * infoo = LOAD 'input' AS (text:chararray);
+
+ * -- output:
+ * -- ({(I),(believe),(the),(Masons),(have),(infiltrated),(the),(Apache),(PMC),(.)})
+ * outfoo = FOREACH input GENERATE TokenizeME(text) as tokens;
+ * }
+ * </pre>
+ */
+
+
+
+public class TokenizeME extends EvalFunc<DataBag>
+{
+ private TokenizerME tokenizer = null;
+ private static final String MODEL_FILE = "tokens";
+ private TupleFactory tf = TupleFactory.getInstance();
+ private BagFactory bf = BagFactory.getInstance();
+ private String modelPath;
+
+ public TokenizeME(String modelPath) {
+ this.modelPath = modelPath;
+ }
+
+ @Override
+ public List<String> getCacheFiles() {
+ List<String> list = new ArrayList<String>(1);
+ list.add(this.modelPath + "#" + MODEL_FILE);
+ return list;
+ }
+
+ // Enable multiple languages by specifying the model path. See http://text.sourceforge.net/models-1.5/
+ public DataBag exec(Tuple input) throws IOException
+ {
+ if(input.size() != 1) {
+ throw new IOException();
+ }
+
+ String inputString = input.get(0).toString();
+ if(inputString == null || inputString == "") {
+ return null;
+ }
+ DataBag outBag = bf.newDefaultBag();
+ if(this.tokenizer == null) {
+ String loadFile = CachedFile.getFileName(MODEL_FILE, this.modelPath);;
+ InputStream file = new FileInputStream(loadFile);
+ InputStream buffer = new BufferedInputStream(file);
+ TokenizerModel model = new TokenizerModel(buffer);
+ this.tokenizer = new TokenizerME(model);
+ }
+ String tokens[] = this.tokenizer.tokenize(inputString);
+ for(String token : tokens) {
+ Tuple outTuple = tf.newTuple(token);
+ outBag.add(outTuple);
+ }
+ return outBag;
+ }
+
+ @Override
+ public Schema outputSchema(Schema input)
+ {
+ try
+ {
+ Schema.FieldSchema inputFieldSchema = input.getField(0);
+
+ if (inputFieldSchema.type != DataType.CHARARRAY)
+ {
+ throw new RuntimeException("Expected a CHARARRAY as input, but got a " + inputFieldSchema.toString());
+ }
+
+ Schema tupleSchema = new Schema();
+ tupleSchema.add(new Schema.FieldSchema("token",DataType.CHARARRAY));
+
+ return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass()
+ .getName()
+ .toLowerCase(), input),
+ tupleSchema,
+ DataType.BAG));
+ }
+ catch (FrontendException e)
+ {
+ throw new RuntimeException(e);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/99e46e2c/src/java/datafu/pig/text/opennlp/TokenizeSimple.java
----------------------------------------------------------------------
diff --git a/src/java/datafu/pig/text/opennlp/TokenizeSimple.java b/src/java/datafu/pig/text/opennlp/TokenizeSimple.java
new file mode 100644
index 0000000..cea48b4
--- /dev/null
+++ b/src/java/datafu/pig/text/opennlp/TokenizeSimple.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package datafu.pig.text.opennlp;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+import opennlp.tools.tokenize.SimpleTokenizer;
+import org.apache.pig.EvalFunc;
+import org.apache.pig.data.*;
+import org.apache.pig.impl.logicalLayer.FrontendException;
+import org.apache.pig.impl.logicalLayer.schema.Schema;
+
+/**
+ * The OpenNLP Tokenizers segment an input character sequence into tokens. This one uses the OpenNLP class SimpleTokenizer
+ * <p>
+ * Example:
+ * <pre>
+ * {@code
+ * define TokenizeSimple datafu.pig.text.opennlp.TokenizeSimple();
+ *
+ * -- input:
+ * -- ("I believe the Masons have infiltrated the Apache PMC.")
+ * infoo = LOAD 'input' AS (text:chararray);
+
+ * -- output:
+ * -- ({(I),(believe),(the),(Masons),(have),(infiltrated),(the),(Apache),(PMC),(.)})
+ * outfoo = FOREACH input GENERATE TokenizeSimple(text) as tokens;
+ * }
+ * </pre>
+ */
+public class TokenizeSimple extends EvalFunc<DataBag>
+{
+ private SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
+ private TupleFactory tf = TupleFactory.getInstance();
+ private BagFactory bf = BagFactory.getInstance();
+
+ public DataBag exec(Tuple input) throws IOException
+ {
+ if(input.size() != 1) {
+ throw new IOException();
+ }
+
+ String inputString = input.get(0).toString();
+ if(inputString == null || inputString == "") {
+ return null;
+ }
+
+ DataBag outBag = bf.newDefaultBag();
+ String tokens[] = tokenizer.tokenize(inputString);
+ for(String token : tokens) {
+ Tuple outTuple = tf.newTuple(token);
+ outBag.add(outTuple);
+ }
+ return outBag;
+ }
+
+ @Override
+ public Schema outputSchema(Schema input)
+ {
+ try
+ {
+ Schema.FieldSchema inputFieldSchema = input.getField(0);
+
+ if (inputFieldSchema.type != DataType.CHARARRAY)
+ {
+ throw new RuntimeException("Expected a CHARARRAY as input, but got a " + inputFieldSchema.toString());
+ }
+
+ Schema tupleSchema = new Schema();
+ tupleSchema.add(new Schema.FieldSchema("token",DataType.CHARARRAY));
+
+ return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass()
+ .getName()
+ .toLowerCase(), input),
+ tupleSchema,
+ DataType.BAG));
+ }
+ catch (FrontendException e)
+ {
+ throw new RuntimeException(e);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/99e46e2c/src/java/datafu/pig/text/opennlp/TokenizeWhitespace.java
----------------------------------------------------------------------
diff --git a/src/java/datafu/pig/text/opennlp/TokenizeWhitespace.java b/src/java/datafu/pig/text/opennlp/TokenizeWhitespace.java
new file mode 100644
index 0000000..8efafb0
--- /dev/null
+++ b/src/java/datafu/pig/text/opennlp/TokenizeWhitespace.java
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package datafu.pig.text.opennlp;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+import opennlp.tools.tokenize.WhitespaceTokenizer;
+import org.apache.pig.EvalFunc;
+import org.apache.pig.data.*;
+import org.apache.pig.impl.logicalLayer.FrontendException;
+import org.apache.pig.impl.logicalLayer.schema.Schema;
+
+/**
+ * The OpenNLP Tokenizers segment an input character sequence into tokens. This one uses the OpenNLP class
+ * WhitespaceTokenizer.
+ * <p>
+ * Example:
+ * <pre>
+ * {@code
+ * define TokenizeWhitespace datafu.pig.text.opennlp.TokenizeWhitespace();
+ *
+ * -- input:
+ * -- ("I believe the Masons have infiltrated the Apache PMC.")
+ * infoo = LOAD 'input' AS (text:chararray);
+
+ * -- output:
+ * -- ({(I),(believe),(the),(Masons),(have),(infiltrated),(the),(Apache),(PMC),(.)})
+ * outfoo = FOREACH input GENERATE TokenizeWhitespace(text) as tokens;
+ * }
+ * </pre>
+ */
+public class TokenizeWhitespace extends EvalFunc<DataBag>
+{
+ private WhitespaceTokenizer tokenizer = WhitespaceTokenizer.INSTANCE;
+ private TupleFactory tf = TupleFactory.getInstance();
+ private BagFactory bf = BagFactory.getInstance();
+
+ public DataBag exec(Tuple input) throws IOException
+ {
+ if(input.size() != 1) {
+ throw new IOException();
+ }
+
+ String inputString = input.get(0).toString();
+ if(inputString == null || inputString == "") {
+ return null;
+ }
+
+ DataBag outBag = bf.newDefaultBag();
+ String tokens[] = tokenizer.tokenize(inputString);
+ for(String token : tokens) {
+ Tuple outTuple = tf.newTuple(token);
+ outBag.add(outTuple);
+ }
+ return outBag;
+ }
+
+ @Override
+ public Schema outputSchema(Schema input)
+ {
+ try
+ {
+ Schema.FieldSchema inputFieldSchema = input.getField(0);
+
+ if (inputFieldSchema.type != DataType.CHARARRAY)
+ {
+ throw new RuntimeException("Expected a CHARARRAY as input, but got a " + inputFieldSchema.toString());
+ }
+
+ Schema tupleSchema = new Schema();
+ tupleSchema.add(new Schema.FieldSchema("token",DataType.CHARARRAY));
+
+ return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass()
+ .getName()
+ .toLowerCase(), input),
+ tupleSchema,
+ DataType.BAG));
+ }
+ catch (FrontendException e)
+ {
+ throw new RuntimeException(e);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/99e46e2c/test/pig/datafu/test/pig/text/NLPTests.java
----------------------------------------------------------------------
diff --git a/test/pig/datafu/test/pig/text/NLPTests.java b/test/pig/datafu/test/pig/text/NLPTests.java
new file mode 100644
index 0000000..372b17d
--- /dev/null
+++ b/test/pig/datafu/test/pig/text/NLPTests.java
@@ -0,0 +1,195 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.test.pig.text;
+
+import org.adrianwalker.multilinestring.Multiline;
+import org.apache.pig.pigunit.PigTest;
+import org.testng.annotations.Test;
+
+import datafu.test.pig.PigTests;
+
+
+public class NLPTests extends PigTests
+{
+ /**
+ register $JAR_PATH
+
+ define SentenceDetect datafu.pig.text.opennlp.SentenceDetect('data/en-sent.bin');
+
+ data = LOAD 'input' AS (text: chararray);
+
+ dump data;
+
+ data2 = FOREACH data GENERATE SentenceDetect(text) AS sentences;
+
+ dump data2;
+
+ STORE data2 INTO 'output';
+ */
+ @Multiline
+ private String sentenceDetectTest;
+
+ @Test
+ public void sentenceDetectTest() throws Exception
+ {
+ PigTest test = createPigTestFromString(sentenceDetectTest);
+
+ writeLinesToFile("input",
+ "This is a sentence. This is another sentence.",
+ "Yet another sentence. One more just for luck.");
+
+ assertOutput(test, "data2",
+ "({(This is a sentence.),(This is another sentence.)})",
+ "({(Yet another sentence.),(One more just for luck.)})");
+ }
+
+ /**
+ register $JAR_PATH
+
+ define TokenizeME datafu.pig.text.opennlp.TokenizeME('data/en-token.bin');
+
+ data = LOAD 'input' AS (text: chararray);
+
+ dump data;
+
+ data2 = FOREACH data GENERATE TokenizeME(text) AS tokens;
+
+ dump data2;
+
+ STORE data2 INTO 'output';
+ */
+ @Multiline
+ private String tokenizeMETest;
+
+ @Test
+ public void tokenizeMETest() throws Exception
+ {
+ PigTest test = createPigTestFromString(tokenizeMETest);
+
+ writeLinesToFile("input",
+ "This is a sentence. This is another sentence.",
+ "Yet another sentence. One more just for luck.");
+
+ assertOutput(test, "data2",
+ "({(This),(is),(a),(sentence),(.),(This),(is),(another),(sentence),(.)})",
+ "({(Yet),(another),(sentence),(.),(One),(more),(just),(for),(luck),(.)})");
+ }
+
+ /**
+ register $JAR_PATH
+
+ define TokenizeSimple datafu.pig.text.opennlp.TokenizeSimple();
+
+ data = LOAD 'input' AS (text: chararray);
+
+ dump data;
+
+ data2 = FOREACH data GENERATE TokenizeSimple(text) AS tokens;
+
+ dump data2;
+
+ STORE data2 INTO 'output';
+ */
+ @Multiline
+ private String tokenizeSimpleTest;
+
+ @Test
+ public void tokenizeSimpleTest() throws Exception
+ {
+ PigTest test = createPigTestFromString(tokenizeSimpleTest);
+
+ writeLinesToFile("input",
+ "This is a sentence. This is another sentence.",
+ "Yet another sentence. One more just for luck.");
+
+ assertOutput(test, "data2",
+ "({(This),(is),(a),(sentence),(.),(This),(is),(another),(sentence),(.)})",
+ "({(Yet),(another),(sentence),(.),(One),(more),(just),(for),(luck),(.)})");
+ }
+
+ /**
+ register $JAR_PATH
+
+ define TokenizeWhitespace datafu.pig.text.opennlp.TokenizeWhitespace();
+
+ data = LOAD 'input' AS (text: chararray);
+
+ dump data;
+
+ data2 = FOREACH data GENERATE TokenizeWhitespace(text) AS tokens;
+
+ dump data2;
+
+ STORE data2 INTO 'output';
+ */
+ @Multiline
+ private String tokenizeWhitespaceTest;
+
+ @Test
+ public void tokenizeWhitespaceTest() throws Exception
+ {
+ PigTest test = createPigTestFromString(tokenizeWhitespaceTest);
+
+ writeLinesToFile("input",
+ "This is a sentence. This is another sentence.",
+ "Yet another sentence. One more just for luck.");
+
+ assertOutput(test, "data2",
+ "({(This),(is),(a),(sentence.),(This),(is),(another),(sentence.)})",
+ "({(Yet),(another),(sentence.),(One),(more),(just),(for),(luck.)})");
+ }
+
+ /**
+ register $JAR_PATH
+
+ define TokenizeME datafu.pig.text.opennlp.TokenizeME('data/en-token.bin');
+ define POSTag datafu.pig.text.opennlp.POSTag('data/en-pos-maxent.bin');
+
+ data = LOAD 'input' AS (text: chararray);
+
+ dump data;
+
+ data2 = FOREACH data GENERATE TokenizeME(text) AS tokens;
+
+ dump data2;
+
+ data3 = FOREACH data2 GENERATE POSTag(tokens) as tagged;
+
+ dump data3
+
+ STORE data3 INTO 'output';
+ */
+ @Multiline
+ private String POSTagTest;
+
+ @Test
+ public void POSTagTest() throws Exception
+ {
+ PigTest test = createPigTestFromString(POSTagTest);
+
+ writeLinesToFile("input",
+ "This is a sentence. This is another sentence.",
+ "Yet another sentence. One more just for luck.");
+
+ assertOutput(test, "data3",
+ "({(This,DT,0.9649410482478001),(is,VBZ,0.9982592902509803),(a,DT,0.9967282012835504),(sentence,NN,0.9772619256460584),(.,.,0.4391067883074289),(This,DT,0.8346710130761914),(is,VBZ,0.9928885242823617),(another,DT,0.9761159923140399),(sentence,NN,0.9964463493238542),(.,.,0.9856037689871404)})",
+ "({(Yet,RB,0.7638997090011364),(another,DT,0.9657669183153523),(sentence,NN,0.989193114719676),(.,.,0.20091718589945456),(One,CD,0.9229251494813668),(more,JJR,0.9360382000551335),(just,RB,0.8646324491545225),(for,IN,0.9851765355889605),(luck,NN,0.9883408827371651),(.,.,0.9746378518791978)})");
+ }
+}
[4/4] git commit: Merge with master
Posted by mh...@apache.org.
Merge with master
Project: http://git-wip-us.apache.org/repos/asf/incubator-datafu/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-datafu/commit/d8cec6a5
Tree: http://git-wip-us.apache.org/repos/asf/incubator-datafu/tree/d8cec6a5
Diff: http://git-wip-us.apache.org/repos/asf/incubator-datafu/diff/d8cec6a5
Branch: refs/heads/master
Commit: d8cec6a5c84d9c0cf545c410eb8a0cf1bc427a22
Parents: 90c754e 0fda7e9
Author: Matt Hayes <mh...@linkedin.com>
Authored: Thu Jan 30 20:16:10 2014 -0800
Committer: Matt Hayes <mh...@linkedin.com>
Committed: Thu Jan 30 20:16:10 2014 -0800
----------------------------------------------------------------------
.gitignore | 4 +
.travis.yml | 1 +
README.md | 3 +-
build.xml | 16 +-
ivy.xml | 7 +-
ivy/libraries.properties | 2 +
.../datafu/pig/text/opennlp/CachedFile.java | 41 ++++
src/java/datafu/pig/text/opennlp/POSTag.java | 177 +++++++++++++++++
.../datafu/pig/text/opennlp/SentenceDetect.java | 122 ++++++++++++
.../datafu/pig/text/opennlp/TokenizeME.java | 126 ++++++++++++
.../datafu/pig/text/opennlp/TokenizeSimple.java | 101 ++++++++++
.../pig/text/opennlp/TokenizeWhitespace.java | 102 ++++++++++
test/pig/datafu/test/pig/text/NLPTests.java | 195 +++++++++++++++++++
13 files changed, 890 insertions(+), 7 deletions(-)
----------------------------------------------------------------------
[3/4] git commit: Merge with master
Posted by mh...@apache.org.
Merge with master
Project: http://git-wip-us.apache.org/repos/asf/incubator-datafu/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-datafu/commit/0fda7e95
Tree: http://git-wip-us.apache.org/repos/asf/incubator-datafu/tree/0fda7e95
Diff: http://git-wip-us.apache.org/repos/asf/incubator-datafu/diff/0fda7e95
Branch: refs/heads/master
Commit: 0fda7e952f0c06e3166b1aa0059f8e80c2e035a2
Parents: 9a42645 99e46e2
Author: Matt Hayes <mh...@linkedin.com>
Authored: Thu Jan 30 20:14:42 2014 -0800
Committer: Matt Hayes <mh...@linkedin.com>
Committed: Thu Jan 30 20:14:42 2014 -0800
----------------------------------------------------------------------
.gitignore | 4 +
.travis.yml | 1 +
build.xml | 16 +-
ivy.xml | 7 +-
ivy/libraries.properties | 2 +
.../datafu/pig/text/opennlp/CachedFile.java | 41 ++++
src/java/datafu/pig/text/opennlp/POSTag.java | 177 +++++++++++++++++
.../datafu/pig/text/opennlp/SentenceDetect.java | 122 ++++++++++++
.../datafu/pig/text/opennlp/TokenizeME.java | 126 ++++++++++++
.../datafu/pig/text/opennlp/TokenizeSimple.java | 101 ++++++++++
.../pig/text/opennlp/TokenizeWhitespace.java | 102 ++++++++++
test/pig/datafu/test/pig/text/NLPTests.java | 195 +++++++++++++++++++
12 files changed, 889 insertions(+), 5 deletions(-)
----------------------------------------------------------------------
[2/4] git commit: DATAFU-18: Create datafu.util.RandomUUID UDF
Posted by mh...@apache.org.
DATAFU-18: Create datafu.util.RandomUUID UDF
https://issues.apache.org/jira/browse/DATAFU-18
Signed-off-by: Matt Hayes <mh...@linkedin.com>
Project: http://git-wip-us.apache.org/repos/asf/incubator-datafu/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-datafu/commit/90c754e0
Tree: http://git-wip-us.apache.org/repos/asf/incubator-datafu/tree/90c754e0
Diff: http://git-wip-us.apache.org/repos/asf/incubator-datafu/diff/90c754e0
Branch: refs/heads/master
Commit: 90c754e08c997f12f0c794fc67cfa3f354793a2b
Parents: 41a0c2c
Author: Russell Jurney <ru...@gmail.com>
Authored: Wed Jan 29 15:09:21 2014 -0800
Committer: Matt Hayes <mh...@linkedin.com>
Committed: Wed Jan 29 15:09:25 2014 -0800
----------------------------------------------------------------------
src/java/datafu/pig/random/RandomUUID.java | 46 +++++++++++++
test/pig/datafu/test/pig/random/UUIDTests.java | 73 +++++++++++++++++++++
2 files changed, 119 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/90c754e0/src/java/datafu/pig/random/RandomUUID.java
----------------------------------------------------------------------
diff --git a/src/java/datafu/pig/random/RandomUUID.java b/src/java/datafu/pig/random/RandomUUID.java
new file mode 100644
index 0000000..d63f4cf
--- /dev/null
+++ b/src/java/datafu/pig/random/RandomUUID.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package datafu.pig.random;
+
+import java.io.IOException;
+import java.util.UUID;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.builtin.Nondeterministic;
+import org.apache.pig.data.*;
+import org.apache.pig.impl.logicalLayer.schema.Schema;
+
+/**
+ * Generates a random UUID using java.util.UUID
+ */
+@Nondeterministic
+public class RandomUUID extends EvalFunc<String>
+{
+ public String exec(Tuple input) throws IOException
+ {
+ return UUID.randomUUID().toString();
+ }
+
+ @Override
+ public Schema outputSchema(Schema input)
+ {
+ return new Schema(new Schema.FieldSchema("uuid", DataType.CHARARRAY));
+ }
+}
+
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/90c754e0/test/pig/datafu/test/pig/random/UUIDTests.java
----------------------------------------------------------------------
diff --git a/test/pig/datafu/test/pig/random/UUIDTests.java b/test/pig/datafu/test/pig/random/UUIDTests.java
new file mode 100644
index 0000000..e199760
--- /dev/null
+++ b/test/pig/datafu/test/pig/random/UUIDTests.java
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package datafu.test.pig.random;
+
+import datafu.test.pig.PigTests;
+import junit.framework.Assert;
+import org.adrianwalker.multilinestring.Multiline;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.pigunit.PigTest;
+import org.testng.annotations.Test;
+
+import java.util.*;
+
+import static org.testng.Assert.assertTrue;
+
+public class UUIDTests extends PigTests
+{
+ /**
+ register $JAR_PATH
+
+ define RandomUUID datafu.pig.random.RandomUUID();
+
+ data = LOAD 'input' AS (key: chararray);
+ DUMP data
+
+ data2 = FOREACH data GENERATE key, RandomUUID() as val;
+ DUMP data2
+
+ STORE data2 INTO 'output';
+ */
+ @Multiline private String randomUUIDTest;
+
+ /**
+ * Test the RandomUUID UDF. The main purpose is to make sure it can be used in a Pig script.
+ * Also the range of length of values is tested.
+ *
+ * @throws Exception
+ */
+ @Test
+ public void randomUUIDTest() throws Exception
+ {
+ PigTest test = createPigTestFromString(randomUUIDTest);
+
+ writeLinesToFile("input",
+ "input1",
+ "input2",
+ "input3");
+
+ List<Tuple> tuples = getLinesForAlias(test, "data2", true);
+ Set<UUID> set = new HashSet<UUID>();
+ for (Tuple tuple : tuples)
+ {
+ set.add(UUID.fromString((String)tuple.get(1)));
+ }
+ Assert.assertEquals(set.size(), 3);
+ }
+}