You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datafu.apache.org by ey...@apache.org on 2017/09/14 19:33:38 UTC
incubator-datafu git commit: DATAFU-61 Add TF-IDF Macro to DataFu
Repository: incubator-datafu
Updated Branches:
refs/heads/master d92e84395 -> 4f8c4e85b
DATAFU-61 Add TF-IDF Macro to DataFu
Signed-off-by: Eyal Allweil <ey...@apache.org>
Project: http://git-wip-us.apache.org/repos/asf/incubator-datafu/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-datafu/commit/4f8c4e85
Tree: http://git-wip-us.apache.org/repos/asf/incubator-datafu/tree/4f8c4e85
Diff: http://git-wip-us.apache.org/repos/asf/incubator-datafu/diff/4f8c4e85
Branch: refs/heads/master
Commit: 4f8c4e85bfd2c415a335a97c1b495ee596512061
Parents: d92e843
Author: Russell Jurney <ru...@gmail.com>
Authored: Sun Aug 6 16:50:40 2017 +0300
Committer: Eyal Allweil <ey...@apache.org>
Committed: Thu Sep 14 22:32:43 2017 +0300
----------------------------------------------------------------------
datafu-pig/src/main/resources/datafu/tf_idf.pig | 94 ++++++++++++++++++++
.../java/datafu/test/pig/stats/TFIDFTests.java | 64 +++++++++++++
2 files changed, 158 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/4f8c4e85/datafu-pig/src/main/resources/datafu/tf_idf.pig
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/resources/datafu/tf_idf.pig b/datafu-pig/src/main/resources/datafu/tf_idf.pig
new file mode 100644
index 0000000..5ee0a1b
--- /dev/null
+++ b/datafu-pig/src/main/resources/datafu/tf_idf.pig
@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Given a set of documents, returns tf-idf feature vectors for those documents.
+ * Each weight is (0.5 + 0.5 * tf / max tf in the document) * LOG(N / document frequency).
+ * documents: { id, text:chararray } Document set.
+ * maxFeatures: int Maximum number of features to return per document
+ * ==>
+ * vectors: { id, features:{(token:chararray, weight:double)} } Ordered by weight desc.
+ */
+define DataFu_NlpTFIDF(documents, maxFeatures) returns vectors {
+
+ define TokenizeSimple datafu.pig.text.opennlp.TokenizeSimple();
+
+ --
+ -- Get corpus size first: N is the number of distinct document ids
+ --
+ uniq = distinct (foreach $documents generate id);
+ num_docs = foreach (group uniq all) generate COUNT(uniq) as N; -- one-row relation, read later as the scalar num_docs.N
+
+ --
+ -- Tokenize the documents; flatten emits one (id, token) row per element TokenizeSimple returns
+ --
+ tokenized = foreach $documents generate
+ id,
+ flatten(TokenizeSimple(text)) as (token:chararray);
+
+ --
+ -- Next, get raw term frequencies: how many times each token occurs in each document.
+ -- Combiners will be used here to reduce some of the token explosion
+ --
+ term_freqs = foreach (group tokenized by (id, token)) generate
+ flatten(group) as (id, token),
+ COUNT(tokenized) as term_freq;
+
+ --
+ -- Now, compute the 'augmented' frequency to prevent bias toward long docs
+ -- (0.5 + 0.5 * term_freq / max_term_freq, where the max is taken per document id)
+ max_term_freqs = foreach (group term_freqs by id) generate
+ flatten(term_freqs) as (id, token, term_freq),
+ MAX(term_freqs.term_freq) as max_term_freq;
+
+ aug_term_freqs = foreach max_term_freqs {
+ -- see: http://www.cs.odu.edu/~jbollen/IR04/readings/article1-29-03.pdf
+ aug_freq = 0.5f + (0.5f * term_freq)/max_term_freq;
+ generate
+ id as id,
+ token as token,
+ aug_freq as term_freq;
+ };
+
+ --
+ -- Next, get document frequency; how many documents does a term appear in.
+ -- aug_term_freqs holds one row per (id, token), so COUNT per token gives that number.
+ doc_freqs = foreach (group aug_term_freqs by token) {
+ raw_doc_freq = COUNT(aug_term_freqs);
+ idf = LOG((float)num_docs.N/(float)raw_doc_freq);
+ generate
+ flatten(aug_term_freqs) as (id, token, term_freq),
+ idf as idf;
+ };
+
+ --
+ -- Finally, compute tf-idf: weight = augmented term frequency * idf
+ --
+ weights = foreach doc_freqs generate
+ id as id,
+ token as token,
+ term_freq*idf as weight;
+
+ $vectors = foreach (group weights by id) {
+ ordered = order weights by weight desc;
+ top_N = limit ordered $maxFeatures; -- use this instead of top to maintain ordering
+ generate
+ group as id,
+ top_N.(token,weight) as features;
+ };
+};
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/4f8c4e85/datafu-pig/src/test/java/datafu/test/pig/stats/TFIDFTests.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/test/java/datafu/test/pig/stats/TFIDFTests.java b/datafu-pig/src/test/java/datafu/test/pig/stats/TFIDFTests.java
new file mode 100644
index 0000000..9b46ff4
--- /dev/null
+++ b/datafu-pig/src/test/java/datafu/test/pig/stats/TFIDFTests.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.test.pig.stats;
+
+import org.adrianwalker.multilinestring.Multiline;
+import org.apache.pig.pigunit.PigTest;
+import org.testng.annotations.Test;
+
+import datafu.test.pig.PigTests;
+
+/** End-to-end check of the DataFu_NlpTFIDF macro imported from datafu/tf_idf.pig. */
+public class TFIDFTests extends PigTests
+{
+ /**
+
+ import 'datafu/tf_idf.pig';
+
+ raw_documents = LOAD 'input' AS (id:chararray, text:chararray);
+
+ -- Compute topics per document via Macro
+ vectors = DataFu_NlpTFIDF(raw_documents, 100);
+
+ STORE vectors INTO 'output';
+
+ */
+ @Multiline
+ private String tfidfTest;
+
+ @Test
+ public void simpleTFIDFTest() throws Exception
+ {
+ PigTest pigTest = createPigTestFromString(tfidfTest);
+ // Each input line is a document id and its text, separated by a tab.
+ writeLinesToFile("input",
+ "text\tthis is a a sample",
+ "text2\tthis is another another example example example");
+ // One expected tuple per document: (id, {(token, weight), ...}).
+ final String[] expectedVectors = {
+ "(text,{(a,0.6931471805599453),(sample,0.5198603854199589),(this,0.0),(is,0.0)})",
+ "(text2,{(example,0.6931471805599453),(another,0.5776226780098154),(this,0.0),(is,0.0)})"
+ };
+
+ pigTest.runScript();
+ assertOutput(pigTest, "vectors", expectedVectors);
+ }
+
+}