You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datafu.apache.org by ey...@apache.org on 2017/09/14 19:33:38 UTC

incubator-datafu git commit: DATAFU-61 Add TF-IDF Macro to DataFu

Repository: incubator-datafu
Updated Branches:
  refs/heads/master d92e84395 -> 4f8c4e85b


DATAFU-61 Add TF-IDF Macro to DataFu

Signed-off-by: Eyal Allweil <ey...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/incubator-datafu/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-datafu/commit/4f8c4e85
Tree: http://git-wip-us.apache.org/repos/asf/incubator-datafu/tree/4f8c4e85
Diff: http://git-wip-us.apache.org/repos/asf/incubator-datafu/diff/4f8c4e85

Branch: refs/heads/master
Commit: 4f8c4e85bfd2c415a335a97c1b495ee596512061
Parents: d92e843
Author: Russell Jurney <ru...@gmail.com>
Authored: Sun Aug 6 16:50:40 2017 +0300
Committer: Eyal Allweil <ey...@apache.org>
Committed: Thu Sep 14 22:32:43 2017 +0300

----------------------------------------------------------------------
 datafu-pig/src/main/resources/datafu/tf_idf.pig | 94 ++++++++++++++++++++
 .../java/datafu/test/pig/stats/TFIDFTests.java  | 64 +++++++++++++
 2 files changed, 158 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/4f8c4e85/datafu-pig/src/main/resources/datafu/tf_idf.pig
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/resources/datafu/tf_idf.pig b/datafu-pig/src/main/resources/datafu/tf_idf.pig
new file mode 100644
index 0000000..5ee0a1b
--- /dev/null
+++ b/datafu-pig/src/main/resources/datafu/tf_idf.pig
@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Given a set of documents, returns tf-idf feature vectors for those documents.
+ * Weight is the augmented term frequency (0.5 + 0.5 * tf / max tf in the document) times ln(N / df).
+ * documents:   { id, text:chararray }  Document set.
+ * maxFeatures: int                     Maximum number of features to return per document
+ * ==>
+ * vectors: { id, features:{(token:chararray, weight:double)} } Ordered by weight desc.
+ */
+define DataFu_NlpTFIDF(documents, maxFeatures) returns vectors {
+
+  define TokenizeSimple datafu.pig.text.opennlp.TokenizeSimple();
+
+  --
+  -- Get corpus size first: N is the number of distinct document ids
+  --
+  uniq     = distinct (foreach $documents generate id);
+  num_docs = foreach (group uniq all) generate COUNT(uniq) as N; -- single-row relation, read below as the scalar num_docs.N
+
+  --
+  -- Tokenize the documents (presumably one output row per token occurrence; depends on TokenizeSimple)
+  --
+  tokenized = foreach $documents generate
+                id,
+                flatten(TokenizeSimple(text)) as (token:chararray);
+
+  --
+  -- Next, get raw term frequencies (rows per id and token). Combiners will be made use of here
+  -- to reduce some of the token explosion
+  --
+  term_freqs = foreach (group tokenized by (id, token)) generate
+                 flatten(group)   as (id, token),
+                 COUNT(tokenized) as term_freq;
+
+  --
+  -- Now, compute the 'augmented' frequency to prevent bias toward long docs
+  --
+  max_term_freqs = foreach (group term_freqs by id) generate
+                     flatten(term_freqs)       as (id, token, term_freq),
+                     MAX(term_freqs.term_freq) as max_term_freq; -- highest raw count of any token in this document
+
+  aug_term_freqs = foreach max_term_freqs {
+                     -- see: http://www.cs.odu.edu/~jbollen/IR04/readings/article1-29-03.pdf
+                     aug_freq = 0.5f + (0.5f * term_freq)/max_term_freq; -- 1.0 for the most frequent token of a document
+                     generate
+                       id       as id,
+                       token    as token,
+                       aug_freq as term_freq;
+                    };
+
+  --
+  -- Next, get document frequency; how many documents does a term appear in.
+  --
+  doc_freqs = foreach (group aug_term_freqs by token) {
+                raw_doc_freq = COUNT(aug_term_freqs); -- one row per (id, token) upstream, so this counts documents
+                idf          = LOG((float)num_docs.N/(float)raw_doc_freq); -- 0 when the token occurs in every document
+                generate
+                  flatten(aug_term_freqs) as (id, token, term_freq),
+                  idf                     as idf;
+              };
+
+  --
+  -- Finally, compute tf-idf: augmented term frequency times idf
+  --
+  weights = foreach doc_freqs generate
+              id            as id,
+              token         as token,
+              term_freq*idf as weight;
+
+  $vectors = foreach (group weights by id) {
+               ordered = order weights by weight desc;
+               top_N   = limit ordered $maxFeatures; -- use this instead of top to maintain ordering
+               generate
+                 group                as id,
+                 top_N.(token,weight) as features;
+             };
+};
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/4f8c4e85/datafu-pig/src/test/java/datafu/test/pig/stats/TFIDFTests.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/test/java/datafu/test/pig/stats/TFIDFTests.java b/datafu-pig/src/test/java/datafu/test/pig/stats/TFIDFTests.java
new file mode 100644
index 0000000..9b46ff4
--- /dev/null
+++ b/datafu-pig/src/test/java/datafu/test/pig/stats/TFIDFTests.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.test.pig.stats;
+
+import org.adrianwalker.multilinestring.Multiline;
+import org.apache.pig.pigunit.PigTest;
+import org.testng.annotations.Test;
+
+import datafu.test.pig.PigTests;
+
+public class TFIDFTests extends PigTests
+
+{
+  /**
+
+  import 'datafu/tf_idf.pig';
+
+  raw_documents = LOAD 'input' AS (id:chararray, text:chararray);
+
+  -- Compute topics per document via Macro
+  vectors = DataFu_NlpTFIDF(raw_documents, 100);
+
+  STORE vectors INTO 'output';
+
+   */
+  @Multiline
+  private String tfidfTest;
+
+  @Test
+  public void simpleTFIDFTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(tfidfTest);
+
+    writeLinesToFile(	"input",
+						"text\tthis is a a sample",
+						"text2\tthis is another another example example example");
+
+    test.runScript();
+
+    String expected[] = {
+    		"(text,{(a,0.6931471805599453),(sample,0.5198603854199589),(this,0.0),(is,0.0)})",
+    		"(text2,{(example,0.6931471805599453),(another,0.5776226780098154),(this,0.0),(is,0.0)})"};
+    
+    assertOutput(test, "vectors", expected);
+  }
+
+}