Posted to commits@mahout.apache.org by ak...@apache.org on 2015/03/30 18:34:02 UTC

[1/3] mahout git commit: extended TFPartialVectorReducer.java to handle multiple text blocks of one document

Repository: mahout
Updated Branches:
  refs/heads/master 4b1c13332 -> 91c1626df


extended TFPartialVectorReducer.java to handle multiple text blocks of one document


Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/ccaec1b2
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/ccaec1b2
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/ccaec1b2

Branch: refs/heads/master
Commit: ccaec1b267d768e890695dd36c49a34d621e1e73
Parents: c9d978a
Author: wobu <bu...@googlemail.com>
Authored: Fri Jul 25 13:01:10 2014 +0200
Committer: wobu <bu...@googlemail.com>
Committed: Fri Jul 25 13:01:10 2014 +0200

----------------------------------------------------------------------
 .../vectorizer/term/TFPartialVectorReducer.java | 16 ++++--
 .../vectorizer/DictionaryVectorizerTest.java    | 60 +++++++++++++++++---
 2 files changed, 65 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mahout/blob/ccaec1b2/mrlegacy/src/main/java/org/apache/mahout/vectorizer/term/TFPartialVectorReducer.java
----------------------------------------------------------------------
diff --git a/mrlegacy/src/main/java/org/apache/mahout/vectorizer/term/TFPartialVectorReducer.java b/mrlegacy/src/main/java/org/apache/mahout/vectorizer/term/TFPartialVectorReducer.java
index e8b24e1..53246ef 100644
--- a/mrlegacy/src/main/java/org/apache/mahout/vectorizer/term/TFPartialVectorReducer.java
+++ b/mrlegacy/src/main/java/org/apache/mahout/vectorizer/term/TFPartialVectorReducer.java
@@ -17,6 +17,7 @@
 
 package org.apache.mahout.vectorizer.term;
 
+import com.google.common.collect.Lists;
 import com.google.common.io.Closeables;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.filecache.DistributedCache;
@@ -44,6 +45,7 @@ import org.apache.mahout.vectorizer.common.PartialVectorMerger;
 import java.io.IOException;
 import java.net.URI;
 import java.util.Iterator;
+import java.util.List;
 
 /**
  * Converts a document into a sparse vector
@@ -61,15 +63,21 @@ public class TFPartialVectorReducer extends Reducer<Text, StringTuple, Text, Vec
   protected void reduce(Text key, Iterable<StringTuple> values, Context context)
     throws IOException, InterruptedException {
     Iterator<StringTuple> it = values.iterator();
+
     if (!it.hasNext()) {
       return;
     }
-    StringTuple value = it.next();
 
-    Vector vector = new RandomAccessSparseVector(dimension, value.length()); // guess at initial size
+    List<String> value = Lists.newArrayList();
+
+    while (it.hasNext()) {
+      value.addAll(it.next().getEntries());
+    }
+
+    Vector vector = new RandomAccessSparseVector(dimension, value.size()); // guess at initial size
 
     if (maxNGramSize >= 2) {
-      ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()), maxNGramSize);
+      ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.iterator()), maxNGramSize);
       sf.reset();
       try {
         do {
@@ -85,7 +93,7 @@ public class TFPartialVectorReducer extends Reducer<Text, StringTuple, Text, Vec
         Closeables.close(sf, true);
       }
     } else {
-      for (String term : value.getEntries()) {
+      for (String term : value) {
         if (!term.isEmpty() && dictionary.containsKey(term)) { // unigram
           int termId = dictionary.get(term);
           vector.setQuick(termId, vector.getQuick(termId) + 1);
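
For readers skimming the diff: the net effect is that reduce() no longer builds the term-frequency vector from only the first StringTuple of a key, but first concatenates the entries of every StringTuple emitted for that document. A minimal sketch of the resulting unigram path, using only names visible in the diff above (parameter types simplified; this is an illustration, not the committed code):

    // Sketch: merge all text blocks of one document before counting term frequencies.
    // Parameter names mirror the reducer's inputs/fields shown in the diff above;
    // the dictionary type is simplified to a plain Map for illustration.
    static Vector termFrequencies(Iterable<StringTuple> values,
                                  int dimension,
                                  Map<String, Integer> dictionary) {
      List<String> terms = Lists.newArrayList();
      for (StringTuple tuple : values) {        // one StringTuple per text block
        terms.addAll(tuple.getEntries());
      }
      Vector vector = new RandomAccessSparseVector(dimension, terms.size()); // initial size guess
      for (String term : terms) {
        if (!term.isEmpty() && dictionary.containsKey(term)) {   // unigram path only
          int termId = dictionary.get(term);
          vector.setQuick(termId, vector.getQuick(termId) + 1);
        }
      }
      return vector;
    }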

http://git-wip-us.apache.org/repos/asf/mahout/blob/ccaec1b2/mrlegacy/src/test/java/org/apache/mahout/vectorizer/DictionaryVectorizerTest.java
----------------------------------------------------------------------
diff --git a/mrlegacy/src/test/java/org/apache/mahout/vectorizer/DictionaryVectorizerTest.java b/mrlegacy/src/test/java/org/apache/mahout/vectorizer/DictionaryVectorizerTest.java
index edcc79b..835854f 100644
--- a/mrlegacy/src/test/java/org/apache/mahout/vectorizer/DictionaryVectorizerTest.java
+++ b/mrlegacy/src/test/java/org/apache/mahout/vectorizer/DictionaryVectorizerTest.java
@@ -18,14 +18,17 @@
 package org.apache.mahout.vectorizer;
 
 import java.io.IOException;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
 
-import com.carrotsearch.randomizedtesting.annotations.ThreadLeakLingering;
 import com.carrotsearch.randomizedtesting.annotations.ThreadLeakScope;
 import com.google.common.io.Closeables;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
+import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Text;
 import org.apache.lucene.analysis.Analyzer;
@@ -34,6 +37,7 @@ import org.apache.mahout.common.MahoutTestCase;
 import org.apache.mahout.common.Pair;
 import org.apache.mahout.common.iterator.sequencefile.PathFilters;
 import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
 import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
 import org.apache.mahout.math.NamedVector;
 import org.apache.mahout.math.RandomAccessSparseVector;
@@ -51,6 +55,7 @@ import org.junit.Test;
 public final class DictionaryVectorizerTest extends MahoutTestCase {
 
   private static final int NUM_DOCS = 100;
+  private static final String SECOND_TEXT_BLOCK_IDENTIFIER = "2NDBLOCK";
 
   private Path inputPath;
   
@@ -69,13 +74,18 @@ public final class DictionaryVectorizerTest extends MahoutTestCase {
       RandomDocumentGenerator gen = new RandomDocumentGenerator();
 
       for (int i = 0; i < NUM_DOCS; i++) {
-        writer.append(new Text("Document::ID::" + i), new Text(gen.getRandomDocument()));
+        writer.append(
+                new Text("Document::ID::" + i),
+                new Text(gen.getRandomDocument()));
+        writer.append(
+                new Text("Document::ID::" + i),
+                new Text(SECOND_TEXT_BLOCK_IDENTIFIER));
       }
     } finally {
       Closeables.close(writer, false);
     }
   }
-  
+
   @Test
   public void testCreateTermFrequencyVectors() throws Exception {
     runTest(false, false);
@@ -85,7 +95,7 @@ public final class DictionaryVectorizerTest extends MahoutTestCase {
   public void testCreateTermFrequencyVectorsNam() throws Exception {
     runTest(false, true);
   }
-  
+
   @Test
   public void testCreateTermFrequencyVectorsSeq() throws Exception {
     runTest(true, false);
@@ -106,7 +116,7 @@ public final class DictionaryVectorizerTest extends MahoutTestCase {
     Path tfVectors = new Path(wordCount, "tf-vectors");
     Path tfidf = getTestTempDirPath("output/tfidf");
     Path tfidfVectors = new Path(tfidf, "tfidf-vectors");
-    
+
     Configuration conf = getConfiguration();
     DocumentProcessor.tokenizeDocuments(inputPath, analyzer, tokenizedDocuments, conf);
     
@@ -123,7 +133,7 @@ public final class DictionaryVectorizerTest extends MahoutTestCase {
                                                     100,
                                                     sequential,
                                                     named);
-    
+
     validateVectors(conf, NUM_DOCS, tfVectors, sequential, named);
     
     Pair<Long[], List<Path>> docFrequenciesFeatures = TFIDFConverter.calculateDF(tfVectors, 
@@ -143,6 +153,10 @@ public final class DictionaryVectorizerTest extends MahoutTestCase {
     
     
     validateVectors(conf, NUM_DOCS, tfidfVectors, sequential, named);
+
+    Integer secondTextBlockIdentifierDimensionId = validateDictionary(wordCount, conf);
+
+    validateVectorContainingSecondTextBlock(conf, tfVectors, secondTextBlockIdentifierDimensionId);
   }
   
   public static void validateVectors(Configuration conf,
@@ -166,9 +180,41 @@ public final class DictionaryVectorizerTest extends MahoutTestCase {
       } else {
         assertTrue("Expected RandomAccessSparseVector", v instanceof RandomAccessSparseVector);
       }
+    }
+
+    assertEquals("Expected " + numDocs + " documents", numDocs, count);
+  }
+
+  private Integer validateDictionary(Path dictionaryDirectoryPath, Configuration conf) {
+    PathFilter dictionaryChunkPathFilter = new PathFilter() {
+      @Override
+      public boolean accept(Path path) {
+        String name = path.getName();
+        return name.startsWith("dictionary.file");
+      }
+    };
+
+    Map<String, Integer> dictionary = new HashMap<String, Integer>();
 
+    for (Pair<Text, IntWritable> value :
+            new SequenceFileDirIterable<Text, IntWritable>(
+                    dictionaryDirectoryPath, PathType.LIST, dictionaryChunkPathFilter, null, true, conf)) {
+      dictionary.put(value.getFirst().toString(), value.getSecond().get());
     }
 
-  assertEquals("Expected " + numDocs + " documents", numDocs, count);
+    Integer secondTextBlockIdentifierDimensionId = dictionary.get(SECOND_TEXT_BLOCK_IDENTIFIER.toLowerCase());
+
+    assertNotNull("Token '" + SECOND_TEXT_BLOCK_IDENTIFIER + "' must be in dictionary ", secondTextBlockIdentifierDimensionId);
+    assertTrue("Dictionary must contain more than just 1 element!", dictionary.size() > 1);
+
+    return secondTextBlockIdentifierDimensionId;
+  }
+
+  public static void validateVectorContainingSecondTextBlock(Configuration conf, Path vectorPath, int dimensionId) {
+    for (VectorWritable value :
+            new SequenceFileDirValueIterable<VectorWritable>(
+                    vectorPath, PathType.LIST, PathFilters.partFilter(), null, true, conf)) {
+      assertTrue("The vector must contain the second text block", value.get().get(dimensionId) > 0);
+    }
   }
 }
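
From the caller's side, the test above illustrates the input convention this change enables: a document split across several text blocks is simply written as repeated SequenceFile entries under the same key before DocumentProcessor.tokenizeDocuments / seq2sparse runs. A minimal sketch, assuming a SequenceFile.Writer already opened for Text keys and Text values as in the test's setUp():

    // Sketch: one logical document supplied as two text blocks under the same key.
    Text docId = new Text("Document::ID::0");
    writer.append(docId, new Text("first text block of the document"));
    writer.append(docId, new Text("second text block of the same document"));
    // After this commit, both blocks contribute to the single TF vector for Document::ID::0.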


[2/3] mahout git commit: Merge branch 'mahout-1598' of https://github.com/wobu/mahout

Posted by ak...@apache.org.
Merge branch 'mahout-1598' of https://github.com/wobu/mahout


Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/1f17d23f
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/1f17d23f
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/1f17d23f

Branch: refs/heads/master
Commit: 1f17d23f6095e67a2f8d188f4817e286cf916e98
Parents: 4b1c133 ccaec1b
Author: Andrew Musselman <ak...@apache.org>
Authored: Mon Mar 30 08:46:53 2015 -0700
Committer: Andrew Musselman <ak...@apache.org>
Committed: Mon Mar 30 08:46:53 2015 -0700

----------------------------------------------------------------------
 .../vectorizer/term/TFPartialVectorReducer.java | 16 ++++--
 .../vectorizer/DictionaryVectorizerTest.java    | 60 +++++++++++++++++---
 2 files changed, 65 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mahout/blob/1f17d23f/mrlegacy/src/main/java/org/apache/mahout/vectorizer/term/TFPartialVectorReducer.java
----------------------------------------------------------------------


[3/3] mahout git commit: This closes #34

Posted by ak...@apache.org.
This closes #34


Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/91c1626d
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/91c1626d
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/91c1626d

Branch: refs/heads/master
Commit: 91c1626df337900051ead7019f70bd15fe93f6a7
Parents: 1f17d23
Author: Andrew Musselman <ak...@apache.org>
Authored: Mon Mar 30 09:33:35 2015 -0700
Committer: Andrew Musselman <ak...@apache.org>
Committed: Mon Mar 30 09:33:35 2015 -0700

----------------------------------------------------------------------
 CHANGELOG | 2 ++
 1 file changed, 2 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mahout/blob/91c1626d/CHANGELOG
----------------------------------------------------------------------
diff --git a/CHANGELOG b/CHANGELOG
index 7b47a0f..2660638 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -2,6 +2,8 @@ Mahout Change Log
 
 Release 0.10.0 - unreleased
 
+  MAHOUT-1598: extend seq2sparse to handle multiple text blocks of same document (Wolfgang Buchnere via akm)
+
   MAHOUT-1659: Remove deprecated Lanczos solver from spectral clustering in mr-legacy (Shannon Quinn)
 
   MAHOUT-1612: NullPointerException happens during JSON output format for clusterdumper (smarthi, Manoj Awasthi)