You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by gs...@apache.org on 2013/06/06 23:57:50 UTC
svn commit: r1490457 - in /mahout/trunk: ./ integration/src/main/java/org/apache/mahout/text/ integration/src/test/java/org/apache/mahout/text/

Author: gsingers
Date: Thu Jun  6 21:57:50 2013
New Revision: 1490457

URL: http://svn.apache.org/r1490457
Log:
MAHOUT-944: fix the things that should have been committed the first time

Modified:
    mahout/trunk/CHANGELOG
    mahout/trunk/integration/src/main/java/org/apache/mahout/text/LuceneIndexFileNameFilter.java
    mahout/trunk/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputFormat.java
    mahout/trunk/integration/src/main/java/org/apache/mahout/text/LuceneSegmentRecordReader.java
    mahout/trunk/integration/src/main/java/org/apache/mahout/text/LuceneStorageConfiguration.java
    mahout/trunk/integration/src/main/java/org/apache/mahout/text/ReadOnlyFileSystemDirectory.java
    mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorage.java
    mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriver.java
    mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageMRJob.java
    mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageMapper.java
    mahout/trunk/integration/src/test/java/org/apache/mahout/text/AbstractLuceneStorageTest.java
    mahout/trunk/integration/src/test/java/org/apache/mahout/text/LuceneSegmentInputFormatTest.java
    mahout/trunk/integration/src/test/java/org/apache/mahout/text/LuceneSegmentInputSplitTest.java
    mahout/trunk/integration/src/test/java/org/apache/mahout/text/LuceneSegmentRecordReaderTest.java
    mahout/trunk/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriverTest.java
    mahout/trunk/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageMRJobTest.java
    mahout/trunk/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageTest.java

Modified: mahout/trunk/CHANGELOG
URL: http://svn.apache.org/viewvc/mahout/trunk/CHANGELOG?rev=1490457&r1=1490456&r2=1490457&view=diff
==============================================================================
--- mahout/trunk/CHANGELOG (original)
+++ mahout/trunk/CHANGELOG Thu Jun  6 21:57:50 2013
@@ -85,4 +85,6 @@ __MAHOUT-1181: Adding StreamingKMeans Ma
 
   MAHOUT-1108: Allows cluster-reuters.sh example to be executed on a cluster (elmer.garduno via gsingers) 
 
-  MAHOUT-961: Fix issue in decision forest tree visualizer to properly show stems of tree (Ikumasa Mukai via gsingers)
\ No newline at end of file
+  MAHOUT-961: Fix issue in decision forest tree visualizer to properly show stems of tree (Ikumasa Mukai via gsingers)
+
+  MAHOUT-944: Create SequenceFiles out of Lucene document storage (no term vectors required) (Frank Scholten, gsingers)
\ No newline at end of file

Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/text/LuceneIndexFileNameFilter.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/text/LuceneIndexFileNameFilter.java?rev=1490457&r1=1490456&r2=1490457&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/text/LuceneIndexFileNameFilter.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/text/LuceneIndexFileNameFilter.java Thu Jun  6 21:57:50 2013
@@ -1,11 +1,11 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
+package org.apache.mahout.text;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
@@ -16,13 +16,12 @@
  * limitations under the License.
  */
 
-package org.apache.mahout.text;
-
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.PathFilter;
-import org.apache.lucene.index.IndexFileNameFilter;
 import org.apache.lucene.index.IndexFileNames;
 
+import java.util.regex.Pattern;
+
 /**
  * A wrapper class to convert an IndexFileNameFilter which implements
  * java.io.FilenameFilter to an org.apache.hadoop.fs.PathFilter.
@@ -41,18 +40,26 @@ class LuceneIndexFileNameFilter implemen
     return singleton;
   }
 
-  //nocommit Not sure what the alternative is here
-  private final IndexFileNameFilter luceneFilter;
-
   private LuceneIndexFileNameFilter() {
-    luceneFilter = IndexFileNames.getFilter();
   }
 
+  //TODO: Lucene defines this in IndexFileNames, but it is package private, so make sure it doesn't change w/ new releases.
+  private static final Pattern CODEC_FILE_PATTERN = Pattern.compile("_[a-z0-9]+(_.*)?\\..*");
+
   /* (non-Javadoc)
   * @see org.apache.hadoop.fs.PathFilter#accept(org.apache.hadoop.fs.Path)
   */
   public boolean accept(Path path) {
-    return luceneFilter.accept(null, path.getName());
+    String name = path.getName();
+    if (CODEC_FILE_PATTERN.matcher(name).matches() || name.startsWith(IndexFileNames.SEGMENTS)) {
+      return true;
+    }
+    for (String extension : IndexFileNames.INDEX_EXTENSIONS) {
+      if (name.endsWith(extension)) {
+        return true;
+      }
+    }
+    return false;
   }
 
 }

Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputFormat.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputFormat.java?rev=1490457&r1=1490456&r2=1490457&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputFormat.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputFormat.java Thu Jun  6 21:57:50 2013
@@ -1,4 +1,20 @@
 package org.apache.mahout.text;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;

Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/text/LuceneSegmentRecordReader.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/text/LuceneSegmentRecordReader.java?rev=1490457&r1=1490456&r2=1490457&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/text/LuceneSegmentRecordReader.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/text/LuceneSegmentRecordReader.java Thu Jun  6 21:57:50 2013
@@ -1,4 +1,20 @@
 package org.apache.mahout.text;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.NullWritable;
@@ -39,12 +55,12 @@ public class LuceneSegmentRecordReader e
     LuceneStorageConfiguration lucene2SeqConfiguration = new LuceneStorageConfiguration(configuration);
 
     SegmentInfoPerCommit segmentInfo = inputSplit.getSegment(configuration);
-    segmentReader = new SegmentReader(segmentInfo, USE_TERM_INFOS, IOContext.READ);//nocommit: Should we use IOContext.READONCE?
+    segmentReader = new SegmentReader(segmentInfo, USE_TERM_INFOS, IOContext.READ);
 
 
     searcher = new IndexSearcher(segmentReader);
     Weight weight = lucene2SeqConfiguration.getQuery().createWeight(searcher);
-    scorer = weight.scorer(segmentReader.getContext(), true, false, null);//nocommit
+    scorer = weight.scorer(segmentReader.getContext(), false, false, null);
   }
 
   @Override
@@ -67,7 +83,7 @@ public class LuceneSegmentRecordReader e
 
   @Override
   public float getProgress() throws IOException, InterruptedException {
-    return 0;
+    return scorer.cost() == 0 ? 0 : (float) nextDocId / scorer.cost();//this is a rough estimate, due to the possible inaccuracies of cost
   }
 
   @Override

Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/text/LuceneStorageConfiguration.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/text/LuceneStorageConfiguration.java?rev=1490457&r1=1490456&r2=1490457&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/text/LuceneStorageConfiguration.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/text/LuceneStorageConfiguration.java Thu Jun  6 21:57:50 2013
@@ -1,4 +1,5 @@
-/**
+package org.apache.mahout.text;
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -15,13 +16,12 @@
  * limitations under the License.
  */
 
-package org.apache.mahout.text;
-
 import com.google.common.base.Preconditions;
 import com.google.common.collect.Sets;
 import org.apache.commons.lang.StringUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
 import org.apache.hadoop.io.DefaultStringifier;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.Writable;
@@ -32,6 +32,7 @@ import org.apache.lucene.queryparser.cla
 import org.apache.lucene.search.MatchAllDocsQuery;
 import org.apache.lucene.search.Query;
 import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
 import org.apache.mahout.common.iterator.sequencefile.PathType;
 import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
 
@@ -144,7 +145,7 @@ public class LuceneStorageConfiguration 
    * @return iterator
    */
   public Iterator<Pair<Text, Text>> getSequenceFileIterator() {
-    return new SequenceFileDirIterable<Text, Text>(sequenceFilesOutputPath, PathType.LIST, configuration).iterator();
+    return new SequenceFileDirIterable<Text, Text>(sequenceFilesOutputPath, PathType.LIST, PathFilters.logsCRCFilter(), configuration).iterator();
   }
 
   public Configuration getConfiguration() {

Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/text/ReadOnlyFileSystemDirectory.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/text/ReadOnlyFileSystemDirectory.java?rev=1490457&r1=1490456&r2=1490457&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/text/ReadOnlyFileSystemDirectory.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/text/ReadOnlyFileSystemDirectory.java Thu Jun  6 21:57:50 2013
@@ -35,10 +35,10 @@ import org.apache.lucene.store.Lock;
 import java.io.IOException;
 import java.util.Collection;
 
-//NOCOMMIT: Not sure if there isn't a better way of doing this in 4.x.  Don't we have a Hadoop Directory impl somewhere?
+//TODO: is there a better way of doing this in Lucene 4.x?
 
 /**
- * This class implements a Lucene Directory on top of a general FileSystem.
+ * This class implements a read-only Lucene Directory on top of a general FileSystem.
  * Currently it does not support locking.
  * <p/>
  * // TODO: Rename to FileSystemReadOnlyDirectory
@@ -105,10 +105,10 @@ public class ReadOnlyFileSystemDirectory
     // clear old index files
     FileStatus[] fileStatus =
             fs.listStatus(directory, LuceneIndexFileNameFilter.getFilter());
-    for (int i = 0; i < fileStatus.length; i++) {
-      if (!fs.delete(fileStatus[i].getPath(), true)) {
+    for (FileStatus status : fileStatus) {
+      if (!fs.delete(status.getPath(), true)) {
         throw new IOException("Cannot delete index file "
-                + fileStatus[i].getPath());
+                + status.getPath());
       }
     }
   }
@@ -177,7 +177,7 @@ public class ReadOnlyFileSystemDirectory
 
   @Override
   public IndexOutput createOutput(String name, IOContext context) throws IOException {
-    //nocommit What should we be doing with the IOContext here?
+    //TODO: What should we be doing with the IOContext here, if anything?
     Path file = new Path(directory, name);
     if (fs.exists(file) && !fs.delete(file, true)) {
       // delete the existing one if applicable
@@ -189,7 +189,7 @@ public class ReadOnlyFileSystemDirectory
 
   @Override
   public void sync(Collection<String> names) throws IOException {
-
+    // do nothing, as this is read-only
   }
 
   @Override

Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorage.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorage.java?rev=1490457&r1=1490456&r2=1490457&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorage.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorage.java Thu Jun  6 21:57:50 2013
@@ -1,4 +1,5 @@
-/**
+package org.apache.mahout.text;
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -15,8 +16,6 @@
  * limitations under the License.
  */
 
-package org.apache.mahout.text;
-
 import com.google.common.io.Closeables;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
@@ -29,6 +28,7 @@ import org.apache.lucene.index.AtomicRea
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.SlowCompositeReaderWrapper;
+import org.apache.lucene.search.Collector;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Scorer;
 import org.apache.lucene.search.Weight;
@@ -50,9 +50,6 @@ import static org.apache.lucene.search.D
  * Configure this class with a {@link LuceneStorageConfiguration} bean.
  */
 public class SequenceFilesFromLuceneStorage {
-
-  public static final String SEPARATOR_FIELDS = " ";
-
   private static final Logger log = LoggerFactory.getLogger(SequenceFilesFromLuceneStorage.class);
 
   /**
@@ -61,75 +58,75 @@ public class SequenceFilesFromLuceneStor
    * @param lucene2seqConf configuration bean
    * @throws java.io.IOException if index cannot be opened or sequence file could not be written
    */
-  public void run(LuceneStorageConfiguration lucene2seqConf) throws IOException {
+  public void run(final LuceneStorageConfiguration lucene2seqConf) throws IOException {
     List<Path> indexPaths = lucene2seqConf.getIndexPaths();
+    int processedDocs = 0;
 
     for (Path indexPath : indexPaths) {
-      Directory directory = FSDirectory.open(new File(indexPath.toString()));
+      Directory directory = FSDirectory.open(new File(indexPath.toUri().getPath()));
       IndexReader reader = DirectoryReader.open(directory);
       IndexSearcher searcher = new IndexSearcher(reader);
       Configuration configuration = lucene2seqConf.getConfiguration();
       FileSystem fileSystem = FileSystem.get(configuration);
-      Path sequenceFilePath = new Path(lucene2seqConf.getSequenceFilesOutputPath(), indexPath);
-      SequenceFile.Writer sequenceFileWriter = new SequenceFile.Writer(fileSystem, configuration, sequenceFilePath, Text.class, Text.class);
+      Path sequenceFilePath = new Path(lucene2seqConf.getSequenceFilesOutputPath(), indexPath.getName());
+      final SequenceFile.Writer sequenceFileWriter = new SequenceFile.Writer(fileSystem, configuration, sequenceFilePath, Text.class, Text.class);
 
-      Text key = new Text();
-      Text value = new Text();
-
-      Weight weight = lucene2seqConf.getQuery().createWeight(searcher);
-      //TODO: as the name implies, this is slow, but this is sequential anyway, so not a big deal.  Better perf. would be by looping on the segments
-      AtomicReaderContext context = SlowCompositeReaderWrapper.wrap(reader).getContext();
-
-      Scorer scorer = weight.scorer(context, true, false, null);
-
-      if (scorer != null) {
-        int processedDocs = 0;
-        int docId;
-
-        while ((docId = scorer.nextDoc()) != NO_MORE_DOCS && processedDocs < lucene2seqConf.getMaxHits()) {
-          DocumentStoredFieldVisitor storedFieldVisitor = lucene2seqConf.getStoredFieldVisitor();
-          reader.document(docId, storedFieldVisitor);
-          Document doc = storedFieldVisitor.getDocument();
-          String idValue = doc.get(lucene2seqConf.getIdField());
-
-          StringBuilder fieldValueBuilder = new StringBuilder();
-          List<String> fields = lucene2seqConf.getFields();
-          for (int i = 0; i < fields.size(); i++) {
-            String field = fields.get(i);
-            String fieldValue = doc.get(field);
-            if (isNotBlank(fieldValue)) {
-              fieldValueBuilder.append(fieldValue);
-              if (i != fields.size() - 1) {
-                fieldValueBuilder.append(SEPARATOR_FIELDS);
-              }
-            }
-          }
-
-          if (isBlank(idValue) || isBlank(fieldValueBuilder.toString())) {
-            continue;
-          }
+      SeqFileWriterCollector writerCollector = new SeqFileWriterCollector(lucene2seqConf, sequenceFileWriter, processedDocs);
+      searcher.search(lucene2seqConf.getQuery(), writerCollector);
+      log.info("Wrote " + writerCollector.processedDocs + " documents in " + sequenceFilePath.toUri());
+      processedDocs = writerCollector.processedDocs;
+      Closeables.close(sequenceFileWriter, true);
+      directory.close();
+      //searcher.close();
+      reader.close();
+    }
+  }
 
-          key.set(idValue);
-          value.set(fieldValueBuilder.toString());
+  private static class SeqFileWriterCollector extends Collector {
+    private final LuceneStorageConfiguration lucene2seqConf;
+    private final SequenceFile.Writer sequenceFileWriter;
+    public int processedDocs;
+    AtomicReaderContext arc;
+
+    SeqFileWriterCollector(LuceneStorageConfiguration lucene2seqConf, SequenceFile.Writer sequenceFileWriter, int processedDocs) {
+      this.lucene2seqConf = lucene2seqConf;
+      this.sequenceFileWriter = sequenceFileWriter;
+      this.processedDocs = processedDocs;
+    }
 
-          sequenceFileWriter.append(key, value);
+    @Override
+    public void setScorer(Scorer scorer) throws IOException {
+      //don't care about scoring, we just want the matches
+    }
 
-          processedDocs++;
+    @Override
+    public void collect(int docNum) throws IOException {
+      if (processedDocs < lucene2seqConf.getMaxHits()) {
+        final DocumentStoredFieldVisitor storedFieldVisitor = lucene2seqConf.getStoredFieldVisitor();
+        arc.reader().document(docNum, storedFieldVisitor);
+
+        Document doc = storedFieldVisitor.getDocument();
+        List<String> fields = lucene2seqConf.getFields();
+        Text theKey = new Text(LuceneSeqFileHelper.nullSafe(doc.get(lucene2seqConf.getIdField())));
+        Text theValue = new Text();
+        LuceneSeqFileHelper.populateValues(doc, theValue, fields);
+        //if they are both empty, don't write
+        if (isBlank(theKey.toString()) && isBlank(theValue.toString())) {
+          return;
         }
-
-        log.info("Wrote " + processedDocs + " documents in " + sequenceFilePath.toUri());
-      } else {
-        Closeables.close(sequenceFileWriter, true);
-        directory.close();
-        //searcher.close();
-        reader.close();
-        throw new RuntimeException("Could not write sequence files. Could not create scorer");
+        sequenceFileWriter.append(theKey, theValue);
+        processedDocs++;
       }
+    }
 
-      Closeables.close(sequenceFileWriter, true);
-      directory.close();
-      //searcher.close();
-      reader.close();
+    @Override
+    public void setNextReader(AtomicReaderContext context) throws IOException {
+      arc = context;
+    }
+
+    @Override
+    public boolean acceptsDocsOutOfOrder() {
+      return true;
     }
   }
 }

Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriver.java?rev=1490457&r1=1490456&r2=1490457&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriver.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriver.java Thu Jun  6 21:57:50 2013
@@ -1,4 +1,5 @@
-/**
+package org.apache.mahout.text;
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -15,8 +16,6 @@
  * limitations under the License.
  */
 
-package org.apache.mahout.text;
-
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
@@ -32,6 +31,7 @@ import org.apache.mahout.common.commandl
 
 import java.util.ArrayList;
 import java.util.List;
+import java.util.regex.Pattern;
 
 import static java.util.Arrays.asList;
 
@@ -42,7 +42,6 @@ import static java.util.Arrays.asList;
  */
 public class SequenceFilesFromLuceneStorageDriver extends AbstractJob {
 
-  static final String OPTION_LUCENE_DIRECTORY = "dir";
   static final String OPTION_ID_FIELD = "idField";
   static final String OPTION_FIELD = "fields";
   static final String OPTION_QUERY = "query";
@@ -53,6 +52,7 @@ public class SequenceFilesFromLuceneStor
 
   static final String SEPARATOR_FIELDS = ",";
   static final String QUERY_DELIMITER = "'";
+  private static final Pattern COMPILE = Pattern.compile(QUERY_DELIMITER);
 
   public static void main(String[] args) throws Exception {
     ToolRunner.run(new SequenceFilesFromLuceneStorageDriver(), args);
@@ -61,9 +61,9 @@ public class SequenceFilesFromLuceneStor
   @Override
   public int run(String[] args) throws Exception {
     addOutputOption();
-
-    addOption(OPTION_LUCENE_DIRECTORY, "d", "Lucene directory / directories. Comma separated.", true);
-    addOption(OPTION_ID_FIELD, "i", "The field in the index containing the id", true);
+    addInputOption();
+    //addOption(OPTION_LUCENE_DIRECTORY, "d", "Lucene directory / directories. Comma separated.", true);
+    addOption(OPTION_ID_FIELD, "id", "The field in the index containing the id", true);
     addOption(OPTION_FIELD, "f", "The stored field(s) in the index containing text", true);
 
     addOption(OPTION_QUERY, "q", "(Optional) Lucene query. Defaults to " + DEFAULT_QUERY.getClass().getSimpleName());
@@ -79,13 +79,13 @@ public class SequenceFilesFromLuceneStor
       configuration = new Configuration();
     }
 
-    String[] paths = getOption(OPTION_LUCENE_DIRECTORY).split(",");
+    String[] paths = getInputPath().toString().split(",");
     List<Path> indexPaths = new ArrayList<Path>();
     for (String path : paths) {
       indexPaths.add(new Path(path));
     }
 
-    Path sequenceFilesOutputPath = new Path((getOption(DefaultOptionCreator.OUTPUT_OPTION)));
+    Path sequenceFilesOutputPath = getOutputPath();
 
     String idField = getOption(OPTION_ID_FIELD);
     String fields = getOption(OPTION_FIELD);
@@ -99,8 +99,8 @@ public class SequenceFilesFromLuceneStor
     Query query = DEFAULT_QUERY;
     if (hasOption(OPTION_QUERY)) {
       try {
-        String queryString = getOption(OPTION_QUERY).replaceAll(QUERY_DELIMITER, "");
-        QueryParser queryParser = new QueryParser(Version.LUCENE_35, queryString, new StandardAnalyzer(Version.LUCENE_35));
+        String queryString = COMPILE.matcher(getOption(OPTION_QUERY)).replaceAll("");
+        QueryParser queryParser = new QueryParser(Version.LUCENE_43, queryString, new StandardAnalyzer(Version.LUCENE_43));
         query = queryParser.parse(queryString);
       } catch (ParseException e) {
         throw new IllegalArgumentException(e.getMessage(), e);

Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageMRJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageMRJob.java?rev=1490457&r1=1490456&r2=1490457&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageMRJob.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageMRJob.java Thu Jun  6 21:57:50 2013
@@ -1,4 +1,20 @@
 package org.apache.mahout.text;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;

Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageMapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageMapper.java?rev=1490457&r1=1490456&r2=1490457&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageMapper.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageMapper.java Thu Jun  6 21:57:50 2013
@@ -5,6 +5,7 @@ import org.apache.hadoop.io.NullWritable
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapreduce.Mapper;
 import org.apache.lucene.document.Document;
+import org.apache.lucene.document.DocumentStoredFieldVisitor;
 import org.apache.lucene.index.SegmentInfoPerCommit;
 import org.apache.lucene.index.SegmentReader;
 import org.apache.lucene.store.IOContext;
@@ -12,7 +13,7 @@ import org.apache.lucene.store.IOContext
 import java.io.IOException;
 import java.util.List;
 
-import static org.apache.commons.lang.StringUtils.isNotBlank;
+import static org.apache.commons.lang.StringUtils.isBlank;
 
 /**
  * Maps document IDs to key value pairs with ID field as the key and the concatenated stored field(s)
@@ -20,66 +21,45 @@ import static org.apache.commons.lang.St
  */
 public class SequenceFilesFromLuceneStorageMapper extends Mapper<Text, NullWritable, Text, Text> {
 
-  public static final String SEPARATOR_FIELDS = " ";
-  public static final int USE_TERM_INFOS = 1;
+  public enum DataStatus {EMPTY_KEY, EMPTY_VALUE, EMPTY_BOTH};
 
-  private LuceneStorageConfiguration lucene2SeqConfiguration;
+  private LuceneStorageConfiguration l2sConf;
   private SegmentReader segmentReader;
 
-  private Text idKey;
-  private Text fieldValue;
-
   @Override
   protected void setup(Context context) throws IOException, InterruptedException {
     Configuration configuration = context.getConfiguration();
-
-    lucene2SeqConfiguration = new LuceneStorageConfiguration(configuration);
-
+    l2sConf = new LuceneStorageConfiguration(configuration);
     LuceneSegmentInputSplit inputSplit = (LuceneSegmentInputSplit) context.getInputSplit();
-
     SegmentInfoPerCommit segmentInfo = inputSplit.getSegment(configuration);
-    segmentReader = new SegmentReader(segmentInfo, USE_TERM_INFOS, IOContext.READ);//nocommit: Should we use IOContext.READONCE?
-
-    idKey = new Text();
-    fieldValue = new Text();
+    segmentReader = new SegmentReader(segmentInfo, LuceneSeqFileHelper.USE_TERM_INFOS, IOContext.READ);
   }
 
   @Override
   protected void map(Text key, NullWritable text, Context context) throws IOException, InterruptedException {
     int docId = Integer.valueOf(key.toString());
-    Document document = segmentReader.document(docId);
-
-    String idString = document.get(lucene2SeqConfiguration.getIdField());
-
-    StringBuilder valueBuilder = new StringBuilder();
-    List<String> fields = lucene2SeqConfiguration.getFields();
-    for (int i = 0; i < fields.size(); i++) {
-      String field = fields.get(i);
-      String fieldValue = document.get(field);
-      if (isNotBlank(fieldValue)) {
-        valueBuilder.append(fieldValue);
-        if (i != fields.size() - 1) {
-          valueBuilder.append(SEPARATOR_FIELDS);
-        }
-      }
+    DocumentStoredFieldVisitor storedFieldVisitor = l2sConf.getStoredFieldVisitor();
+    segmentReader.document(docId, storedFieldVisitor);
+    Document document = storedFieldVisitor.getDocument();
+    List<String> fields = l2sConf.getFields();
+    Text theKey = new Text(LuceneSeqFileHelper.nullSafe(document.get(l2sConf.getIdField())));
+    Text theValue = new Text();
+    LuceneSeqFileHelper.populateValues(document, theValue, fields);
+    //if they are both empty, don't write
+    if (isBlank(theKey.toString()) && isBlank(theValue.toString())) {
+      context.getCounter(DataStatus.EMPTY_BOTH).increment(1);
+      return;
     }
-
-    idKey.set(nullSafe(idString));
-    fieldValue.set(nullSafe(valueBuilder.toString()));
-
-    context.write(idKey, fieldValue);
+    if (isBlank(theKey.toString())){
+      context.getCounter(DataStatus.EMPTY_KEY).increment(1);
+    } else if (isBlank(theValue.toString())){
+      context.getCounter(DataStatus.EMPTY_VALUE).increment(1);
+    }
+    context.write(theKey, theValue);
   }
 
   @Override
   protected void cleanup(Context context) throws IOException, InterruptedException {
     segmentReader.close();
   }
-
-  private String nullSafe(String value) {
-    if (value == null) {
-      return "";
-    } else {
-      return value;
-    }
-  }
 }
\ No newline at end of file

Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/text/AbstractLuceneStorageTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/text/AbstractLuceneStorageTest.java?rev=1490457&r1=1490456&r2=1490457&view=diff
==============================================================================
--- mahout/trunk/integration/src/test/java/org/apache/mahout/text/AbstractLuceneStorageTest.java (original)
+++ mahout/trunk/integration/src/test/java/org/apache/mahout/text/AbstractLuceneStorageTest.java Thu Jun  6 21:57:50 2013
@@ -2,32 +2,50 @@ package org.apache.mahout.text;
 
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.Text;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.util.Version;
 import org.apache.mahout.common.Pair;
 import org.apache.mahout.text.doc.MultipleFieldsDocument;
 import org.apache.mahout.text.doc.NumericFieldDocument;
 import org.apache.mahout.text.doc.SingleFieldDocument;
-import org.apache.mahout.vectorizer.DefaultAnalyzer;
+import org.apache.mahout.utils.MahoutTestCase;
 
 import java.io.File;
 import java.io.IOException;
-
-import static org.junit.Assert.assertEquals;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
 
 /**
  * Abstract test for working with Lucene storage.
  */
-public abstract class AbstractLuceneStorageTest {
+public abstract class AbstractLuceneStorageTest extends MahoutTestCase {
 
-  private Path indexPath = new Path("index");
+  protected Path indexPath1;
+  protected Path indexPath2;
+  protected List<SingleFieldDocument> docs = new ArrayList<SingleFieldDocument>();
+  protected List<SingleFieldDocument> misshapenDocs = new ArrayList<SingleFieldDocument>();
+
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+    indexPath1 = getTestTempDirPath("index1");
+    indexPath2 = getTestTempDirPath("index2");
+    for (int i = 0; i < 2000; i++) {
+      docs.add(new SingleFieldDocument(String.valueOf(i), "This is test document " + i));
+    }
+    misshapenDocs.add(new SingleFieldDocument("", "This doc has an empty id"));
+    misshapenDocs.add(new SingleFieldDocument("empty_value", ""));
+  }
 
-  protected void commitDocuments(SingleFieldDocument... documents) throws IOException {
-    IndexWriter indexWriter = new IndexWriter(getDirectory(), new IndexWriterConfig(Version.LUCENE_35, new DefaultAnalyzer()));
+  protected void commitDocuments(Directory directory, Iterable<SingleFieldDocument> theDocs) throws IOException{
+    IndexWriter indexWriter = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_43, new StandardAnalyzer(Version.LUCENE_43)));
 
-    for (SingleFieldDocument singleFieldDocument : documents) {
+    for (SingleFieldDocument singleFieldDocument : theDocs) {
       indexWriter.addDocument(singleFieldDocument.asLuceneDocument());
     }
 
@@ -35,6 +53,10 @@ public abstract class AbstractLuceneStor
     indexWriter.close();
   }
 
+  protected void commitDocuments(Directory directory, SingleFieldDocument... documents) throws IOException {
+    commitDocuments(directory, Arrays.asList(documents));
+  }
+
   protected void assertSimpleDocumentEquals(SingleFieldDocument expected, Pair<Text, Text> actual) {
     assertEquals(expected.getId(), actual.getFirst().toString());
     assertEquals(expected.getField(), actual.getSecond().toString());
@@ -50,11 +72,23 @@ public abstract class AbstractLuceneStor
     assertEquals(expected.getField() + " " + expected.getNumericField(), actual.getSecond().toString());
   }
 
-  protected FSDirectory getDirectory() throws IOException {
-    return FSDirectory.open(new File(indexPath.toString()));
+  protected FSDirectory getDirectory(File indexPath) throws IOException {
+    return FSDirectory.open(indexPath);
+  }
+
+  protected File getIndexPath1AsFile() {
+    return new File(indexPath1.toUri().getPath());
+  }
+
+  protected Path getIndexPath1() {
+    return indexPath1;
+  }
+
+  protected File getIndexPath2AsFile() {
+    return new File(indexPath2.toUri().getPath());
   }
 
-  protected Path getIndexPath() {
-    return indexPath;
+  protected Path getIndexPath2() {
+    return indexPath2;
   }
 }

Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/text/LuceneSegmentInputFormatTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/text/LuceneSegmentInputFormatTest.java?rev=1490457&r1=1490456&r2=1490457&view=diff
==============================================================================
--- mahout/trunk/integration/src/test/java/org/apache/mahout/text/LuceneSegmentInputFormatTest.java (original)
+++ mahout/trunk/integration/src/test/java/org/apache/mahout/text/LuceneSegmentInputFormatTest.java Thu Jun  6 21:57:50 2013
@@ -4,47 +4,36 @@ import org.apache.hadoop.conf.Configurat
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.mapreduce.JobContext;
 import org.apache.hadoop.mapreduce.JobID;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.store.FSDirectory;
-import org.apache.lucene.util.Version;
 import org.apache.mahout.common.HadoopUtil;
 import org.apache.mahout.text.doc.SingleFieldDocument;
-import org.apache.mahout.vectorizer.DefaultAnalyzer;
 import org.junit.After;
+import org.junit.Assert;
 import org.junit.Before;
 import org.junit.Test;
 
-import java.io.File;
 import java.io.IOException;
+import java.util.Collections;
 import java.util.List;
 
-import static java.util.Arrays.asList;
-import static junit.framework.Assert.assertEquals;
-
-public class LuceneSegmentInputFormatTest {
+public class LuceneSegmentInputFormatTest extends AbstractLuceneStorageTest {
 
   private LuceneSegmentInputFormat inputFormat;
   private JobContext jobContext;
-  private Path indexPath;
   private Configuration conf;
-  private FSDirectory directory;
 
   @Before
   public void before() throws IOException {
     inputFormat = new LuceneSegmentInputFormat();
-    indexPath = new Path("index");
-
-    LuceneStorageConfiguration lucene2SeqConf = new LuceneStorageConfiguration(new Configuration(), asList(indexPath), new Path("output"), "id", asList("field"));
+    LuceneStorageConfiguration lucene2SeqConf = new LuceneStorageConfiguration(new Configuration(), Collections.singletonList(indexPath1), new Path("output"), "id", Collections.singletonList("field"));
     conf = lucene2SeqConf.serialize();
 
     jobContext = new JobContext(conf, new JobID());
-    directory = FSDirectory.open(new File(indexPath.toString()));
   }
-  
+
   @After
   public void after() throws IOException {
-    HadoopUtil.delete(conf, indexPath);
+    HadoopUtil.delete(conf, indexPath1);
   }
 
   @Test
@@ -52,21 +41,13 @@ public class LuceneSegmentInputFormatTes
     SingleFieldDocument doc1 = new SingleFieldDocument("1", "This is simple document 1");
     SingleFieldDocument doc2 = new SingleFieldDocument("2", "This is simple document 2");
     SingleFieldDocument doc3 = new SingleFieldDocument("3", "This is simple document 3");
-    List<SingleFieldDocument> documents = asList(doc1, doc2, doc3);
 
-    for (SingleFieldDocument singleFieldDocument : documents) {
-      commitDocument(singleFieldDocument);
-    }
+    //generate 3 segments
+    commitDocuments(getDirectory(getIndexPath1AsFile()), doc1);
+    commitDocuments(getDirectory(getIndexPath1AsFile()), doc2);
+    commitDocuments(getDirectory(getIndexPath1AsFile()), doc3);
 
     List<LuceneSegmentInputSplit> splits = inputFormat.getSplits(jobContext);
-    assertEquals(3, splits.size());
-  }
-
-  private void commitDocument(SingleFieldDocument doc) throws IOException {
-    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_35, new DefaultAnalyzer());
-    IndexWriter indexWriter = new IndexWriter(directory, conf);
-    indexWriter.addDocument(doc.asLuceneDocument());
-    indexWriter.commit();
-    indexWriter.close();
+    Assert.assertEquals(3, splits.size());
   }
 }

Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/text/LuceneSegmentInputSplitTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/text/LuceneSegmentInputSplitTest.java?rev=1490457&r1=1490456&r2=1490457&view=diff
==============================================================================
--- mahout/trunk/integration/src/test/java/org/apache/mahout/text/LuceneSegmentInputSplitTest.java (original)
+++ mahout/trunk/integration/src/test/java/org/apache/mahout/text/LuceneSegmentInputSplitTest.java Thu Jun  6 21:57:50 2013
@@ -1,43 +1,36 @@
 package org.apache.mahout.text;
 
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.index.SegmentInfo;
+import org.apache.lucene.index.SegmentInfoPerCommit;
 import org.apache.lucene.index.SegmentReader;
 import org.apache.lucene.store.FSDirectory;
-import org.apache.lucene.util.Version;
+import org.apache.lucene.store.IOContext;
 import org.apache.mahout.common.HadoopUtil;
 import org.apache.mahout.text.doc.SingleFieldDocument;
-import org.apache.mahout.vectorizer.DefaultAnalyzer;
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
 
-import java.io.File;
 import java.io.IOException;
 import java.util.List;
 
 import static java.util.Arrays.asList;
-import static junit.framework.Assert.assertEquals;
 
-public class LuceneSegmentInputSplitTest {
+public class LuceneSegmentInputSplitTest extends AbstractLuceneStorageTest {
 
   private FSDirectory directory;
-  private Path indexPath;
+
   private Configuration conf;
 
   @Before
   public void before() throws IOException {
-    indexPath = new Path("index");
-    directory = FSDirectory.open(new File(indexPath.toString()));
+    directory = getDirectory(getIndexPath1AsFile());
     conf = new Configuration();
   }
 
   @After
   public void after() throws IOException {
-    HadoopUtil.delete(conf, indexPath);
+    HadoopUtil.delete(conf, indexPath1);
   }
 
   @Test
@@ -48,7 +41,7 @@ public class LuceneSegmentInputSplitTest
 
     List<SingleFieldDocument> docs = asList(doc1, doc2, doc3);
     for (SingleFieldDocument doc : docs) {
-      addDocument(doc);
+      commitDocuments(getDirectory(getIndexPath1AsFile()), doc);
     }
 
     assertSegmentContainsOneDoc("_0");
@@ -64,26 +57,20 @@ public class LuceneSegmentInputSplitTest
 
     List<SingleFieldDocument> docs = asList(doc1, doc2, doc3);
     for (SingleFieldDocument doc : docs) {
-      addDocument(doc);
+      commitDocuments(getDirectory(getIndexPath1AsFile()), doc);
     }
 
-    LuceneSegmentInputSplit inputSplit = new LuceneSegmentInputSplit(indexPath, "_3", 1000);
+    LuceneSegmentInputSplit inputSplit = new LuceneSegmentInputSplit(indexPath1, "_3", 1000);
     inputSplit.getSegment(conf);
   }
 
   private void assertSegmentContainsOneDoc(String segmentName) throws IOException {
-    LuceneSegmentInputSplit inputSplit = new LuceneSegmentInputSplit(indexPath, segmentName, 1000);
-    SegmentInfo segment = inputSplit.getSegment(conf);
-    SegmentReader segmentReader = SegmentReader.get(true, segment, 1);
-    assertEquals(segmentName, segment.name);
+    LuceneSegmentInputSplit inputSplit = new LuceneSegmentInputSplit(indexPath1, segmentName, 1000);
+    SegmentInfoPerCommit segment = inputSplit.getSegment(conf);
+    SegmentReader segmentReader = new SegmentReader(segment, 1, IOContext.READ);//SegmentReader.get(true, segment, 1);
+    assertEquals(segmentName, segment.info.name);
     assertEquals(1, segmentReader.numDocs());
   }
 
-  private void addDocument(SingleFieldDocument doc) throws IOException {
-    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_35, new DefaultAnalyzer());
-    IndexWriter indexWriter = new IndexWriter(directory, conf);
-    indexWriter.addDocument(doc.asLuceneDocument());
-    indexWriter.commit();
-    indexWriter.close();
-  }
+
 }

Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/text/LuceneSegmentRecordReaderTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/text/LuceneSegmentRecordReaderTest.java?rev=1490457&r1=1490456&r2=1490457&view=diff
==============================================================================
--- mahout/trunk/integration/src/test/java/org/apache/mahout/text/LuceneSegmentRecordReaderTest.java (original)
+++ mahout/trunk/integration/src/test/java/org/apache/mahout/text/LuceneSegmentRecordReaderTest.java Thu Jun  6 21:57:50 2013
@@ -6,67 +6,53 @@ import org.apache.hadoop.io.NullWritable
 import org.apache.hadoop.mapreduce.TaskAttemptContext;
 import org.apache.hadoop.mapreduce.TaskAttemptID;
 import org.apache.lucene.index.*;
-import org.apache.lucene.store.FSDirectory;
-import org.apache.lucene.util.Version;
 import org.apache.mahout.common.HadoopUtil;
 import org.apache.mahout.text.doc.SingleFieldDocument;
-import org.apache.mahout.vectorizer.DefaultAnalyzer;
+
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
 
-import java.io.File;
 import java.io.IOException;
-import java.util.List;
 
 import static java.util.Arrays.asList;
 import static org.junit.Assert.assertEquals;
 
 public class LuceneSegmentRecordReaderTest extends AbstractLuceneStorageTest {
-
-  private LuceneSegmentRecordReader recordReader;
   private Configuration configuration;
 
+
   @Before
   public void before() throws IOException, InterruptedException {
-    LuceneStorageConfiguration lucene2SeqConf = new LuceneStorageConfiguration(new Configuration(), asList(getIndexPath()), new Path("output"), "id", asList("field"));
+    LuceneStorageConfiguration lucene2SeqConf = new LuceneStorageConfiguration(new Configuration(), asList(getIndexPath1()), new Path("output"), "id", asList("field"));
     configuration = lucene2SeqConf.serialize();
+    commitDocuments(getDirectory(getIndexPath1AsFile()), docs.subList(0, 500));
+    commitDocuments(getDirectory(getIndexPath1AsFile()), docs.subList(500, 1000));
 
-    SingleFieldDocument doc1 = new SingleFieldDocument("1", "This is simple document 1");
-    SingleFieldDocument doc2 = new SingleFieldDocument("2", "This is simple document 2");
-    SingleFieldDocument doc3 = new SingleFieldDocument("3", "This is simple document 3");
-
-    commitDocuments(doc1, doc2, doc3);
-
-    SegmentInfos segmentInfos = new SegmentInfos();
-    segmentInfos.read(getDirectory());
-
-    SegmentInfo segmentInfo = segmentInfos.asList().get(0);
-    LuceneSegmentInputSplit inputSplit = new LuceneSegmentInputSplit(getIndexPath(), segmentInfo.name, segmentInfo.sizeInBytes(true));
-
-    TaskAttemptContext context = new TaskAttemptContext(configuration, new TaskAttemptID());
-
-    recordReader = new LuceneSegmentRecordReader();
-    recordReader.initialize(inputSplit, context);
   }
 
   @After
   public void after() throws IOException {
-    HadoopUtil.delete(configuration, getIndexPath());
+    HadoopUtil.delete(configuration, getIndexPath1());
   }
 
   @Test
   public void testKey() throws Exception {
-    recordReader.nextKeyValue();
-    assertEquals("0", recordReader.getCurrentKey().toString());
-    recordReader.nextKeyValue();
-    assertEquals("1", recordReader.getCurrentKey().toString());
-    recordReader.nextKeyValue();
-    assertEquals("2", recordReader.getCurrentKey().toString());
-  }
-
-  @Test
-  public void testGetCurrentValue() throws Exception {
-    assertEquals(NullWritable.get(), recordReader.getCurrentValue());
+    LuceneSegmentRecordReader recordReader = new LuceneSegmentRecordReader();
+    SegmentInfos segmentInfos = new SegmentInfos();
+    segmentInfos.read(getDirectory(getIndexPath1AsFile()));
+    for (SegmentInfoPerCommit segmentInfo : segmentInfos) {
+      int docId = 0;
+      LuceneSegmentInputSplit inputSplit = new LuceneSegmentInputSplit(getIndexPath1(), segmentInfo.info.name, segmentInfo.sizeInBytes());
+      TaskAttemptContext context = new TaskAttemptContext(configuration, new TaskAttemptID());
+      recordReader.initialize(inputSplit, context);
+      for (int i = 0; i < 500; i++){
+        recordReader.nextKeyValue();
+        //segments may be read in either order, so the key for position i is expected to be either i or i + 500
+        assertTrue("i = " + i + " docId= " + docId, String.valueOf(docId).equals(recordReader.getCurrentKey().toString()) || String.valueOf(docId+500).equals(recordReader.getCurrentKey().toString()));
+        assertEquals(NullWritable.get(), recordReader.getCurrentValue());
+        docId++;
+      }
+    }
   }
 }

Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriverTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriverTest.java?rev=1490457&r1=1490456&r2=1490457&view=diff
==============================================================================
--- mahout/trunk/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriverTest.java (original)
+++ mahout/trunk/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriverTest.java Thu Jun  6 21:57:50 2013
@@ -49,7 +49,7 @@ public class SequenceFilesFromLuceneStor
     conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
       + "org.apache.hadoop.io.serializer.WritableSerialization");
 
-    seqFilesOutputPath = new Path("seqfiles");
+    seqFilesOutputPath = new Path(getTestTempDirPath(), "seqfiles");
     idField = "id";
     fields = asList("field");
 
@@ -60,27 +60,26 @@ public class SequenceFilesFromLuceneStor
         return lucene2SeqConf;
       }
     };
-
-    commitDocuments(new SingleFieldDocument("1", "Mahout is cool"));
-    commitDocuments(new SingleFieldDocument("2", "Mahout is cool"));
+    commitDocuments(getDirectory(getIndexPath1AsFile()), new SingleFieldDocument("1", "Mahout is cool"));
+    commitDocuments(getDirectory(getIndexPath1AsFile()), new SingleFieldDocument("2", "Mahout is cool"));
   }
 
   @After
   public void after() throws IOException {
     HadoopUtil.delete(conf, seqFilesOutputPath);
-    HadoopUtil.delete(conf, getIndexPath());
+    HadoopUtil.delete(conf, getIndexPath1());
   }
 
   @Test
   public void testNewLucene2SeqConfiguration() {
     lucene2SeqConf = driver.newLucene2SeqConfiguration(conf,
-      asList(new Path(getIndexPath().toString())),
+      asList(new Path(getIndexPath1().toString())),
       seqFilesOutputPath,
       idField,
       fields);
 
     assertEquals(conf, lucene2SeqConf.getConfiguration());
-    assertEquals(asList(getIndexPath()), lucene2SeqConf.getIndexPaths());
+    assertEquals(asList(getIndexPath1()), lucene2SeqConf.getIndexPaths());
     assertEquals(seqFilesOutputPath, lucene2SeqConf.getSequenceFilesOutputPath());
     assertEquals(idField, lucene2SeqConf.getIdField());
     assertEquals(fields, lucene2SeqConf.getFields());
@@ -94,10 +93,10 @@ public class SequenceFilesFromLuceneStor
     String field1 = "field1";
     String field2 = "field2";
 
-    String[] args = new String[]{
-      "-d", getIndexPath().toString(),
+    String[] args = {
+      "-i", getIndexPath1AsFile().toString(),
       "-o", seqFilesOutputPath.toString(),
-      "-i", idField,
+      "-id", idField,
       "-f", field1 + "," + field2,
       "-q", queryField + ":" + queryTerm,
       "-n", maxHits,
@@ -106,8 +105,8 @@ public class SequenceFilesFromLuceneStor
 
     driver.setConf(conf);
     driver.run(args);
-
-    assertEquals(asList(getIndexPath()), lucene2SeqConf.getIndexPaths());
+    assertEquals(1, lucene2SeqConf.getIndexPaths().size());
+    assertEquals(getIndexPath1().toUri().getPath(), lucene2SeqConf.getIndexPaths().get(0).toUri().getPath());
     assertEquals(seqFilesOutputPath, lucene2SeqConf.getSequenceFilesOutputPath());
     assertEquals(idField, lucene2SeqConf.getIdField());
     assertEquals(asList(field1, field2), lucene2SeqConf.getFields());
@@ -120,17 +119,18 @@ public class SequenceFilesFromLuceneStor
 
   @Test
   public void testRun_optionalArguments() throws Exception {
-    String[] args = new String[]{
-      "-d", getIndexPath().toString(),
+    String[] args = {
+      "-i", getIndexPath1AsFile().toString(),
       "-o", seqFilesOutputPath.toString(),
-      "-i", idField,
+      "-id", idField,
       "-f", StringUtils.join(fields, SequenceFilesFromLuceneStorageDriver.SEPARATOR_FIELDS)
     };
 
     driver.setConf(conf);
     driver.run(args);
 
-    assertEquals(asList(getIndexPath()), lucene2SeqConf.getIndexPaths());
+    assertEquals(1, lucene2SeqConf.getIndexPaths().size());
+    assertEquals(getIndexPath1().toUri().getPath(), lucene2SeqConf.getIndexPaths().get(0).toUri().getPath());
     assertEquals(seqFilesOutputPath, lucene2SeqConf.getSequenceFilesOutputPath());
     assertEquals(idField, lucene2SeqConf.getIdField());
     assertEquals(fields, lucene2SeqConf.getFields());
@@ -142,10 +142,10 @@ public class SequenceFilesFromLuceneStor
 
   @Test(expected = IllegalArgumentException.class)
   public void testRun_invalidQuery() throws Exception {
-    String[] args = new String[]{
-      "-d", getIndexPath().toString(),
+    String[] args = {
+      "-i", getIndexPath1AsFile().toString(),
       "-o", seqFilesOutputPath.toString(),
-      "-i", idField,
+      "-id", idField,
       "-f", StringUtils.join(fields, SequenceFilesFromLuceneStorageDriver.SEPARATOR_FIELDS),
       "-q", "inva:lid:query"
     };

Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageMRJobTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageMRJobTest.java?rev=1490457&r1=1490456&r2=1490457&view=diff
==============================================================================
--- mahout/trunk/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageMRJobTest.java (original)
+++ mahout/trunk/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageMRJobTest.java Thu Jun  6 21:57:50 2013
@@ -6,40 +6,29 @@ import org.apache.hadoop.io.Text;
 import org.apache.mahout.common.HadoopUtil;
 import org.apache.mahout.common.Pair;
 import org.apache.mahout.text.doc.SingleFieldDocument;
-import org.apache.mahout.vectorizer.DefaultAnalyzer;
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
 
 import java.io.IOException;
+import java.util.HashMap;
 import java.util.Iterator;
+import java.util.Map;
 
 import static java.util.Arrays.asList;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
 
 public class SequenceFilesFromLuceneStorageMRJobTest extends AbstractLuceneStorageTest {
 
   private SequenceFilesFromLuceneStorageMRJob lucene2seq;
   private LuceneStorageConfiguration lucene2SeqConf;
-  private SingleFieldDocument document1;
-  private SingleFieldDocument document2;
-  private SingleFieldDocument document3;
-  private SingleFieldDocument document4;
 
   @Before
-  public void before() {
+  public void before() throws IOException {
     lucene2seq = new SequenceFilesFromLuceneStorageMRJob();
-
     Configuration configuration = new Configuration();
-    Path seqOutputPath = new Path("seqOutputPath");
-
-    lucene2SeqConf = new LuceneStorageConfiguration(configuration, asList(getIndexPath()), seqOutputPath, SingleFieldDocument.ID_FIELD, asList(SingleFieldDocument.FIELD));
-
-    document1 = new SingleFieldDocument("1", "This is test document 1");
-    document2 = new SingleFieldDocument("2", "This is test document 2");
-    document3 = new SingleFieldDocument("3", "This is test document 3");
-    document4 = new SingleFieldDocument("4", "This is test document 4");
+    Path seqOutputPath = new Path(getTestTempDirPath(), "seqOutputPath");//don't make the output directory
+    lucene2SeqConf = new LuceneStorageConfiguration(configuration, asList(getIndexPath1(), getIndexPath2()),
+            seqOutputPath, SingleFieldDocument.ID_FIELD, asList(SingleFieldDocument.FIELD));
   }
 
   @After
@@ -50,15 +39,31 @@ public class SequenceFilesFromLuceneStor
 
   @Test
   public void testRun() throws IOException {
-    commitDocuments(document1, document2, document3, document4);
-
+    //Two commit points in each of two different directories
+    commitDocuments(getDirectory(getIndexPath1AsFile()), docs.subList(0, 500));
+    commitDocuments(getDirectory(getIndexPath1AsFile()), docs.subList(1000, 1500));
+
+    commitDocuments(getDirectory(getIndexPath2AsFile()), docs.subList(500, 1000));
+    commitDocuments(getDirectory(getIndexPath2AsFile()), docs.subList(1500, 2000));
+    commitDocuments(getDirectory(getIndexPath1AsFile()), misshapenDocs);
     lucene2seq.run(lucene2SeqConf);
 
     Iterator<Pair<Text, Text>> iterator = lucene2SeqConf.getSequenceFileIterator();
-
-    assertSimpleDocumentEquals(document1, iterator.next());
-    assertSimpleDocumentEquals(document2, iterator.next());
-    assertSimpleDocumentEquals(document3, iterator.next());
-    assertSimpleDocumentEquals(document4, iterator.next());
+    Map<String, Text> map = new HashMap<String, Text>();
+    while (iterator.hasNext()) {
+      Pair<Text, Text> next = iterator.next();
+      map.put(next.getFirst().toString(), next.getSecond());
+    }
+    assertEquals(docs.size() + misshapenDocs.size(), map.size());
+    for (SingleFieldDocument doc : docs) {
+      Text value = map.get(doc.getId());
+      assertNotNull(value);
+      assertEquals(value.toString(), doc.getField());
+    }
+    for (SingleFieldDocument doc : misshapenDocs) {
+      Text value = map.get(doc.getId());
+      assertNotNull(value);
+      assertEquals(value.toString(), doc.getField());
+    }
   }
 }

Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageTest.java?rev=1490457&r1=1490456&r2=1490457&view=diff
==============================================================================
--- mahout/trunk/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageTest.java (original)
+++ mahout/trunk/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageTest.java Thu Jun  6 21:57:50 2013
@@ -17,7 +17,11 @@ import org.junit.Before;
 import org.junit.Test;
 
 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
 
 import static java.util.Arrays.asList;
 import static org.junit.Assert.assertFalse;
@@ -26,10 +30,6 @@ public class SequenceFilesFromLuceneStor
 
   private SequenceFilesFromLuceneStorage lucene2Seq;
   private LuceneStorageConfiguration lucene2SeqConf;
-
-  private SingleFieldDocument document1;
-  private SingleFieldDocument document2;
-  private SingleFieldDocument document3;
   private Path seqFilesOutputPath;
   private Configuration configuration;
 
@@ -37,18 +37,15 @@ public class SequenceFilesFromLuceneStor
   @Before
   public void before() throws IOException {
     configuration = new Configuration();
-    seqFilesOutputPath = new Path("seqfiles");
+    seqFilesOutputPath = new Path(getTestTempDirPath(), "seqfiles");
 
     lucene2Seq = new SequenceFilesFromLuceneStorage();
     lucene2SeqConf = new LuceneStorageConfiguration(configuration,
-      asList(getIndexPath()),
+      asList(getIndexPath1(), getIndexPath2()),
       seqFilesOutputPath,
       SingleFieldDocument.ID_FIELD,
       asList(SingleFieldDocument.FIELD));
 
-    document1 = new SingleFieldDocument("1", "This is test document 1");
-    document2 = new SingleFieldDocument("2", "This is test document 2");
-    document3 = new SingleFieldDocument("3", "This is test document 3");
   }
 
   @After
@@ -57,55 +54,44 @@ public class SequenceFilesFromLuceneStor
     HadoopUtil.delete(lucene2SeqConf.getConfiguration(), lucene2SeqConf.getIndexPaths());
   }
 
-  @SuppressWarnings("unchecked")
   @Test
-  public void testRun() throws Exception {
-    commitDocuments(document1, document2, document3);
-
+  public void testRun2Directories() throws Exception {
+    //Two commit points in each of two different directories
+    commitDocuments(getDirectory(getIndexPath1AsFile()), docs.subList(0, 500));
+    commitDocuments(getDirectory(getIndexPath1AsFile()), docs.subList(1000, 1500));
+
+    commitDocuments(getDirectory(getIndexPath2AsFile()), docs.subList(500, 1000));
+    commitDocuments(getDirectory(getIndexPath2AsFile()), docs.subList(1500, 2000));
+
+    commitDocuments(getDirectory(getIndexPath1AsFile()), misshapenDocs);
     lucene2Seq.run(lucene2SeqConf);
 
     Iterator<Pair<Text, Text>> iterator = lucene2SeqConf.getSequenceFileIterator();
-
-    assertSimpleDocumentEquals(document1, iterator.next());
-    assertSimpleDocumentEquals(document2, iterator.next());
-    assertSimpleDocumentEquals(document3, iterator.next());
-  }
-
-  @SuppressWarnings("unchecked")
-  @Test
-  public void testRun_skipEmptyIdFieldDocs() throws IOException {
-    commitDocuments(document1, new SingleFieldDocument("", "This is a test document with no id"), document2);
-
-    lucene2Seq.run(lucene2SeqConf);
-
-    Iterator<Pair<Text, Text>> iterator = lucene2SeqConf.getSequenceFileIterator();
-
-    assertSimpleDocumentEquals(document1, iterator.next());
-    assertSimpleDocumentEquals(document2, iterator.next());
-    assertFalse(iterator.hasNext());
-  }
-
-  @SuppressWarnings("unchecked")
-  @Test
-  public void testRun_skipEmptyFieldDocs() throws IOException {
-    commitDocuments(document1, new SingleFieldDocument("4", ""), document2);
-
-    lucene2Seq.run(lucene2SeqConf);
-
-    Iterator<Pair<Text, Text>> iterator = lucene2SeqConf.getSequenceFileIterator();
-
-    assertSimpleDocumentEquals(document1, iterator.next());
-    assertSimpleDocumentEquals(document2, iterator.next());
-    assertFalse(iterator.hasNext());
+    Map<String, Text> map = new HashMap<String, Text>();
+    while (iterator.hasNext()) {
+      Pair<Text, Text> next = iterator.next();
+      map.put(next.getFirst().toString(), next.getSecond());
+    }
+    assertEquals(docs.size() + misshapenDocs.size(), map.size());
+    for (SingleFieldDocument doc : docs) {
+      Text value = map.get(doc.getId());
+      assertNotNull(value);
+      assertEquals(value.toString(), doc.getField());
+    }
+    for (SingleFieldDocument doc : misshapenDocs) {
+      Text value = map.get(doc.getId());
+      assertNotNull(value);
+      assertEquals(value.toString(), doc.getField());
+    }
   }
 
   @SuppressWarnings("unchecked")
   @Test
   public void testRun_skipUnstoredFields() throws IOException {
-    commitDocuments(new UnstoredFieldsDocument("5", "This is test document 5"));
+    commitDocuments(getDirectory(getIndexPath1AsFile()), new UnstoredFieldsDocument("5", "This is test document 5"));
 
-    lucene2SeqConf = new LuceneStorageConfiguration(configuration,
-      asList(getIndexPath()),
+    LuceneStorageConfiguration lucene2SeqConf = new LuceneStorageConfiguration(configuration,
+      asList(getIndexPath1()),
       seqFilesOutputPath,
       SingleFieldDocument.ID_FIELD,
       asList(UnstoredFieldsDocument.FIELD, UnstoredFieldsDocument.UNSTORED_FIELD));
@@ -121,39 +107,51 @@ public class SequenceFilesFromLuceneStor
   @SuppressWarnings("unchecked")
   @Test
   public void testRun_maxHits() throws IOException {
-    commitDocuments(document1, document2, document3, new SingleFieldDocument("4", "This is test document 4"));
+    commitDocuments(getDirectory(getIndexPath1AsFile()), docs.subList(0, 500));
+    commitDocuments(getDirectory(getIndexPath1AsFile()), docs.subList(1000, 1500));
+
+    commitDocuments(getDirectory(getIndexPath2AsFile()), docs.subList(500, 1000));
+    commitDocuments(getDirectory(getIndexPath2AsFile()), docs.subList(1500, 2000));
 
     lucene2SeqConf.setMaxHits(3);
     lucene2Seq.run(lucene2SeqConf);
 
     Iterator<Pair<Text, Text>> iterator = lucene2SeqConf.getSequenceFileIterator();
-
-    assertSimpleDocumentEquals(document1, iterator.next());
-    assertSimpleDocumentEquals(document2, iterator.next());
-    assertSimpleDocumentEquals(document3, iterator.next());
+    assertTrue(iterator.hasNext());
+    iterator.next();
+    assertTrue(iterator.hasNext());
+    iterator.next();
+    assertTrue(iterator.hasNext());
+    iterator.next();
     assertFalse(iterator.hasNext());
   }
 
   @SuppressWarnings("unchecked")
   @Test
   public void testRun_query() throws IOException {
-    commitDocuments(document1, document2, document3, new SingleFieldDocument("4", "Mahout is cool"));
+    commitDocuments(getDirectory(getIndexPath1AsFile()), docs);
+    LuceneStorageConfiguration lucene2SeqConf = new LuceneStorageConfiguration(configuration,
+      asList(getIndexPath1()),
+      seqFilesOutputPath,
+      SingleFieldDocument.ID_FIELD,
+      asList(UnstoredFieldsDocument.FIELD, UnstoredFieldsDocument.UNSTORED_FIELD));
 
-    Query query = new TermQuery(new Term(lucene2SeqConf.getFields().get(0), "mahout"));
+    Query query = new TermQuery(new Term(lucene2SeqConf.getFields().get(0), "599"));
 
     lucene2SeqConf.setQuery(query);
     lucene2Seq.run(lucene2SeqConf);
 
     Iterator<Pair<Text, Text>> iterator = lucene2SeqConf.getSequenceFileIterator();
-
-    assertSimpleDocumentEquals(new SingleFieldDocument("4", "Mahout is cool"), iterator.next());
+    assertTrue(iterator.hasNext());
+    Pair<Text, Text> next = iterator.next();
+    assertTrue(next.getSecond().toString().contains("599"));
     assertFalse(iterator.hasNext());
   }
 
   @Test
   public void testRun_multipleFields() throws IOException {
-    lucene2SeqConf = new LuceneStorageConfiguration(configuration,
-      asList(getIndexPath()),
+    LuceneStorageConfiguration lucene2SeqConf = new LuceneStorageConfiguration(configuration,
+      asList(getIndexPath1()),
       seqFilesOutputPath,
       SingleFieldDocument.ID_FIELD,
       asList(MultipleFieldsDocument.FIELD, MultipleFieldsDocument.FIELD1, MultipleFieldsDocument.FIELD2));
@@ -161,7 +159,7 @@ public class SequenceFilesFromLuceneStor
     MultipleFieldsDocument multipleFieldsDocument1 = new MultipleFieldsDocument("1", "This is field 1-1", "This is field 1-2", "This is field 1-3");
     MultipleFieldsDocument multipleFieldsDocument2 = new MultipleFieldsDocument("2", "This is field 2-1", "This is field 2-2", "This is field 2-3");
     MultipleFieldsDocument multipleFieldsDocument3 = new MultipleFieldsDocument("3", "This is field 3-1", "This is field 3-2", "This is field 3-3");
-    commitDocuments(multipleFieldsDocument1, multipleFieldsDocument2, multipleFieldsDocument3);
+    commitDocuments(getDirectory(getIndexPath1AsFile()), multipleFieldsDocument1, multipleFieldsDocument2, multipleFieldsDocument3);
 
     lucene2Seq.run(lucene2SeqConf);
 
@@ -174,8 +172,8 @@ public class SequenceFilesFromLuceneStor
 
   @Test
   public void testRun_numericField() throws IOException {
-    lucene2SeqConf = new LuceneStorageConfiguration(configuration,
-      asList(getIndexPath()),
+    LuceneStorageConfiguration lucene2SeqConf = new LuceneStorageConfiguration(configuration,
+      asList(getIndexPath1()),
       seqFilesOutputPath,
       SingleFieldDocument.ID_FIELD,
       asList(NumericFieldDocument.FIELD, NumericFieldDocument.NUMERIC_FIELD));
@@ -184,7 +182,7 @@ public class SequenceFilesFromLuceneStor
     NumericFieldDocument doc2 = new NumericFieldDocument("2", "This is field 2", 200);
     NumericFieldDocument doc3 = new NumericFieldDocument("3", "This is field 3", 300);
 
-    commitDocuments(doc1, doc2, doc3);
+    commitDocuments(getDirectory(getIndexPath1AsFile()), doc1, doc2, doc3);
 
     lucene2Seq.run(lucene2SeqConf);