You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by gs...@apache.org on 2013/06/06 23:57:50 UTC
svn commit: r1490457 - in /mahout/trunk: ./
integration/src/main/java/org/apache/mahout/text/
integration/src/test/java/org/apache/mahout/text/
Author: gsingers
Date: Thu Jun 6 21:57:50 2013
New Revision: 1490457
URL: http://svn.apache.org/r1490457
Log:
MAHOUT-944: fix the things that should have been committed the first time
Modified:
mahout/trunk/CHANGELOG
mahout/trunk/integration/src/main/java/org/apache/mahout/text/LuceneIndexFileNameFilter.java
mahout/trunk/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputFormat.java
mahout/trunk/integration/src/main/java/org/apache/mahout/text/LuceneSegmentRecordReader.java
mahout/trunk/integration/src/main/java/org/apache/mahout/text/LuceneStorageConfiguration.java
mahout/trunk/integration/src/main/java/org/apache/mahout/text/ReadOnlyFileSystemDirectory.java
mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorage.java
mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriver.java
mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageMRJob.java
mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageMapper.java
mahout/trunk/integration/src/test/java/org/apache/mahout/text/AbstractLuceneStorageTest.java
mahout/trunk/integration/src/test/java/org/apache/mahout/text/LuceneSegmentInputFormatTest.java
mahout/trunk/integration/src/test/java/org/apache/mahout/text/LuceneSegmentInputSplitTest.java
mahout/trunk/integration/src/test/java/org/apache/mahout/text/LuceneSegmentRecordReaderTest.java
mahout/trunk/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriverTest.java
mahout/trunk/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageMRJobTest.java
mahout/trunk/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageTest.java
Modified: mahout/trunk/CHANGELOG
URL: http://svn.apache.org/viewvc/mahout/trunk/CHANGELOG?rev=1490457&r1=1490456&r2=1490457&view=diff
==============================================================================
--- mahout/trunk/CHANGELOG (original)
+++ mahout/trunk/CHANGELOG Thu Jun 6 21:57:50 2013
@@ -85,4 +85,6 @@ __MAHOUT-1181: Adding StreamingKMeans Ma
MAHOUT-1108: Allows cluster-reuters.sh example to be executed on a cluster (elmer.garduno via gsingers)
- MAHOUT-961: Fix issue in decision forest tree visualizer to properly show stems of tree (Ikumasa Mukai via gsingers)
\ No newline at end of file
+ MAHOUT-961: Fix issue in decision forest tree visualizer to properly show stems of tree (Ikumasa Mukai via gsingers)
+
+ MAHOUT-944: Create SequenceFiles out of Lucene document storage (no term vectors required) (Frank Scholten, gsingers)
\ No newline at end of file
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/text/LuceneIndexFileNameFilter.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/text/LuceneIndexFileNameFilter.java?rev=1490457&r1=1490456&r2=1490457&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/text/LuceneIndexFileNameFilter.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/text/LuceneIndexFileNameFilter.java Thu Jun 6 21:57:50 2013
@@ -1,11 +1,11 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
+package org.apache.mahout.text;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
@@ -16,13 +16,12 @@
* limitations under the License.
*/
-package org.apache.mahout.text;
-
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
-import org.apache.lucene.index.IndexFileNameFilter;
import org.apache.lucene.index.IndexFileNames;
+import java.util.regex.Pattern;
+
/**
* A wrapper class to convert an IndexFileNameFilter which implements
* java.io.FilenameFilter to an org.apache.hadoop.fs.PathFilter.
@@ -41,18 +40,26 @@ class LuceneIndexFileNameFilter implemen
return singleton;
}
- //nocommit Not sure what the alternative is here
- private final IndexFileNameFilter luceneFilter;
-
private LuceneIndexFileNameFilter() {
- luceneFilter = IndexFileNames.getFilter();
}
+ //TODO: Lucene defines this in IndexFileNames, but it is package private, so make sure it doesn't change w/ new releases.
+ private static final Pattern CODEC_FILE_PATTERN = Pattern.compile("_[a-z0-9]+(_.*)?\\..*");
+
/* (non-Javadoc)
* @see org.apache.hadoop.fs.PathFilter#accept(org.apache.hadoop.fs.Path)
*/
public boolean accept(Path path) {
- return luceneFilter.accept(null, path.getName());
+ String name = path.getName();
+ if (CODEC_FILE_PATTERN.matcher(name).matches() || name.startsWith(IndexFileNames.SEGMENTS)) {
+ return true;
+ }
+ for (String extension : IndexFileNames.INDEX_EXTENSIONS) {
+ if (name.endsWith(extension)) {
+ return true;
+ }
+ }
+ return false;
}
}
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputFormat.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputFormat.java?rev=1490457&r1=1490456&r2=1490457&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputFormat.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputFormat.java Thu Jun 6 21:57:50 2013
@@ -1,4 +1,20 @@
package org.apache.mahout.text;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/text/LuceneSegmentRecordReader.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/text/LuceneSegmentRecordReader.java?rev=1490457&r1=1490456&r2=1490457&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/text/LuceneSegmentRecordReader.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/text/LuceneSegmentRecordReader.java Thu Jun 6 21:57:50 2013
@@ -1,4 +1,20 @@
package org.apache.mahout.text;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.NullWritable;
@@ -39,12 +55,12 @@ public class LuceneSegmentRecordReader e
LuceneStorageConfiguration lucene2SeqConfiguration = new LuceneStorageConfiguration(configuration);
SegmentInfoPerCommit segmentInfo = inputSplit.getSegment(configuration);
- segmentReader = new SegmentReader(segmentInfo, USE_TERM_INFOS, IOContext.READ);//nocommit: Should we use IOContext.READONCE?
+ segmentReader = new SegmentReader(segmentInfo, USE_TERM_INFOS, IOContext.READ);
searcher = new IndexSearcher(segmentReader);
Weight weight = lucene2SeqConfiguration.getQuery().createWeight(searcher);
- scorer = weight.scorer(segmentReader.getContext(), true, false, null);//nocommit
+ scorer = weight.scorer(segmentReader.getContext(), false, false, null);
}
@Override
@@ -67,7 +83,7 @@ public class LuceneSegmentRecordReader e
@Override
public float getProgress() throws IOException, InterruptedException {
- return 0;
+ return scorer.cost() == 0 ? 0 : (float) nextDocId / scorer.cost();//this is a rough estimate, due to the possible inaccuracies of cost
}
@Override
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/text/LuceneStorageConfiguration.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/text/LuceneStorageConfiguration.java?rev=1490457&r1=1490456&r2=1490457&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/text/LuceneStorageConfiguration.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/text/LuceneStorageConfiguration.java Thu Jun 6 21:57:50 2013
@@ -1,4 +1,5 @@
-/**
+package org.apache.mahout.text;
+/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -15,13 +16,12 @@
* limitations under the License.
*/
-package org.apache.mahout.text;
-
import com.google.common.base.Preconditions;
import com.google.common.collect.Sets;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.DefaultStringifier;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
@@ -32,6 +32,7 @@ import org.apache.lucene.queryparser.cla
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
@@ -144,7 +145,7 @@ public class LuceneStorageConfiguration
* @return iterator
*/
public Iterator<Pair<Text, Text>> getSequenceFileIterator() {
- return new SequenceFileDirIterable<Text, Text>(sequenceFilesOutputPath, PathType.LIST, configuration).iterator();
+ return new SequenceFileDirIterable<Text, Text>(sequenceFilesOutputPath, PathType.LIST, PathFilters.logsCRCFilter(), configuration).iterator();
}
public Configuration getConfiguration() {
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/text/ReadOnlyFileSystemDirectory.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/text/ReadOnlyFileSystemDirectory.java?rev=1490457&r1=1490456&r2=1490457&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/text/ReadOnlyFileSystemDirectory.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/text/ReadOnlyFileSystemDirectory.java Thu Jun 6 21:57:50 2013
@@ -35,10 +35,10 @@ import org.apache.lucene.store.Lock;
import java.io.IOException;
import java.util.Collection;
-//NOCOMMIT: Not sure if there isn't a better way of doing this in 4.x. Don't we have a Hadoop Directory impl somewhere?
+//TODO: is there a better way of doing this in Lucene 4.x?
/**
- * This class implements a Lucene Directory on top of a general FileSystem.
+ * This class implements a read-only Lucene Directory on top of a general FileSystem.
* Currently it does not support locking.
* <p/>
* // TODO: Rename to FileSystemReadOnlyDirectory
@@ -105,10 +105,10 @@ public class ReadOnlyFileSystemDirectory
// clear old index files
FileStatus[] fileStatus =
fs.listStatus(directory, LuceneIndexFileNameFilter.getFilter());
- for (int i = 0; i < fileStatus.length; i++) {
- if (!fs.delete(fileStatus[i].getPath(), true)) {
+ for (FileStatus status : fileStatus) {
+ if (!fs.delete(status.getPath(), true)) {
throw new IOException("Cannot delete index file "
- + fileStatus[i].getPath());
+ + status.getPath());
}
}
}
@@ -177,7 +177,7 @@ public class ReadOnlyFileSystemDirectory
@Override
public IndexOutput createOutput(String name, IOContext context) throws IOException {
- //nocommit What should we be doing with the IOContext here?
+ //TODO: What should we be doing with the IOContext here, if anything?
Path file = new Path(directory, name);
if (fs.exists(file) && !fs.delete(file, true)) {
// delete the existing one if applicable
@@ -189,7 +189,7 @@ public class ReadOnlyFileSystemDirectory
@Override
public void sync(Collection<String> names) throws IOException {
-
+ // do nothing, as this is read-only
}
@Override
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorage.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorage.java?rev=1490457&r1=1490456&r2=1490457&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorage.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorage.java Thu Jun 6 21:57:50 2013
@@ -1,4 +1,5 @@
-/**
+package org.apache.mahout.text;
+/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -15,8 +16,6 @@
* limitations under the License.
*/
-package org.apache.mahout.text;
-
import com.google.common.io.Closeables;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
@@ -29,6 +28,7 @@ import org.apache.lucene.index.AtomicRea
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.SlowCompositeReaderWrapper;
+import org.apache.lucene.search.Collector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Weight;
@@ -50,9 +50,6 @@ import static org.apache.lucene.search.D
* Configure this class with a {@link LuceneStorageConfiguration} bean.
*/
public class SequenceFilesFromLuceneStorage {
-
- public static final String SEPARATOR_FIELDS = " ";
-
private static final Logger log = LoggerFactory.getLogger(SequenceFilesFromLuceneStorage.class);
/**
@@ -61,75 +58,75 @@ public class SequenceFilesFromLuceneStor
* @param lucene2seqConf configuration bean
* @throws java.io.IOException if index cannot be opened or sequence file could not be written
*/
- public void run(LuceneStorageConfiguration lucene2seqConf) throws IOException {
+ public void run(final LuceneStorageConfiguration lucene2seqConf) throws IOException {
List<Path> indexPaths = lucene2seqConf.getIndexPaths();
+ int processedDocs = 0;
for (Path indexPath : indexPaths) {
- Directory directory = FSDirectory.open(new File(indexPath.toString()));
+ Directory directory = FSDirectory.open(new File(indexPath.toUri().getPath()));
IndexReader reader = DirectoryReader.open(directory);
IndexSearcher searcher = new IndexSearcher(reader);
Configuration configuration = lucene2seqConf.getConfiguration();
FileSystem fileSystem = FileSystem.get(configuration);
- Path sequenceFilePath = new Path(lucene2seqConf.getSequenceFilesOutputPath(), indexPath);
- SequenceFile.Writer sequenceFileWriter = new SequenceFile.Writer(fileSystem, configuration, sequenceFilePath, Text.class, Text.class);
+ Path sequenceFilePath = new Path(lucene2seqConf.getSequenceFilesOutputPath(), indexPath.getName());
+ final SequenceFile.Writer sequenceFileWriter = new SequenceFile.Writer(fileSystem, configuration, sequenceFilePath, Text.class, Text.class);
- Text key = new Text();
- Text value = new Text();
-
- Weight weight = lucene2seqConf.getQuery().createWeight(searcher);
- //TODO: as the name implies, this is slow, but this is sequential anyway, so not a big deal. Better perf. would be by looping on the segments
- AtomicReaderContext context = SlowCompositeReaderWrapper.wrap(reader).getContext();
-
- Scorer scorer = weight.scorer(context, true, false, null);
-
- if (scorer != null) {
- int processedDocs = 0;
- int docId;
-
- while ((docId = scorer.nextDoc()) != NO_MORE_DOCS && processedDocs < lucene2seqConf.getMaxHits()) {
- DocumentStoredFieldVisitor storedFieldVisitor = lucene2seqConf.getStoredFieldVisitor();
- reader.document(docId, storedFieldVisitor);
- Document doc = storedFieldVisitor.getDocument();
- String idValue = doc.get(lucene2seqConf.getIdField());
-
- StringBuilder fieldValueBuilder = new StringBuilder();
- List<String> fields = lucene2seqConf.getFields();
- for (int i = 0; i < fields.size(); i++) {
- String field = fields.get(i);
- String fieldValue = doc.get(field);
- if (isNotBlank(fieldValue)) {
- fieldValueBuilder.append(fieldValue);
- if (i != fields.size() - 1) {
- fieldValueBuilder.append(SEPARATOR_FIELDS);
- }
- }
- }
-
- if (isBlank(idValue) || isBlank(fieldValueBuilder.toString())) {
- continue;
- }
+ SeqFileWriterCollector writerCollector = new SeqFileWriterCollector(lucene2seqConf, sequenceFileWriter, processedDocs);
+ searcher.search(lucene2seqConf.getQuery(), writerCollector);
+ log.info("Wrote " + writerCollector.processedDocs + " documents in " + sequenceFilePath.toUri());
+ processedDocs = writerCollector.processedDocs;
+ Closeables.close(sequenceFileWriter, true);
+ directory.close();
+ //searcher.close();
+ reader.close();
+ }
+ }
- key.set(idValue);
- value.set(fieldValueBuilder.toString());
+ private static class SeqFileWriterCollector extends Collector {
+ private final LuceneStorageConfiguration lucene2seqConf;
+ private final SequenceFile.Writer sequenceFileWriter;
+ public int processedDocs;
+ AtomicReaderContext arc;
+
+ SeqFileWriterCollector(LuceneStorageConfiguration lucene2seqConf, SequenceFile.Writer sequenceFileWriter, int processedDocs) {
+ this.lucene2seqConf = lucene2seqConf;
+ this.sequenceFileWriter = sequenceFileWriter;
+ this.processedDocs = processedDocs;
+ }
- sequenceFileWriter.append(key, value);
+ @Override
+ public void setScorer(Scorer scorer) throws IOException {
+ //don't care about scoring, we just want the matches
+ }
- processedDocs++;
+ @Override
+ public void collect(int docNum) throws IOException {
+ if (processedDocs < lucene2seqConf.getMaxHits()) {
+ final DocumentStoredFieldVisitor storedFieldVisitor = lucene2seqConf.getStoredFieldVisitor();
+ arc.reader().document(docNum, storedFieldVisitor);
+
+ Document doc = storedFieldVisitor.getDocument();
+ List<String> fields = lucene2seqConf.getFields();
+ Text theKey = new Text(LuceneSeqFileHelper.nullSafe(doc.get(lucene2seqConf.getIdField())));
+ Text theValue = new Text();
+ LuceneSeqFileHelper.populateValues(doc, theValue, fields);
+ //if they are both empty, don't write
+ if (isBlank(theKey.toString()) && isBlank(theValue.toString())) {
+ return;
}
-
- log.info("Wrote " + processedDocs + " documents in " + sequenceFilePath.toUri());
- } else {
- Closeables.close(sequenceFileWriter, true);
- directory.close();
- //searcher.close();
- reader.close();
- throw new RuntimeException("Could not write sequence files. Could not create scorer");
+ sequenceFileWriter.append(theKey, theValue);
+ processedDocs++;
}
+ }
- Closeables.close(sequenceFileWriter, true);
- directory.close();
- //searcher.close();
- reader.close();
+ @Override
+ public void setNextReader(AtomicReaderContext context) throws IOException {
+ arc = context;
+ }
+
+ @Override
+ public boolean acceptsDocsOutOfOrder() {
+ return true;
}
}
}
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriver.java?rev=1490457&r1=1490456&r2=1490457&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriver.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriver.java Thu Jun 6 21:57:50 2013
@@ -1,4 +1,5 @@
-/**
+package org.apache.mahout.text;
+/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -15,8 +16,6 @@
* limitations under the License.
*/
-package org.apache.mahout.text;
-
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
@@ -32,6 +31,7 @@ import org.apache.mahout.common.commandl
import java.util.ArrayList;
import java.util.List;
+import java.util.regex.Pattern;
import static java.util.Arrays.asList;
@@ -42,7 +42,6 @@ import static java.util.Arrays.asList;
*/
public class SequenceFilesFromLuceneStorageDriver extends AbstractJob {
- static final String OPTION_LUCENE_DIRECTORY = "dir";
static final String OPTION_ID_FIELD = "idField";
static final String OPTION_FIELD = "fields";
static final String OPTION_QUERY = "query";
@@ -53,6 +52,7 @@ public class SequenceFilesFromLuceneStor
static final String SEPARATOR_FIELDS = ",";
static final String QUERY_DELIMITER = "'";
+ private static final Pattern COMPILE = Pattern.compile(QUERY_DELIMITER);
public static void main(String[] args) throws Exception {
ToolRunner.run(new SequenceFilesFromLuceneStorageDriver(), args);
@@ -61,9 +61,9 @@ public class SequenceFilesFromLuceneStor
@Override
public int run(String[] args) throws Exception {
addOutputOption();
-
- addOption(OPTION_LUCENE_DIRECTORY, "d", "Lucene directory / directories. Comma separated.", true);
- addOption(OPTION_ID_FIELD, "i", "The field in the index containing the id", true);
+ addInputOption();
+ //addOption(OPTION_LUCENE_DIRECTORY, "d", "Lucene directory / directories. Comma separated.", true);
+ addOption(OPTION_ID_FIELD, "id", "The field in the index containing the id", true);
addOption(OPTION_FIELD, "f", "The stored field(s) in the index containing text", true);
addOption(OPTION_QUERY, "q", "(Optional) Lucene query. Defaults to " + DEFAULT_QUERY.getClass().getSimpleName());
@@ -79,13 +79,13 @@ public class SequenceFilesFromLuceneStor
configuration = new Configuration();
}
- String[] paths = getOption(OPTION_LUCENE_DIRECTORY).split(",");
+ String[] paths = getInputPath().toString().split(",");
List<Path> indexPaths = new ArrayList<Path>();
for (String path : paths) {
indexPaths.add(new Path(path));
}
- Path sequenceFilesOutputPath = new Path((getOption(DefaultOptionCreator.OUTPUT_OPTION)));
+ Path sequenceFilesOutputPath = getOutputPath();
String idField = getOption(OPTION_ID_FIELD);
String fields = getOption(OPTION_FIELD);
@@ -99,8 +99,8 @@ public class SequenceFilesFromLuceneStor
Query query = DEFAULT_QUERY;
if (hasOption(OPTION_QUERY)) {
try {
- String queryString = getOption(OPTION_QUERY).replaceAll(QUERY_DELIMITER, "");
- QueryParser queryParser = new QueryParser(Version.LUCENE_35, queryString, new StandardAnalyzer(Version.LUCENE_35));
+ String queryString = COMPILE.matcher(getOption(OPTION_QUERY)).replaceAll("");
+ QueryParser queryParser = new QueryParser(Version.LUCENE_43, queryString, new StandardAnalyzer(Version.LUCENE_43));
query = queryParser.parse(queryString);
} catch (ParseException e) {
throw new IllegalArgumentException(e.getMessage(), e);
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageMRJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageMRJob.java?rev=1490457&r1=1490456&r2=1490457&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageMRJob.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageMRJob.java Thu Jun 6 21:57:50 2013
@@ -1,4 +1,20 @@
package org.apache.mahout.text;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageMapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageMapper.java?rev=1490457&r1=1490456&r2=1490457&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageMapper.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageMapper.java Thu Jun 6 21:57:50 2013
@@ -5,6 +5,7 @@ import org.apache.hadoop.io.NullWritable
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.lucene.document.Document;
+import org.apache.lucene.document.DocumentStoredFieldVisitor;
import org.apache.lucene.index.SegmentInfoPerCommit;
import org.apache.lucene.index.SegmentReader;
import org.apache.lucene.store.IOContext;
@@ -12,7 +13,7 @@ import org.apache.lucene.store.IOContext
import java.io.IOException;
import java.util.List;
-import static org.apache.commons.lang.StringUtils.isNotBlank;
+import static org.apache.commons.lang.StringUtils.isBlank;
/**
* Maps document IDs to key value pairs with ID field as the key and the concatenated stored field(s)
@@ -20,66 +21,45 @@ import static org.apache.commons.lang.St
*/
public class SequenceFilesFromLuceneStorageMapper extends Mapper<Text, NullWritable, Text, Text> {
- public static final String SEPARATOR_FIELDS = " ";
- public static final int USE_TERM_INFOS = 1;
+ public enum DataStatus {EMPTY_KEY, EMPTY_VALUE, EMPTY_BOTH};
- private LuceneStorageConfiguration lucene2SeqConfiguration;
+ private LuceneStorageConfiguration l2sConf;
private SegmentReader segmentReader;
- private Text idKey;
- private Text fieldValue;
-
@Override
protected void setup(Context context) throws IOException, InterruptedException {
Configuration configuration = context.getConfiguration();
-
- lucene2SeqConfiguration = new LuceneStorageConfiguration(configuration);
-
+ l2sConf = new LuceneStorageConfiguration(configuration);
LuceneSegmentInputSplit inputSplit = (LuceneSegmentInputSplit) context.getInputSplit();
-
SegmentInfoPerCommit segmentInfo = inputSplit.getSegment(configuration);
- segmentReader = new SegmentReader(segmentInfo, USE_TERM_INFOS, IOContext.READ);//nocommit: Should we use IOContext.READONCE?
-
- idKey = new Text();
- fieldValue = new Text();
+ segmentReader = new SegmentReader(segmentInfo, LuceneSeqFileHelper.USE_TERM_INFOS, IOContext.READ);
}
@Override
protected void map(Text key, NullWritable text, Context context) throws IOException, InterruptedException {
int docId = Integer.valueOf(key.toString());
- Document document = segmentReader.document(docId);
-
- String idString = document.get(lucene2SeqConfiguration.getIdField());
-
- StringBuilder valueBuilder = new StringBuilder();
- List<String> fields = lucene2SeqConfiguration.getFields();
- for (int i = 0; i < fields.size(); i++) {
- String field = fields.get(i);
- String fieldValue = document.get(field);
- if (isNotBlank(fieldValue)) {
- valueBuilder.append(fieldValue);
- if (i != fields.size() - 1) {
- valueBuilder.append(SEPARATOR_FIELDS);
- }
- }
+ DocumentStoredFieldVisitor storedFieldVisitor = l2sConf.getStoredFieldVisitor();
+ segmentReader.document(docId, storedFieldVisitor);
+ Document document = storedFieldVisitor.getDocument();
+ List<String> fields = l2sConf.getFields();
+ Text theKey = new Text(LuceneSeqFileHelper.nullSafe(document.get(l2sConf.getIdField())));
+ Text theValue = new Text();
+ LuceneSeqFileHelper.populateValues(document, theValue, fields);
+ //if they are both empty, don't write
+ if (isBlank(theKey.toString()) && isBlank(theValue.toString())) {
+ context.getCounter(DataStatus.EMPTY_BOTH).increment(1);
+ return;
}
-
- idKey.set(nullSafe(idString));
- fieldValue.set(nullSafe(valueBuilder.toString()));
-
- context.write(idKey, fieldValue);
+ if (isBlank(theKey.toString())){
+ context.getCounter(DataStatus.EMPTY_KEY).increment(1);
+ } else if (isBlank(theValue.toString())){
+ context.getCounter(DataStatus.EMPTY_VALUE).increment(1);
+ }
+ context.write(theKey, theValue);
}
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
segmentReader.close();
}
-
- private String nullSafe(String value) {
- if (value == null) {
- return "";
- } else {
- return value;
- }
- }
}
\ No newline at end of file
Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/text/AbstractLuceneStorageTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/text/AbstractLuceneStorageTest.java?rev=1490457&r1=1490456&r2=1490457&view=diff
==============================================================================
--- mahout/trunk/integration/src/test/java/org/apache/mahout/text/AbstractLuceneStorageTest.java (original)
+++ mahout/trunk/integration/src/test/java/org/apache/mahout/text/AbstractLuceneStorageTest.java Thu Jun 6 21:57:50 2013
@@ -2,32 +2,50 @@ package org.apache.mahout.text;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.apache.mahout.common.Pair;
import org.apache.mahout.text.doc.MultipleFieldsDocument;
import org.apache.mahout.text.doc.NumericFieldDocument;
import org.apache.mahout.text.doc.SingleFieldDocument;
-import org.apache.mahout.vectorizer.DefaultAnalyzer;
+import org.apache.mahout.utils.MahoutTestCase;
import java.io.File;
import java.io.IOException;
-
-import static org.junit.Assert.assertEquals;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
/**
* Abstract test for working with Lucene storage.
*/
-public abstract class AbstractLuceneStorageTest {
+public abstract class AbstractLuceneStorageTest extends MahoutTestCase {
- private Path indexPath = new Path("index");
+ protected Path indexPath1;
+ protected Path indexPath2;
+ protected List<SingleFieldDocument> docs = new ArrayList<SingleFieldDocument>();
+ protected List<SingleFieldDocument> misshapenDocs = new ArrayList<SingleFieldDocument>();
+
+ @Override
+ public void setUp() throws Exception {
+ super.setUp();
+ indexPath1 = getTestTempDirPath("index1");
+ indexPath2 = getTestTempDirPath("index2");
+ for (int i = 0; i < 2000; i++) {
+ docs.add(new SingleFieldDocument(String.valueOf(i), "This is test document " + i));
+ }
+ misshapenDocs.add(new SingleFieldDocument("", "This doc has an empty id"));
+ misshapenDocs.add(new SingleFieldDocument("empty_value", ""));
+ }
- protected void commitDocuments(SingleFieldDocument... documents) throws IOException {
- IndexWriter indexWriter = new IndexWriter(getDirectory(), new IndexWriterConfig(Version.LUCENE_35, new DefaultAnalyzer()));
+ protected void commitDocuments(Directory directory, Iterable<SingleFieldDocument> theDocs) throws IOException{
+ IndexWriter indexWriter = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_43, new StandardAnalyzer(Version.LUCENE_43)));
- for (SingleFieldDocument singleFieldDocument : documents) {
+ for (SingleFieldDocument singleFieldDocument : theDocs) {
indexWriter.addDocument(singleFieldDocument.asLuceneDocument());
}
@@ -35,6 +53,10 @@ public abstract class AbstractLuceneStor
indexWriter.close();
}
+ protected void commitDocuments(Directory directory, SingleFieldDocument... documents) throws IOException {
+ commitDocuments(directory, Arrays.asList(documents));
+ }
+
protected void assertSimpleDocumentEquals(SingleFieldDocument expected, Pair<Text, Text> actual) {
assertEquals(expected.getId(), actual.getFirst().toString());
assertEquals(expected.getField(), actual.getSecond().toString());
@@ -50,11 +72,23 @@ public abstract class AbstractLuceneStor
assertEquals(expected.getField() + " " + expected.getNumericField(), actual.getSecond().toString());
}
- protected FSDirectory getDirectory() throws IOException {
- return FSDirectory.open(new File(indexPath.toString()));
+ protected FSDirectory getDirectory(File indexPath) throws IOException {
+ return FSDirectory.open(indexPath);
+ }
+
+ protected File getIndexPath1AsFile() {
+ return new File(indexPath1.toUri().getPath());
+ }
+
+ protected Path getIndexPath1() {
+ return indexPath1;
+ }
+
+ protected File getIndexPath2AsFile() {
+ return new File(indexPath2.toUri().getPath());
}
- protected Path getIndexPath() {
- return indexPath;
+ protected Path getIndexPath2() {
+ return indexPath2;
}
}
Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/text/LuceneSegmentInputFormatTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/text/LuceneSegmentInputFormatTest.java?rev=1490457&r1=1490456&r2=1490457&view=diff
==============================================================================
--- mahout/trunk/integration/src/test/java/org/apache/mahout/text/LuceneSegmentInputFormatTest.java (original)
+++ mahout/trunk/integration/src/test/java/org/apache/mahout/text/LuceneSegmentInputFormatTest.java Thu Jun 6 21:57:50 2013
@@ -4,47 +4,36 @@ import org.apache.hadoop.conf.Configurat
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.JobID;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;
-import org.apache.lucene.util.Version;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.text.doc.SingleFieldDocument;
-import org.apache.mahout.vectorizer.DefaultAnalyzer;
import org.junit.After;
+import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
-import java.io.File;
import java.io.IOException;
+import java.util.Collections;
import java.util.List;
-import static java.util.Arrays.asList;
-import static junit.framework.Assert.assertEquals;
-
-public class LuceneSegmentInputFormatTest {
+public class LuceneSegmentInputFormatTest extends AbstractLuceneStorageTest {
private LuceneSegmentInputFormat inputFormat;
private JobContext jobContext;
- private Path indexPath;
private Configuration conf;
- private FSDirectory directory;
@Before
public void before() throws IOException {
inputFormat = new LuceneSegmentInputFormat();
- indexPath = new Path("index");
-
- LuceneStorageConfiguration lucene2SeqConf = new LuceneStorageConfiguration(new Configuration(), asList(indexPath), new Path("output"), "id", asList("field"));
+ LuceneStorageConfiguration lucene2SeqConf = new LuceneStorageConfiguration(new Configuration(), Collections.singletonList(indexPath1), new Path("output"), "id", Collections.singletonList("field"));
conf = lucene2SeqConf.serialize();
jobContext = new JobContext(conf, new JobID());
- directory = FSDirectory.open(new File(indexPath.toString()));
}
-
+
@After
public void after() throws IOException {
- HadoopUtil.delete(conf, indexPath);
+ HadoopUtil.delete(conf, indexPath1);
}
@Test
@@ -52,21 +41,13 @@ public class LuceneSegmentInputFormatTes
SingleFieldDocument doc1 = new SingleFieldDocument("1", "This is simple document 1");
SingleFieldDocument doc2 = new SingleFieldDocument("2", "This is simple document 2");
SingleFieldDocument doc3 = new SingleFieldDocument("3", "This is simple document 3");
- List<SingleFieldDocument> documents = asList(doc1, doc2, doc3);
- for (SingleFieldDocument singleFieldDocument : documents) {
- commitDocument(singleFieldDocument);
- }
+ //generate 3 segments
+ commitDocuments(getDirectory(getIndexPath1AsFile()), doc1);
+ commitDocuments(getDirectory(getIndexPath1AsFile()), doc2);
+ commitDocuments(getDirectory(getIndexPath1AsFile()), doc3);
List<LuceneSegmentInputSplit> splits = inputFormat.getSplits(jobContext);
- assertEquals(3, splits.size());
- }
-
- private void commitDocument(SingleFieldDocument doc) throws IOException {
- IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_35, new DefaultAnalyzer());
- IndexWriter indexWriter = new IndexWriter(directory, conf);
- indexWriter.addDocument(doc.asLuceneDocument());
- indexWriter.commit();
- indexWriter.close();
+ Assert.assertEquals(3, splits.size());
}
}
Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/text/LuceneSegmentInputSplitTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/text/LuceneSegmentInputSplitTest.java?rev=1490457&r1=1490456&r2=1490457&view=diff
==============================================================================
--- mahout/trunk/integration/src/test/java/org/apache/mahout/text/LuceneSegmentInputSplitTest.java (original)
+++ mahout/trunk/integration/src/test/java/org/apache/mahout/text/LuceneSegmentInputSplitTest.java Thu Jun 6 21:57:50 2013
@@ -1,43 +1,36 @@
package org.apache.mahout.text;
import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.index.SegmentInfo;
+import org.apache.lucene.index.SegmentInfoPerCommit;
import org.apache.lucene.index.SegmentReader;
import org.apache.lucene.store.FSDirectory;
-import org.apache.lucene.util.Version;
+import org.apache.lucene.store.IOContext;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.text.doc.SingleFieldDocument;
-import org.apache.mahout.vectorizer.DefaultAnalyzer;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
-import java.io.File;
import java.io.IOException;
import java.util.List;
import static java.util.Arrays.asList;
-import static junit.framework.Assert.assertEquals;
-public class LuceneSegmentInputSplitTest {
+public class LuceneSegmentInputSplitTest extends AbstractLuceneStorageTest {
private FSDirectory directory;
- private Path indexPath;
+
private Configuration conf;
@Before
public void before() throws IOException {
- indexPath = new Path("index");
- directory = FSDirectory.open(new File(indexPath.toString()));
+ directory = getDirectory(getIndexPath1AsFile());
conf = new Configuration();
}
@After
public void after() throws IOException {
- HadoopUtil.delete(conf, indexPath);
+ HadoopUtil.delete(conf, indexPath1);
}
@Test
@@ -48,7 +41,7 @@ public class LuceneSegmentInputSplitTest
List<SingleFieldDocument> docs = asList(doc1, doc2, doc3);
for (SingleFieldDocument doc : docs) {
- addDocument(doc);
+ commitDocuments(getDirectory(getIndexPath1AsFile()), doc);
}
assertSegmentContainsOneDoc("_0");
@@ -64,26 +57,20 @@ public class LuceneSegmentInputSplitTest
List<SingleFieldDocument> docs = asList(doc1, doc2, doc3);
for (SingleFieldDocument doc : docs) {
- addDocument(doc);
+ commitDocuments(getDirectory(getIndexPath1AsFile()), doc);
}
- LuceneSegmentInputSplit inputSplit = new LuceneSegmentInputSplit(indexPath, "_3", 1000);
+ LuceneSegmentInputSplit inputSplit = new LuceneSegmentInputSplit(indexPath1, "_3", 1000);
inputSplit.getSegment(conf);
}
private void assertSegmentContainsOneDoc(String segmentName) throws IOException {
- LuceneSegmentInputSplit inputSplit = new LuceneSegmentInputSplit(indexPath, segmentName, 1000);
- SegmentInfo segment = inputSplit.getSegment(conf);
- SegmentReader segmentReader = SegmentReader.get(true, segment, 1);
- assertEquals(segmentName, segment.name);
+ LuceneSegmentInputSplit inputSplit = new LuceneSegmentInputSplit(indexPath1, segmentName, 1000);
+ SegmentInfoPerCommit segment = inputSplit.getSegment(conf);
+ SegmentReader segmentReader = new SegmentReader(segment, 1, IOContext.READ);//SegmentReader.get(true, segment, 1);
+ assertEquals(segmentName, segment.info.name);
assertEquals(1, segmentReader.numDocs());
}
- private void addDocument(SingleFieldDocument doc) throws IOException {
- IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_35, new DefaultAnalyzer());
- IndexWriter indexWriter = new IndexWriter(directory, conf);
- indexWriter.addDocument(doc.asLuceneDocument());
- indexWriter.commit();
- indexWriter.close();
- }
+
}
Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/text/LuceneSegmentRecordReaderTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/text/LuceneSegmentRecordReaderTest.java?rev=1490457&r1=1490456&r2=1490457&view=diff
==============================================================================
--- mahout/trunk/integration/src/test/java/org/apache/mahout/text/LuceneSegmentRecordReaderTest.java (original)
+++ mahout/trunk/integration/src/test/java/org/apache/mahout/text/LuceneSegmentRecordReaderTest.java Thu Jun 6 21:57:50 2013
@@ -6,67 +6,53 @@ import org.apache.hadoop.io.NullWritable
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.lucene.index.*;
-import org.apache.lucene.store.FSDirectory;
-import org.apache.lucene.util.Version;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.text.doc.SingleFieldDocument;
-import org.apache.mahout.vectorizer.DefaultAnalyzer;
+
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
-import java.io.File;
import java.io.IOException;
-import java.util.List;
import static java.util.Arrays.asList;
import static org.junit.Assert.assertEquals;
public class LuceneSegmentRecordReaderTest extends AbstractLuceneStorageTest {
-
- private LuceneSegmentRecordReader recordReader;
private Configuration configuration;
+
@Before
public void before() throws IOException, InterruptedException {
- LuceneStorageConfiguration lucene2SeqConf = new LuceneStorageConfiguration(new Configuration(), asList(getIndexPath()), new Path("output"), "id", asList("field"));
+ LuceneStorageConfiguration lucene2SeqConf = new LuceneStorageConfiguration(new Configuration(), asList(getIndexPath1()), new Path("output"), "id", asList("field"));
configuration = lucene2SeqConf.serialize();
+ commitDocuments(getDirectory(getIndexPath1AsFile()), docs.subList(0, 500));
+ commitDocuments(getDirectory(getIndexPath1AsFile()), docs.subList(500, 1000));
- SingleFieldDocument doc1 = new SingleFieldDocument("1", "This is simple document 1");
- SingleFieldDocument doc2 = new SingleFieldDocument("2", "This is simple document 2");
- SingleFieldDocument doc3 = new SingleFieldDocument("3", "This is simple document 3");
-
- commitDocuments(doc1, doc2, doc3);
-
- SegmentInfos segmentInfos = new SegmentInfos();
- segmentInfos.read(getDirectory());
-
- SegmentInfo segmentInfo = segmentInfos.asList().get(0);
- LuceneSegmentInputSplit inputSplit = new LuceneSegmentInputSplit(getIndexPath(), segmentInfo.name, segmentInfo.sizeInBytes(true));
-
- TaskAttemptContext context = new TaskAttemptContext(configuration, new TaskAttemptID());
-
- recordReader = new LuceneSegmentRecordReader();
- recordReader.initialize(inputSplit, context);
}
@After
public void after() throws IOException {
- HadoopUtil.delete(configuration, getIndexPath());
+ HadoopUtil.delete(configuration, getIndexPath1());
}
@Test
public void testKey() throws Exception {
- recordReader.nextKeyValue();
- assertEquals("0", recordReader.getCurrentKey().toString());
- recordReader.nextKeyValue();
- assertEquals("1", recordReader.getCurrentKey().toString());
- recordReader.nextKeyValue();
- assertEquals("2", recordReader.getCurrentKey().toString());
- }
-
- @Test
- public void testGetCurrentValue() throws Exception {
- assertEquals(NullWritable.get(), recordReader.getCurrentValue());
+ LuceneSegmentRecordReader recordReader = new LuceneSegmentRecordReader();
+ SegmentInfos segmentInfos = new SegmentInfos();
+ segmentInfos.read(getDirectory(getIndexPath1AsFile()));
+ for (SegmentInfoPerCommit segmentInfo : segmentInfos) {
+ int docId = 0;
+ LuceneSegmentInputSplit inputSplit = new LuceneSegmentInputSplit(getIndexPath1(), segmentInfo.info.name, segmentInfo.sizeInBytes());
+ TaskAttemptContext context = new TaskAttemptContext(configuration, new TaskAttemptID());
+ recordReader.initialize(inputSplit, context);
+ for (int i = 0; i < 500; i++){
+ recordReader.nextKeyValue();
+ //segment order is not guaranteed, so the key for this position may be either docId or docId + 500
+ assertTrue("i = " + i + " docId= " + docId, String.valueOf(docId).equals(recordReader.getCurrentKey().toString()) || String.valueOf(docId+500).equals(recordReader.getCurrentKey().toString()));
+ assertEquals(NullWritable.get(), recordReader.getCurrentValue());
+ docId++;
+ }
+ }
}
}
Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriverTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriverTest.java?rev=1490457&r1=1490456&r2=1490457&view=diff
==============================================================================
--- mahout/trunk/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriverTest.java (original)
+++ mahout/trunk/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriverTest.java Thu Jun 6 21:57:50 2013
@@ -49,7 +49,7 @@ public class SequenceFilesFromLuceneStor
conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
+ "org.apache.hadoop.io.serializer.WritableSerialization");
- seqFilesOutputPath = new Path("seqfiles");
+ seqFilesOutputPath = new Path(getTestTempDirPath(), "seqfiles");
idField = "id";
fields = asList("field");
@@ -60,27 +60,26 @@ public class SequenceFilesFromLuceneStor
return lucene2SeqConf;
}
};
-
- commitDocuments(new SingleFieldDocument("1", "Mahout is cool"));
- commitDocuments(new SingleFieldDocument("2", "Mahout is cool"));
+ commitDocuments(getDirectory(getIndexPath1AsFile()), new SingleFieldDocument("1", "Mahout is cool"));
+ commitDocuments(getDirectory(getIndexPath1AsFile()), new SingleFieldDocument("2", "Mahout is cool"));
}
@After
public void after() throws IOException {
HadoopUtil.delete(conf, seqFilesOutputPath);
- HadoopUtil.delete(conf, getIndexPath());
+ HadoopUtil.delete(conf, getIndexPath1());
}
@Test
public void testNewLucene2SeqConfiguration() {
lucene2SeqConf = driver.newLucene2SeqConfiguration(conf,
- asList(new Path(getIndexPath().toString())),
+ asList(new Path(getIndexPath1().toString())),
seqFilesOutputPath,
idField,
fields);
assertEquals(conf, lucene2SeqConf.getConfiguration());
- assertEquals(asList(getIndexPath()), lucene2SeqConf.getIndexPaths());
+ assertEquals(asList(getIndexPath1()), lucene2SeqConf.getIndexPaths());
assertEquals(seqFilesOutputPath, lucene2SeqConf.getSequenceFilesOutputPath());
assertEquals(idField, lucene2SeqConf.getIdField());
assertEquals(fields, lucene2SeqConf.getFields());
@@ -94,10 +93,10 @@ public class SequenceFilesFromLuceneStor
String field1 = "field1";
String field2 = "field2";
- String[] args = new String[]{
- "-d", getIndexPath().toString(),
+ String[] args = {
+ "-i", getIndexPath1AsFile().toString(),
"-o", seqFilesOutputPath.toString(),
- "-i", idField,
+ "-id", idField,
"-f", field1 + "," + field2,
"-q", queryField + ":" + queryTerm,
"-n", maxHits,
@@ -106,8 +105,8 @@ public class SequenceFilesFromLuceneStor
driver.setConf(conf);
driver.run(args);
-
- assertEquals(asList(getIndexPath()), lucene2SeqConf.getIndexPaths());
+ assertEquals(1, lucene2SeqConf.getIndexPaths().size());
+ assertEquals(getIndexPath1().toUri().getPath(), lucene2SeqConf.getIndexPaths().get(0).toUri().getPath());
assertEquals(seqFilesOutputPath, lucene2SeqConf.getSequenceFilesOutputPath());
assertEquals(idField, lucene2SeqConf.getIdField());
assertEquals(asList(field1, field2), lucene2SeqConf.getFields());
@@ -120,17 +119,18 @@ public class SequenceFilesFromLuceneStor
@Test
public void testRun_optionalArguments() throws Exception {
- String[] args = new String[]{
- "-d", getIndexPath().toString(),
+ String[] args = {
+ "-i", getIndexPath1AsFile().toString(),
"-o", seqFilesOutputPath.toString(),
- "-i", idField,
+ "-id", idField,
"-f", StringUtils.join(fields, SequenceFilesFromLuceneStorageDriver.SEPARATOR_FIELDS)
};
driver.setConf(conf);
driver.run(args);
- assertEquals(asList(getIndexPath()), lucene2SeqConf.getIndexPaths());
+ assertEquals(1, lucene2SeqConf.getIndexPaths().size());
+ assertEquals(getIndexPath1().toUri().getPath(), lucene2SeqConf.getIndexPaths().get(0).toUri().getPath());
assertEquals(seqFilesOutputPath, lucene2SeqConf.getSequenceFilesOutputPath());
assertEquals(idField, lucene2SeqConf.getIdField());
assertEquals(fields, lucene2SeqConf.getFields());
@@ -142,10 +142,10 @@ public class SequenceFilesFromLuceneStor
@Test(expected = IllegalArgumentException.class)
public void testRun_invalidQuery() throws Exception {
- String[] args = new String[]{
- "-d", getIndexPath().toString(),
+ String[] args = {
+ "-i", getIndexPath1AsFile().toString(),
"-o", seqFilesOutputPath.toString(),
- "-i", idField,
+ "-id", idField,
"-f", StringUtils.join(fields, SequenceFilesFromLuceneStorageDriver.SEPARATOR_FIELDS),
"-q", "inva:lid:query"
};
Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageMRJobTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageMRJobTest.java?rev=1490457&r1=1490456&r2=1490457&view=diff
==============================================================================
--- mahout/trunk/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageMRJobTest.java (original)
+++ mahout/trunk/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageMRJobTest.java Thu Jun 6 21:57:50 2013
@@ -6,40 +6,29 @@ import org.apache.hadoop.io.Text;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.Pair;
import org.apache.mahout.text.doc.SingleFieldDocument;
-import org.apache.mahout.vectorizer.DefaultAnalyzer;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import java.io.IOException;
+import java.util.HashMap;
import java.util.Iterator;
+import java.util.Map;
import static java.util.Arrays.asList;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
public class SequenceFilesFromLuceneStorageMRJobTest extends AbstractLuceneStorageTest {
private SequenceFilesFromLuceneStorageMRJob lucene2seq;
private LuceneStorageConfiguration lucene2SeqConf;
- private SingleFieldDocument document1;
- private SingleFieldDocument document2;
- private SingleFieldDocument document3;
- private SingleFieldDocument document4;
@Before
- public void before() {
+ public void before() throws IOException {
lucene2seq = new SequenceFilesFromLuceneStorageMRJob();
-
Configuration configuration = new Configuration();
- Path seqOutputPath = new Path("seqOutputPath");
-
- lucene2SeqConf = new LuceneStorageConfiguration(configuration, asList(getIndexPath()), seqOutputPath, SingleFieldDocument.ID_FIELD, asList(SingleFieldDocument.FIELD));
-
- document1 = new SingleFieldDocument("1", "This is test document 1");
- document2 = new SingleFieldDocument("2", "This is test document 2");
- document3 = new SingleFieldDocument("3", "This is test document 3");
- document4 = new SingleFieldDocument("4", "This is test document 4");
+ Path seqOutputPath = new Path(getTestTempDirPath(), "seqOutputPath");//don't make the output directory
+ lucene2SeqConf = new LuceneStorageConfiguration(configuration, asList(getIndexPath1(), getIndexPath2()),
+ seqOutputPath, SingleFieldDocument.ID_FIELD, asList(SingleFieldDocument.FIELD));
}
@After
@@ -50,15 +39,31 @@ public class SequenceFilesFromLuceneStor
@Test
public void testRun() throws IOException {
- commitDocuments(document1, document2, document3, document4);
-
+ //Two commit points in each of two different directories
+ commitDocuments(getDirectory(getIndexPath1AsFile()), docs.subList(0, 500));
+ commitDocuments(getDirectory(getIndexPath1AsFile()), docs.subList(1000, 1500));
+
+ commitDocuments(getDirectory(getIndexPath2AsFile()), docs.subList(500, 1000));
+ commitDocuments(getDirectory(getIndexPath2AsFile()), docs.subList(1500, 2000));
+ commitDocuments(getDirectory(getIndexPath1AsFile()), misshapenDocs);
lucene2seq.run(lucene2SeqConf);
Iterator<Pair<Text, Text>> iterator = lucene2SeqConf.getSequenceFileIterator();
-
- assertSimpleDocumentEquals(document1, iterator.next());
- assertSimpleDocumentEquals(document2, iterator.next());
- assertSimpleDocumentEquals(document3, iterator.next());
- assertSimpleDocumentEquals(document4, iterator.next());
+ Map<String, Text> map = new HashMap<String, Text>();
+ while (iterator.hasNext()) {
+ Pair<Text, Text> next = iterator.next();
+ map.put(next.getFirst().toString(), next.getSecond());
+ }
+ assertEquals(docs.size() + misshapenDocs.size(), map.size());
+ for (SingleFieldDocument doc : docs) {
+ Text value = map.get(doc.getId());
+ assertNotNull(value);
+ assertEquals(value.toString(), doc.getField());
+ }
+ for (SingleFieldDocument doc : misshapenDocs) {
+ Text value = map.get(doc.getId());
+ assertNotNull(value);
+ assertEquals(value.toString(), doc.getField());
+ }
}
}
Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageTest.java?rev=1490457&r1=1490456&r2=1490457&view=diff
==============================================================================
--- mahout/trunk/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageTest.java (original)
+++ mahout/trunk/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageTest.java Thu Jun 6 21:57:50 2013
@@ -17,7 +17,11 @@ import org.junit.Before;
import org.junit.Test;
import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
import static java.util.Arrays.asList;
import static org.junit.Assert.assertFalse;
@@ -26,10 +30,6 @@ public class SequenceFilesFromLuceneStor
private SequenceFilesFromLuceneStorage lucene2Seq;
private LuceneStorageConfiguration lucene2SeqConf;
-
- private SingleFieldDocument document1;
- private SingleFieldDocument document2;
- private SingleFieldDocument document3;
private Path seqFilesOutputPath;
private Configuration configuration;
@@ -37,18 +37,15 @@ public class SequenceFilesFromLuceneStor
@Before
public void before() throws IOException {
configuration = new Configuration();
- seqFilesOutputPath = new Path("seqfiles");
+ seqFilesOutputPath = new Path(getTestTempDirPath(), "seqfiles");
lucene2Seq = new SequenceFilesFromLuceneStorage();
lucene2SeqConf = new LuceneStorageConfiguration(configuration,
- asList(getIndexPath()),
+ asList(getIndexPath1(), getIndexPath2()),
seqFilesOutputPath,
SingleFieldDocument.ID_FIELD,
asList(SingleFieldDocument.FIELD));
- document1 = new SingleFieldDocument("1", "This is test document 1");
- document2 = new SingleFieldDocument("2", "This is test document 2");
- document3 = new SingleFieldDocument("3", "This is test document 3");
}
@After
@@ -57,55 +54,44 @@ public class SequenceFilesFromLuceneStor
HadoopUtil.delete(lucene2SeqConf.getConfiguration(), lucene2SeqConf.getIndexPaths());
}
- @SuppressWarnings("unchecked")
@Test
- public void testRun() throws Exception {
- commitDocuments(document1, document2, document3);
-
+ public void testRun2Directories() throws Exception {
+ //Two commit points in each of two different directories
+ commitDocuments(getDirectory(getIndexPath1AsFile()), docs.subList(0, 500));
+ commitDocuments(getDirectory(getIndexPath1AsFile()), docs.subList(1000, 1500));
+
+ commitDocuments(getDirectory(getIndexPath2AsFile()), docs.subList(500, 1000));
+ commitDocuments(getDirectory(getIndexPath2AsFile()), docs.subList(1500, 2000));
+
+ commitDocuments(getDirectory(getIndexPath1AsFile()), misshapenDocs);
lucene2Seq.run(lucene2SeqConf);
Iterator<Pair<Text, Text>> iterator = lucene2SeqConf.getSequenceFileIterator();
-
- assertSimpleDocumentEquals(document1, iterator.next());
- assertSimpleDocumentEquals(document2, iterator.next());
- assertSimpleDocumentEquals(document3, iterator.next());
- }
-
- @SuppressWarnings("unchecked")
- @Test
- public void testRun_skipEmptyIdFieldDocs() throws IOException {
- commitDocuments(document1, new SingleFieldDocument("", "This is a test document with no id"), document2);
-
- lucene2Seq.run(lucene2SeqConf);
-
- Iterator<Pair<Text, Text>> iterator = lucene2SeqConf.getSequenceFileIterator();
-
- assertSimpleDocumentEquals(document1, iterator.next());
- assertSimpleDocumentEquals(document2, iterator.next());
- assertFalse(iterator.hasNext());
- }
-
- @SuppressWarnings("unchecked")
- @Test
- public void testRun_skipEmptyFieldDocs() throws IOException {
- commitDocuments(document1, new SingleFieldDocument("4", ""), document2);
-
- lucene2Seq.run(lucene2SeqConf);
-
- Iterator<Pair<Text, Text>> iterator = lucene2SeqConf.getSequenceFileIterator();
-
- assertSimpleDocumentEquals(document1, iterator.next());
- assertSimpleDocumentEquals(document2, iterator.next());
- assertFalse(iterator.hasNext());
+ Map<String, Text> map = new HashMap<String, Text>();
+ while (iterator.hasNext()) {
+ Pair<Text, Text> next = iterator.next();
+ map.put(next.getFirst().toString(), next.getSecond());
+ }
+ assertEquals(docs.size() + misshapenDocs.size(), map.size());
+ for (SingleFieldDocument doc : docs) {
+ Text value = map.get(doc.getId());
+ assertNotNull(value);
+ assertEquals(value.toString(), doc.getField());
+ }
+ for (SingleFieldDocument doc : misshapenDocs) {
+ Text value = map.get(doc.getId());
+ assertNotNull(value);
+ assertEquals(value.toString(), doc.getField());
+ }
}
@SuppressWarnings("unchecked")
@Test
public void testRun_skipUnstoredFields() throws IOException {
- commitDocuments(new UnstoredFieldsDocument("5", "This is test document 5"));
+ commitDocuments(getDirectory(getIndexPath1AsFile()), new UnstoredFieldsDocument("5", "This is test document 5"));
- lucene2SeqConf = new LuceneStorageConfiguration(configuration,
- asList(getIndexPath()),
+ LuceneStorageConfiguration lucene2SeqConf = new LuceneStorageConfiguration(configuration,
+ asList(getIndexPath1()),
seqFilesOutputPath,
SingleFieldDocument.ID_FIELD,
asList(UnstoredFieldsDocument.FIELD, UnstoredFieldsDocument.UNSTORED_FIELD));
@@ -121,39 +107,51 @@ public class SequenceFilesFromLuceneStor
@SuppressWarnings("unchecked")
@Test
public void testRun_maxHits() throws IOException {
- commitDocuments(document1, document2, document3, new SingleFieldDocument("4", "This is test document 4"));
+ commitDocuments(getDirectory(getIndexPath1AsFile()), docs.subList(0, 500));
+ commitDocuments(getDirectory(getIndexPath1AsFile()), docs.subList(1000, 1500));
+
+ commitDocuments(getDirectory(getIndexPath2AsFile()), docs.subList(500, 1000));
+ commitDocuments(getDirectory(getIndexPath2AsFile()), docs.subList(1500, 2000));
lucene2SeqConf.setMaxHits(3);
lucene2Seq.run(lucene2SeqConf);
Iterator<Pair<Text, Text>> iterator = lucene2SeqConf.getSequenceFileIterator();
-
- assertSimpleDocumentEquals(document1, iterator.next());
- assertSimpleDocumentEquals(document2, iterator.next());
- assertSimpleDocumentEquals(document3, iterator.next());
+ assertTrue(iterator.hasNext());
+ iterator.next();
+ assertTrue(iterator.hasNext());
+ iterator.next();
+ assertTrue(iterator.hasNext());
+ iterator.next();
assertFalse(iterator.hasNext());
}
@SuppressWarnings("unchecked")
@Test
public void testRun_query() throws IOException {
- commitDocuments(document1, document2, document3, new SingleFieldDocument("4", "Mahout is cool"));
+ commitDocuments(getDirectory(getIndexPath1AsFile()), docs);
+ LuceneStorageConfiguration lucene2SeqConf = new LuceneStorageConfiguration(configuration,
+ asList(getIndexPath1()),
+ seqFilesOutputPath,
+ SingleFieldDocument.ID_FIELD,
+ asList(UnstoredFieldsDocument.FIELD, UnstoredFieldsDocument.UNSTORED_FIELD));
- Query query = new TermQuery(new Term(lucene2SeqConf.getFields().get(0), "mahout"));
+ Query query = new TermQuery(new Term(lucene2SeqConf.getFields().get(0), "599"));
lucene2SeqConf.setQuery(query);
lucene2Seq.run(lucene2SeqConf);
Iterator<Pair<Text, Text>> iterator = lucene2SeqConf.getSequenceFileIterator();
-
- assertSimpleDocumentEquals(new SingleFieldDocument("4", "Mahout is cool"), iterator.next());
+ assertTrue(iterator.hasNext());
+ Pair<Text, Text> next = iterator.next();
+ assertTrue(next.getSecond().toString().contains("599"));
assertFalse(iterator.hasNext());
}
@Test
public void testRun_multipleFields() throws IOException {
- lucene2SeqConf = new LuceneStorageConfiguration(configuration,
- asList(getIndexPath()),
+ LuceneStorageConfiguration lucene2SeqConf = new LuceneStorageConfiguration(configuration,
+ asList(getIndexPath1()),
seqFilesOutputPath,
SingleFieldDocument.ID_FIELD,
asList(MultipleFieldsDocument.FIELD, MultipleFieldsDocument.FIELD1, MultipleFieldsDocument.FIELD2));
@@ -161,7 +159,7 @@ public class SequenceFilesFromLuceneStor
MultipleFieldsDocument multipleFieldsDocument1 = new MultipleFieldsDocument("1", "This is field 1-1", "This is field 1-2", "This is field 1-3");
MultipleFieldsDocument multipleFieldsDocument2 = new MultipleFieldsDocument("2", "This is field 2-1", "This is field 2-2", "This is field 2-3");
MultipleFieldsDocument multipleFieldsDocument3 = new MultipleFieldsDocument("3", "This is field 3-1", "This is field 3-2", "This is field 3-3");
- commitDocuments(multipleFieldsDocument1, multipleFieldsDocument2, multipleFieldsDocument3);
+ commitDocuments(getDirectory(getIndexPath1AsFile()), multipleFieldsDocument1, multipleFieldsDocument2, multipleFieldsDocument3);
lucene2Seq.run(lucene2SeqConf);
@@ -174,8 +172,8 @@ public class SequenceFilesFromLuceneStor
@Test
public void testRun_numericField() throws IOException {
- lucene2SeqConf = new LuceneStorageConfiguration(configuration,
- asList(getIndexPath()),
+ LuceneStorageConfiguration lucene2SeqConf = new LuceneStorageConfiguration(configuration,
+ asList(getIndexPath1()),
seqFilesOutputPath,
SingleFieldDocument.ID_FIELD,
asList(NumericFieldDocument.FIELD, NumericFieldDocument.NUMERIC_FIELD));
@@ -184,7 +182,7 @@ public class SequenceFilesFromLuceneStor
NumericFieldDocument doc2 = new NumericFieldDocument("2", "This is field 2", 200);
NumericFieldDocument doc3 = new NumericFieldDocument("3", "This is field 3", 300);
- commitDocuments(doc1, doc2, doc3);
+ commitDocuments(getDirectory(getIndexPath1AsFile()), doc1, doc2, doc3);
lucene2Seq.run(lucene2SeqConf);