Posted to commits@mahout.apache.org by ak...@apache.org on 2015/04/08 01:59:36 UTC
[1/5] mahout git commit: Revert "MAHOUT-1649: Upgrade to be Lucene 4.10.4 compatible, patch from Frank Scholten"
Repository: mahout
Updated Branches:
refs/heads/master 5e41bc096 -> daad3a4ce
Revert "MAHOUT-1649: Upgrade to be Lucene 4.10.4 compatible, patch from Frank Scholten"
This reverts commit 4cff54295d9e494eaff46a190b73bcb8d491f7e5.
Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/db624ef3
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/db624ef3
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/db624ef3
Branch: refs/heads/master
Commit: db624ef35c8174e9bd5a9f63f9e2957e8bd594b8
Parents: c70d708
Author: Andrew Musselman <ak...@apache.org>
Authored: Mon Apr 6 09:22:52 2015 -0700
Committer: Andrew Musselman <ak...@apache.org>
Committed: Mon Apr 6 09:22:52 2015 -0700
----------------------------------------------------------------------
.../mahout/classifier/NewsgroupHelper.java | 25 +-
integration/pom.xml | 11 -
.../mahout/text/LuceneSegmentInputFormat.java | 5 +-
.../mahout/text/LuceneSegmentInputSplit.java | 14 +-
.../mahout/text/LuceneSegmentRecordReader.java | 4 +-
.../mahout/text/LuceneStorageConfiguration.java | 5 +-
.../text/MailArchivesClusteringAnalyzer.java | 21 +-
.../text/ReadOnlyFileSystemDirectory.java | 354 +++++++++++++++++++
.../text/SequenceFilesFromLuceneStorage.java | 6 +-
.../SequenceFilesFromLuceneStorageDriver.java | 4 +-
.../text/wikipedia/WikipediaAnalyzer.java | 11 +-
.../mahout/utils/regex/AnalyzerTransformer.java | 3 +-
mr/src/main/java/org/apache/mahout/Version.java | 6 +-
.../mahout/common/lucene/AnalyzerUtils.java | 4 +-
.../mahout/vectorizer/DictionaryVectorizer.java | 14 +-
.../mahout/vectorizer/DocumentProcessor.java | 2 +-
.../encoders/AdaptiveWordValueEncoder.java | 2 +-
.../encoders/ContinuousValueEncoder.java | 2 +-
.../mahout/vectorizer/encoders/Dictionary.java | 9 +-
.../encoders/FeatureVectorEncoder.java | 12 +-
.../encoders/InteractionValueEncoder.java | 5 +-
.../encoders/StaticWordValueEncoder.java | 4 +-
.../vectorizer/encoders/TextValueEncoder.java | 12 +-
.../vectorizer/term/TFPartialVectorReducer.java | 23 +-
.../mahout/vectorizer/tfidf/TFIDFConverter.java | 12 +-
.../encoders/TextValueEncoderTest.java | 12 +-
pom.xml | 4 +-
27 files changed, 478 insertions(+), 108 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java b/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java
index 532e023..3674a57 100644
--- a/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java
+++ b/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java
@@ -17,17 +17,6 @@
package org.apache.mahout.classifier;
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.IOException;
-import java.io.Reader;
-import java.io.StringReader;
-import java.text.SimpleDateFormat;
-import java.util.Collection;
-import java.util.Date;
-import java.util.Locale;
-import java.util.Random;
-
import com.google.common.collect.ConcurrentHashMultiset;
import com.google.common.collect.Multiset;
import com.google.common.io.Closeables;
@@ -37,6 +26,7 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.util.Version;
import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
@@ -44,6 +34,17 @@ import org.apache.mahout.vectorizer.encoders.ConstantValueEncoder;
import org.apache.mahout.vectorizer.encoders.FeatureVectorEncoder;
import org.apache.mahout.vectorizer.encoders.StaticWordValueEncoder;
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.text.SimpleDateFormat;
+import java.util.Collection;
+import java.util.Date;
+import java.util.Locale;
+import java.util.Random;
+
public final class NewsgroupHelper {
private static final SimpleDateFormat[] DATE_FORMATS = {
@@ -59,7 +60,7 @@ public final class NewsgroupHelper {
private static final long WEEK = 7 * 24 * 3600;
private final Random rand = RandomUtils.getRandom();
- private final Analyzer analyzer = new StandardAnalyzer();
+ private final Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_46);
private final FeatureVectorEncoder encoder = new StaticWordValueEncoder("body");
private final FeatureVectorEncoder bias = new ConstantValueEncoder("Intercept");
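The hunk above restores the Lucene 4.6 analyzer constructor that takes an explicit Version. As a minimal sketch (not taken from the Mahout sources, and assuming the Lucene 4.6.1 classpath this revert reinstates), the two constructor styles on either side of MAHOUT-1649 look like this:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.Version;

public class AnalyzerVersions {
  // Lucene 4.6.x (restored by this revert): the Version argument is required.
  static final Analyzer LUCENE_46_STYLE = new StandardAnalyzer(Version.LUCENE_46);

  // Lucene 4.10.x (the state MAHOUT-1649 introduced) allows a no-arg constructor:
  // static final Analyzer LUCENE_410_STYLE = new StandardAnalyzer();
}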
http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/integration/pom.xml
----------------------------------------------------------------------
diff --git a/integration/pom.xml b/integration/pom.xml
index 8960417..fcb85cb 100644
--- a/integration/pom.xml
+++ b/integration/pom.xml
@@ -137,17 +137,6 @@
<artifactId>lucene-analyzers-common</artifactId>
<optional>true</optional>
</dependency>
- <dependency>
- <groupId>org.apache.solr</groupId>
- <artifactId>solr-core</artifactId>
- <version>${lucene.version}</version>
- </dependency>
- <dependency>
- <groupId>commons-httpclient</groupId>
- <artifactId>commons-httpclient</artifactId>
- <version>3.1</version>
- </dependency>
-
<dependency>
<groupId>org.mongodb</groupId>
http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputFormat.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputFormat.java b/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputFormat.java
index f1a360e..1c4f8de 100644
--- a/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputFormat.java
+++ b/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputFormat.java
@@ -21,6 +21,7 @@ import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
@@ -31,7 +32,6 @@ import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.lucene.index.SegmentCommitInfo;
import org.apache.lucene.index.SegmentInfos;
-import org.apache.solr.store.hdfs.HdfsDirectory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -52,7 +52,8 @@ public class LuceneSegmentInputFormat extends InputFormat {
List<Path> indexPaths = lucene2SeqConfiguration.getIndexPaths();
for (Path indexPath : indexPaths) {
- HdfsDirectory directory = new HdfsDirectory(indexPath, configuration);
+ ReadOnlyFileSystemDirectory directory = new ReadOnlyFileSystemDirectory(FileSystem.get(configuration), indexPath,
+ false, configuration);
SegmentInfos segmentInfos = new SegmentInfos();
segmentInfos.read(directory);
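For context, a hedged sketch of how the restored ReadOnlyFileSystemDirectory plugs into segment enumeration; the HDFS path is a placeholder and the setup is assumed, not copied from the input format above:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.lucene.index.SegmentCommitInfo;
import org.apache.lucene.index.SegmentInfos;
import org.apache.mahout.text.ReadOnlyFileSystemDirectory;

import java.io.IOException;

public class ListSegments {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    Path indexPath = new Path("hdfs:///tmp/lucene-index"); // placeholder path

    ReadOnlyFileSystemDirectory dir =
        new ReadOnlyFileSystemDirectory(FileSystem.get(conf), indexPath, false, conf);

    SegmentInfos infos = new SegmentInfos();
    infos.read(dir); // Lucene 4.6-era API, as used in the hunk above
    for (SegmentCommitInfo info : infos) {
      System.out.println(info.info.name + " docs=" + info.info.getDocCount());
    }
  }
}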
http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputSplit.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputSplit.java b/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputSplit.java
index 1d94416..1441e32 100644
--- a/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputSplit.java
+++ b/integration/src/main/java/org/apache/mahout/text/LuceneSegmentInputSplit.java
@@ -16,18 +16,18 @@
*/
package org.apache.mahout.text;
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.IOException;
-
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.lucene.index.SegmentCommitInfo;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.SegmentInfos;
-import org.apache.solr.store.hdfs.HdfsDirectory;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
/**
* {@link InputSplit} implementation that represents a Lucene segment.
@@ -88,7 +88,9 @@ public class LuceneSegmentInputSplit extends InputSplit implements Writable {
* @throws IOException if an error occurs when accessing the directory
*/
public SegmentCommitInfo getSegment(Configuration configuration) throws IOException {
- HdfsDirectory directory = new HdfsDirectory(indexPath, configuration);
+ ReadOnlyFileSystemDirectory directory = new ReadOnlyFileSystemDirectory(FileSystem.get(configuration), indexPath,
+ false, configuration);
+
SegmentInfos segmentInfos = new SegmentInfos();
segmentInfos.read(directory);
http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/integration/src/main/java/org/apache/mahout/text/LuceneSegmentRecordReader.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/text/LuceneSegmentRecordReader.java b/integration/src/main/java/org/apache/mahout/text/LuceneSegmentRecordReader.java
index fe6a407..485e856 100644
--- a/integration/src/main/java/org/apache/mahout/text/LuceneSegmentRecordReader.java
+++ b/integration/src/main/java/org/apache/mahout/text/LuceneSegmentRecordReader.java
@@ -62,8 +62,9 @@ public class LuceneSegmentRecordReader extends RecordReader<Text, NullWritable>
for (String field : lucene2SeqConfiguration.getFields()) {
LuceneIndexHelper.fieldShouldExistInIndex(segmentReader, field);
}
+
Weight weight = lucene2SeqConfiguration.getQuery().createWeight(searcher);
- scorer = weight.scorer(segmentReader.getContext(), segmentReader.getLiveDocs());
+ scorer = weight.scorer(segmentReader.getContext(), false, false, null);
if (scorer == null) {
throw new IllegalArgumentException("Could not create query scorer for query: "
+ lucene2SeqConfiguration.getQuery());
@@ -73,6 +74,7 @@ public class LuceneSegmentRecordReader extends RecordReader<Text, NullWritable>
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
nextDocId = scorer.nextDoc();
+
return nextDocId != Scorer.NO_MORE_DOCS;
}
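The scorer() call above reverts to the four-argument Lucene 4.6 signature (context, scoreDocsInOrder, topScorer, acceptDocs). A minimal sketch of that segment-scoring loop, with the reader and query assumed rather than taken from the record reader:

import org.apache.lucene.index.SegmentReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Weight;

import java.io.IOException;

public class SegmentScoring {
  static void dumpMatchingDocIds(SegmentReader segmentReader) throws IOException {
    IndexSearcher searcher = new IndexSearcher(segmentReader);
    Weight weight = searcher.createNormalizedWeight(new MatchAllDocsQuery());

    // Lucene 4.6: scoreDocsInOrder=false, topScorer=false, acceptDocs=null
    Scorer scorer = weight.scorer(segmentReader.getContext(), false, false, null);
    if (scorer == null) {
      return; // no matching documents in this segment
    }
    int docId;
    while ((docId = scorer.nextDoc()) != Scorer.NO_MORE_DOCS) {
      System.out.println("matched doc " + docId);
    }
  }
}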
http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/integration/src/main/java/org/apache/mahout/text/LuceneStorageConfiguration.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/text/LuceneStorageConfiguration.java b/integration/src/main/java/org/apache/mahout/text/LuceneStorageConfiguration.java
index f7a077b..b36f3e9 100644
--- a/integration/src/main/java/org/apache/mahout/text/LuceneStorageConfiguration.java
+++ b/integration/src/main/java/org/apache/mahout/text/LuceneStorageConfiguration.java
@@ -40,12 +40,13 @@ import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
-import org.apache.lucene.util.Version;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
+import static org.apache.lucene.util.Version.LUCENE_46;
+
/**
* Holds all the configuration for {@link SequenceFilesFromLuceneStorage}, which generates a sequence file
* with id as the key and a content field as value.
@@ -212,7 +213,7 @@ public class LuceneStorageConfiguration implements Writable {
}
idField = in.readUTF();
fields = Arrays.asList(in.readUTF().split(SEPARATOR_FIELDS));
- query = new QueryParser(Version.LUCENE_4_10_4, "query", new StandardAnalyzer()).parse(in.readUTF());
+ query = new QueryParser(LUCENE_46, "query", new StandardAnalyzer(LUCENE_46)).parse(in.readUTF());
maxHits = in.readInt();
} catch (ParseException e) {
throw new RuntimeException("Could not deserialize " + this.getClass().getName(), e);
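A minimal sketch of the Version-explicit QueryParser construction this hunk reverts to; the field name and query string are placeholders:

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.Version;

public class ParseQuery {
  public static void main(String[] args) throws ParseException {
    // Lucene 4.6: both QueryParser and StandardAnalyzer take an explicit Version
    QueryParser parser =
        new QueryParser(Version.LUCENE_46, "query", new StandardAnalyzer(Version.LUCENE_46));
    Query q = parser.parse("body:mahout AND subject:lucene");
    System.out.println(q);
  }
}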
http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java b/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java
index 9560142..8776c5f 100644
--- a/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java
+++ b/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java
@@ -34,6 +34,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
+import org.apache.lucene.util.Version;
/**
* Custom Lucene Analyzer designed for aggressive feature reduction
@@ -41,12 +42,13 @@ import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
* stop words, excluding non-alpha-numeric tokens, and porter stemming.
*/
public final class MailArchivesClusteringAnalyzer extends StopwordAnalyzerBase {
-
+ private static final Version LUCENE_VERSION = Version.LUCENE_46;
+
// extended set of stop words composed of common mail terms like "hi",
// HTML tags, and Java keywords as many of the messages in the archives
// are subversion check-in notifications
- private static final CharArraySet STOP_SET = new CharArraySet(Arrays.asList(
+ private static final CharArraySet STOP_SET = new CharArraySet(LUCENE_VERSION, Arrays.asList(
"3d","7bit","a0","about","above","abstract","across","additional","after",
"afterwards","again","against","align","all","almost","alone","along",
"already","also","although","always","am","among","amongst","amoungst",
@@ -106,17 +108,22 @@ public final class MailArchivesClusteringAnalyzer extends StopwordAnalyzerBase {
private static final Matcher MATCHER = ALPHA_NUMERIC.matcher("");
public MailArchivesClusteringAnalyzer() {
- super(STOP_SET);
+ super(LUCENE_VERSION, STOP_SET);
}
+ public MailArchivesClusteringAnalyzer(CharArraySet stopSet) {
+ super(LUCENE_VERSION, stopSet);
+
+ }
+
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
- Tokenizer tokenizer = new StandardTokenizer(reader);
- TokenStream result = new StandardFilter(tokenizer);
- result = new LowerCaseFilter(result);
+ Tokenizer tokenizer = new StandardTokenizer(LUCENE_VERSION, reader);
+ TokenStream result = new StandardFilter(LUCENE_VERSION, tokenizer);
+ result = new LowerCaseFilter(LUCENE_VERSION, result);
result = new ASCIIFoldingFilter(result);
result = new AlphaNumericMaxLengthFilter(result);
- result = new StopFilter(result, STOP_SET);
+ result = new StopFilter(LUCENE_VERSION, result, STOP_SET);
result = new PorterStemFilter(result);
return new TokenStreamComponents(tokenizer, result);
}
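A hedged usage sketch of the analyzer whose components are rebuilt above, showing the standard Lucene 4.x reset/incrementToken/end protocol for consuming tokens; the sample text is a placeholder:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.mahout.text.MailArchivesClusteringAnalyzer;

import java.io.IOException;

public class TokenizeMail {
  public static void main(String[] args) throws IOException {
    Analyzer analyzer = new MailArchivesClusteringAnalyzer();
    try (TokenStream ts = analyzer.tokenStream("body", "Hi all, svn check-in r12345 updated pom.xml")) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        System.out.println(term.toString()); // lower-cased, stemmed, stop words removed
      }
      ts.end();
    }
  }
}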
http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/integration/src/main/java/org/apache/mahout/text/ReadOnlyFileSystemDirectory.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/text/ReadOnlyFileSystemDirectory.java b/integration/src/main/java/org/apache/mahout/text/ReadOnlyFileSystemDirectory.java
new file mode 100644
index 0000000..e97e35b
--- /dev/null
+++ b/integration/src/main/java/org/apache/mahout/text/ReadOnlyFileSystemDirectory.java
@@ -0,0 +1,354 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.text;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.lucene.store.BaseDirectory;
+import org.apache.lucene.store.BufferedIndexInput;
+import org.apache.lucene.store.BufferedIndexOutput;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.Lock;
+import org.apache.lucene.store.LockFactory;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.Collection;
+
+//TODO: is there a better way of doing this in Lucene 4.x?
+
+/**
+ * This class implements a read-only Lucene Directory on top of a general FileSystem.
+ * Currently it does not support locking.
+ * <p/>
+ * // TODO: Rename to FileSystemReadOnlyDirectory
+ */
+public class ReadOnlyFileSystemDirectory extends BaseDirectory {
+
+ private final FileSystem fs;
+ private final Path directory;
+ private final int ioFileBufferSize;
+
+ private static final Logger log = LoggerFactory.getLogger(ReadOnlyFileSystemDirectory.class);
+
+ /**
+ * Constructor
+ *
+ * @param fs - filesystem
+ * @param directory - directory path
+ * @param create - if true create the directory
+ * @param conf - MR Job Configuration
+ * @throws IOException
+ */
+
+ public ReadOnlyFileSystemDirectory(FileSystem fs, Path directory, boolean create,
+ Configuration conf) throws IOException {
+
+ this.fs = fs;
+ this.directory = directory;
+ this.ioFileBufferSize = conf.getInt("io.file.buffer.size", 4096);
+
+ if (create) {
+ create();
+ }
+
+ boolean isDir = false;
+ try {
+ FileStatus status = fs.getFileStatus(directory);
+ if (status != null) {
+ isDir = status.isDir();
+ }
+ } catch (IOException e) {
+ log.error(e.getMessage(), e);
+ }
+ if (!isDir) {
+ throw new IOException(directory + " is not a directory");
+ }
+ }
+
+
+ private void create() throws IOException {
+ if (!fs.exists(directory)) {
+ fs.mkdirs(directory);
+ }
+
+ boolean isDir = false;
+ try {
+ FileStatus status = fs.getFileStatus(directory);
+ if (status != null) {
+ isDir = status.isDir();
+ }
+ } catch (IOException e) {
+ log.error(e.getMessage(), e);
+ }
+ if (!isDir) {
+ throw new IOException(directory + " is not a directory");
+ }
+
+ // clear old index files
+ FileStatus[] fileStatus =
+ fs.listStatus(directory, LuceneIndexFileNameFilter.getFilter());
+ for (FileStatus status : fileStatus) {
+ if (!fs.delete(status.getPath(), true)) {
+ throw new IOException("Cannot delete index file "
+ + status.getPath());
+ }
+ }
+ }
+
+ public String[] list() throws IOException {
+ FileStatus[] fileStatus =
+ fs.listStatus(directory, LuceneIndexFileNameFilter.getFilter());
+ String[] result = new String[fileStatus.length];
+ for (int i = 0; i < fileStatus.length; i++) {
+ result[i] = fileStatus[i].getPath().getName();
+ }
+ return result;
+ }
+
+ @Override
+ public String[] listAll() throws IOException {
+ return list();
+ }
+
+ @Override
+ public boolean fileExists(String name) throws IOException {
+ return fs.exists(new Path(directory, name));
+ }
+
+ @Override
+ public long fileLength(String name) throws IOException {
+ return fs.getFileStatus(new Path(directory, name)).getLen();
+ }
+
+ @Override
+ public void deleteFile(String name) throws IOException {
+ if (!fs.delete(new Path(directory, name), true)) {
+ throw new IOException("Cannot delete index file " + name);
+ }
+ }
+
+ @Override
+ public IndexOutput createOutput(String name, IOContext context) throws IOException {
+ //TODO: What should we be doing with the IOContext here, if anything?
+ Path file = new Path(directory, name);
+ if (fs.exists(file) && !fs.delete(file, true)) {
+ // delete the existing one if applicable
+ throw new IOException("Cannot overwrite index file " + file);
+ }
+
+ return new FileSystemIndexOutput(file, ioFileBufferSize);
+ }
+
+ @Override
+ public void sync(Collection<String> names) throws IOException {
+ // do nothing, as this is read-only
+ }
+
+ @Override
+ public IndexInput openInput(String name, IOContext context) throws IOException {
+ return new FileSystemIndexInput(new Path(directory, name), ioFileBufferSize);
+ }
+
+ @Override
+ public Lock makeLock(final String name) {
+ return new Lock() {
+ public boolean obtain() {
+ return true;
+ }
+
+ public void release() {
+ }
+
+ public boolean isLocked() {
+ throw new UnsupportedOperationException();
+ }
+
+ public String toString() {
+ return "Lock@" + new Path(directory, name);
+ }
+ };
+ }
+
+ @Override
+ public void clearLock(String name) throws IOException {
+ // do nothing
+ }
+
+ @Override
+ public void close() throws IOException {
+ // do not close the file system
+ }
+
+ @Override
+ public void setLockFactory(LockFactory lockFactory) throws IOException {
+ // do nothing
+ }
+
+ @Override
+ public LockFactory getLockFactory() {
+ return null;
+ }
+
+ @Override
+ public String toString() {
+ return this.getClass().getName() + "@" + directory;
+ }
+
+ private class FileSystemIndexInput extends BufferedIndexInput implements Cloneable {
+
+ // shared by clones
+ private class Descriptor {
+ public final FSDataInputStream in;
+ public long position; // cache of in.getPos()
+
+ public Descriptor(Path file, int ioFileBufferSize) throws IOException {
+ this.in = fs.open(file, ioFileBufferSize);
+ }
+ }
+
+ private final Path filePath; // for debugging
+ private final Descriptor descriptor;
+ private final long length;
+ private boolean isOpen;
+ private boolean isClone;
+
+ public FileSystemIndexInput(Path path, int ioFileBufferSize)
+ throws IOException {
+ super("FSII_" + path.getName(), ioFileBufferSize);
+ filePath = path;
+ descriptor = new Descriptor(path, ioFileBufferSize);
+ length = fs.getFileStatus(path).getLen();
+ isOpen = true;
+ }
+
+ @Override
+ protected void readInternal(byte[] b, int offset, int len)
+ throws IOException {
+ long position = getFilePointer();
+ if (position != descriptor.position) {
+ descriptor.in.seek(position);
+ descriptor.position = position;
+ }
+ int total = 0;
+ do {
+ int i = descriptor.in.read(b, offset + total, len - total);
+ if (i == -1) {
+ throw new IOException("Read past EOF");
+ }
+ descriptor.position += i;
+ total += i;
+ } while (total < len);
+ }
+
+ @Override
+ public void close() throws IOException {
+ if (!isClone) {
+ if (isOpen) {
+ descriptor.in.close();
+ isOpen = false;
+ } else {
+ throw new IOException("Index file " + filePath + " already closed");
+ }
+ }
+ }
+
+ @Override
+ protected void seekInternal(long position) {
+ // handled in readInternal()
+ }
+
+ @Override
+ public long length() {
+ return length;
+ }
+
+ @Override
+ protected void finalize() throws Throwable {
+ super.finalize();
+ if (!isClone && isOpen) {
+ close(); // close the file
+ }
+ }
+
+ @Override
+ public BufferedIndexInput clone() {
+ FileSystemIndexInput clone = (FileSystemIndexInput) super.clone();
+ clone.isClone = true;
+ return clone;
+ }
+ }
+
+ private class FileSystemIndexOutput extends BufferedIndexOutput {
+
+ private final Path filePath; // for debugging
+ private final FSDataOutputStream out;
+ private boolean isOpen;
+
+ public FileSystemIndexOutput(Path path, int ioFileBufferSize)
+ throws IOException {
+ filePath = path;
+ // overwrite is true by default
+ out = fs.create(path, true, ioFileBufferSize);
+ isOpen = true;
+ }
+
+ @Override
+ public void flushBuffer(byte[] b, int offset, int size) throws IOException {
+ out.write(b, offset, size);
+ }
+
+ @Override
+ public void close() throws IOException {
+ if (isOpen) {
+ super.close();
+ out.close();
+ isOpen = false;
+ } else {
+ throw new IOException("Index file " + filePath + " already closed");
+ }
+ }
+
+ @Override
+ public void seek(long pos) throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public long length() throws IOException {
+ return out.getPos();
+ }
+
+ @Override
+ protected void finalize() throws Throwable {
+ super.finalize();
+ if (isOpen) {
+ close(); // close the file
+ }
+ }
+ }
+
+}
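A hedged usage sketch for the class added back above: open a DirectoryReader over an index that already lives on a Hadoop FileSystem and read one stored field. The path and the "id" field are placeholders, not assumptions about any particular Mahout index layout:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.lucene.index.DirectoryReader;
import org.apache.mahout.text.ReadOnlyFileSystemDirectory;

import java.io.IOException;

public class ReadFromHadoopIndex {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    Path indexPath = new Path("hdfs:///tmp/lucene-index"); // placeholder

    ReadOnlyFileSystemDirectory dir =
        new ReadOnlyFileSystemDirectory(FileSystem.get(conf), indexPath, false, conf);
    try (DirectoryReader reader = DirectoryReader.open(dir)) {
      System.out.println("docs in index: " + reader.numDocs());
      if (reader.numDocs() > 0) {
        System.out.println(reader.document(0).get("id")); // "id" is a placeholder field
      }
    }
  }
}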
http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorage.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorage.java b/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorage.java
index 41c3f84..b7fd495 100644
--- a/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorage.java
+++ b/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorage.java
@@ -22,7 +22,6 @@ import java.util.List;
import com.google.common.base.Strings;
import com.google.common.io.Closeables;
-import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
@@ -41,6 +40,8 @@ import org.apache.lucene.store.FSDirectory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import static org.apache.commons.lang.StringUtils.isBlank;
+
/**
* Generates a sequence file from a Lucene index with a specified id field as the key and a content field as the value.
* Configure this class with a {@link LuceneStorageConfiguration} bean.
@@ -81,6 +82,7 @@ public class SequenceFilesFromLuceneStorage {
processedDocs = writerCollector.processedDocs;
Closeables.close(sequenceFileWriter, false);
directory.close();
+ //searcher.close();
reader.close();
}
}
@@ -115,7 +117,7 @@ public class SequenceFilesFromLuceneStorage {
Text theValue = new Text();
LuceneSeqFileHelper.populateValues(doc, theValue, fields);
//if they are both empty, don't write
- if (StringUtils.isBlank(theKey.toString()) && StringUtils.isBlank(theValue.toString())) {
+ if (isBlank(theKey.toString()) && isBlank(theValue.toString())) {
return;
}
sequenceFileWriter.append(theKey, theValue);
http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriver.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriver.java b/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriver.java
index cd019ac..1bd3f3e 100644
--- a/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriver.java
+++ b/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromLuceneStorageDriver.java
@@ -30,6 +30,7 @@ import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
+import org.apache.lucene.util.Version;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
@@ -95,7 +96,8 @@ public class SequenceFilesFromLuceneStorageDriver extends AbstractJob {
if (hasOption(OPTION_QUERY)) {
try {
String queryString = COMPILE.matcher(getOption(OPTION_QUERY)).replaceAll("");
- QueryParser queryParser = new QueryParser(queryString, new StandardAnalyzer());
+ QueryParser queryParser = new QueryParser(Version.LUCENE_46, queryString,
+ new StandardAnalyzer(Version.LUCENE_46));
query = queryParser.parse(queryString);
} catch (ParseException e) {
throw new IllegalArgumentException(e.getMessage(), e);
http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaAnalyzer.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaAnalyzer.java b/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaAnalyzer.java
index 4ad0367..ad55ba7 100644
--- a/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaAnalyzer.java
+++ b/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaAnalyzer.java
@@ -28,24 +28,25 @@ import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;
+import org.apache.lucene.util.Version;
public class WikipediaAnalyzer extends StopwordAnalyzerBase {
public WikipediaAnalyzer() {
- super(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
+ super(Version.LUCENE_46, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
}
public WikipediaAnalyzer(CharArraySet stopSet) {
- super(stopSet);
+ super(Version.LUCENE_46, stopSet);
}
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new WikipediaTokenizer(reader);
- TokenStream result = new StandardFilter(tokenizer);
- result = new LowerCaseFilter(result);
- result = new StopFilter(result, getStopwordSet());
+ TokenStream result = new StandardFilter(Version.LUCENE_46, tokenizer);
+ result = new LowerCaseFilter(Version.LUCENE_46, result);
+ result = new StopFilter(Version.LUCENE_46, result, getStopwordSet());
return new TokenStreamComponents(tokenizer, result);
}
}
http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java b/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java
index 5b7f0af..36b166a 100644
--- a/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java
+++ b/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java
@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.util.Version;
import org.apache.mahout.common.lucene.TokenStreamIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -36,7 +37,7 @@ public class AnalyzerTransformer implements RegexTransformer {
private static final Logger log = LoggerFactory.getLogger(AnalyzerTransformer.class);
public AnalyzerTransformer() {
- this(new StandardAnalyzer());
+ this(new StandardAnalyzer(Version.LUCENE_46), "text");
}
public AnalyzerTransformer(Analyzer analyzer) {
http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/mr/src/main/java/org/apache/mahout/Version.java
----------------------------------------------------------------------
diff --git a/mr/src/main/java/org/apache/mahout/Version.java b/mr/src/main/java/org/apache/mahout/Version.java
index aac359c..5f3c879 100644
--- a/mr/src/main/java/org/apache/mahout/Version.java
+++ b/mr/src/main/java/org/apache/mahout/Version.java
@@ -17,10 +17,10 @@
package org.apache.mahout;
-import java.io.IOException;
-
+import com.google.common.base.Charsets;
import com.google.common.io.Resources;
-import org.apache.commons.io.Charsets;
+
+import java.io.IOException;
public final class Version {
http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/mr/src/main/java/org/apache/mahout/common/lucene/AnalyzerUtils.java
----------------------------------------------------------------------
diff --git a/mr/src/main/java/org/apache/mahout/common/lucene/AnalyzerUtils.java b/mr/src/main/java/org/apache/mahout/common/lucene/AnalyzerUtils.java
index e91423d..37ca383 100644
--- a/mr/src/main/java/org/apache/mahout/common/lucene/AnalyzerUtils.java
+++ b/mr/src/main/java/org/apache/mahout/common/lucene/AnalyzerUtils.java
@@ -32,7 +32,7 @@ public final class AnalyzerUtils {
* @throws ClassNotFoundException - {@link ClassNotFoundException}
*/
public static Analyzer createAnalyzer(String analyzerClassName) throws ClassNotFoundException {
- return createAnalyzer(analyzerClassName, Version.LUCENE_4_10_4);
+ return createAnalyzer(analyzerClassName, Version.LUCENE_46);
}
public static Analyzer createAnalyzer(String analyzerClassName, Version version) throws ClassNotFoundException {
@@ -47,7 +47,7 @@ public final class AnalyzerUtils {
* @return {@link Analyzer}
*/
public static Analyzer createAnalyzer(Class<? extends Analyzer> analyzerClass) {
- return createAnalyzer(analyzerClass, Version.LUCENE_4_10_4);
+ return createAnalyzer(analyzerClass, Version.LUCENE_46);
}
public static Analyzer createAnalyzer(Class<? extends Analyzer> analyzerClass, Version version) {
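For reference, a small sketch (not from the Mahout sources) of the two entry points touched above; after the revert the no-Version overloads default to Version.LUCENE_46:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.util.Version;
import org.apache.mahout.common.lucene.AnalyzerUtils;

public class CreateAnalyzers {
  public static void main(String[] args) throws ClassNotFoundException {
    // defaults to Version.LUCENE_46 after this revert
    Analyzer byName =
        AnalyzerUtils.createAnalyzer("org.apache.lucene.analysis.standard.StandardAnalyzer");
    // the explicit-version overload is unchanged
    Analyzer explicit = AnalyzerUtils.createAnalyzer(
        "org.apache.lucene.analysis.standard.StandardAnalyzer", Version.LUCENE_46);
    System.out.println(byName + " / " + explicit);
  }
}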
http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/mr/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java
----------------------------------------------------------------------
diff --git a/mr/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java b/mr/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java
index 6765c80..8a1f8f8 100644
--- a/mr/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java
+++ b/mr/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java
@@ -18,11 +18,8 @@
package org.apache.mahout.vectorizer;
import com.google.common.base.Preconditions;
+import com.google.common.collect.Lists;
import com.google.common.io.Closeables;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
@@ -57,6 +54,10 @@ import org.apache.mahout.vectorizer.term.TermCountReducer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.List;
+
/**
* This class converts a set of input documents in the sequence file format to vectors. The Sequence file
* input should have a {@link Text} key containing the unique document identifier and a {@link StringTuple}
@@ -191,7 +192,7 @@ public final class DictionaryVectorizer extends AbstractJob implements Vectorize
}
int partialVectorIndex = 0;
- Collection<Path> partialVectorPaths = new ArrayList<>();
+ Collection<Path> partialVectorPaths = Lists.newArrayList();
for (Path dictionaryChunk : dictionaryChunks) {
Path partialVectorOutputPath = new Path(output, VECTOR_OUTPUT_FOLDER + partialVectorIndex++);
partialVectorPaths.add(partialVectorOutputPath);
@@ -216,7 +217,7 @@ public final class DictionaryVectorizer extends AbstractJob implements Vectorize
Configuration baseConf,
int chunkSizeInMegabytes,
int[] maxTermDimension) throws IOException {
- List<Path> chunkPaths = new ArrayList<>();
+ List<Path> chunkPaths = Lists.newArrayList();
Configuration conf = new Configuration(baseConf);
@@ -274,7 +275,6 @@ public final class DictionaryVectorizer extends AbstractJob implements Vectorize
* @param output
* output directory were the partial vectors have to be created
* @param dimension
- * number of features
* @param sequentialAccess
* output vectors should be optimized for sequential access
* @param namedVectors
http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/mr/src/main/java/org/apache/mahout/vectorizer/DocumentProcessor.java
----------------------------------------------------------------------
diff --git a/mr/src/main/java/org/apache/mahout/vectorizer/DocumentProcessor.java b/mr/src/main/java/org/apache/mahout/vectorizer/DocumentProcessor.java
index 8f57e3a..2c3c236 100644
--- a/mr/src/main/java/org/apache/mahout/vectorizer/DocumentProcessor.java
+++ b/mr/src/main/java/org/apache/mahout/vectorizer/DocumentProcessor.java
@@ -38,7 +38,7 @@ import org.apache.mahout.vectorizer.document.SequenceFileTokenizerMapper;
* containing the unique document identifier and a
* {@link Text} value containing the whole document. The document should be stored in UTF-8 encoding which is
* recognizable by hadoop. It uses the given {@link Analyzer} to process the document into
- * {@link org.apache.lucene.analysis.TokenStream}s.
+ * {@link org.apache.lucene.analysis.Token}s.
*
*/
public final class DocumentProcessor {
http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/mr/src/main/java/org/apache/mahout/vectorizer/encoders/AdaptiveWordValueEncoder.java
----------------------------------------------------------------------
diff --git a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/AdaptiveWordValueEncoder.java b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/AdaptiveWordValueEncoder.java
index 6a746ce..04b718e 100644
--- a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/AdaptiveWordValueEncoder.java
+++ b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/AdaptiveWordValueEncoder.java
@@ -17,9 +17,9 @@
package org.apache.mahout.vectorizer.encoders;
+import com.google.common.base.Charsets;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;
-import org.apache.commons.io.Charsets;
import org.apache.mahout.math.Vector;
/**
http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/mr/src/main/java/org/apache/mahout/vectorizer/encoders/ContinuousValueEncoder.java
----------------------------------------------------------------------
diff --git a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/ContinuousValueEncoder.java b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/ContinuousValueEncoder.java
index a0349c0..14382a5 100644
--- a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/ContinuousValueEncoder.java
+++ b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/ContinuousValueEncoder.java
@@ -17,7 +17,7 @@
package org.apache.mahout.vectorizer.encoders;
-import org.apache.commons.io.Charsets;
+import com.google.common.base.Charsets;
import org.apache.mahout.math.Vector;
/**
http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/mr/src/main/java/org/apache/mahout/vectorizer/encoders/Dictionary.java
----------------------------------------------------------------------
diff --git a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/Dictionary.java b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/Dictionary.java
index 60c89f7..2ea9b1b 100644
--- a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/Dictionary.java
+++ b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/Dictionary.java
@@ -17,8 +17,9 @@
package org.apache.mahout.vectorizer.encoders;
-import java.util.ArrayList;
-import java.util.LinkedHashMap;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+
import java.util.List;
import java.util.Map;
@@ -26,7 +27,7 @@ import java.util.Map;
* Assigns integer codes to strings as they appear.
*/
public class Dictionary {
- private final Map<String, Integer> dict = new LinkedHashMap<>();
+ private final Map<String, Integer> dict = Maps.newLinkedHashMap();
public int intern(String s) {
if (!dict.containsKey(s)) {
@@ -37,7 +38,7 @@ public class Dictionary {
public List<String> values() {
// order of keySet is guaranteed to be insertion order
- return new ArrayList<>(dict.keySet());
+ return Lists.newArrayList(dict.keySet());
}
public int size() {
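A short usage sketch (not from the Mahout tests) of the Dictionary shown above: intern() assigns codes in insertion order and values() returns the terms in that same order:

import org.apache.mahout.vectorizer.encoders.Dictionary;

public class DictionaryDemo {
  public static void main(String[] args) {
    Dictionary dict = new Dictionary();
    System.out.println(dict.intern("alpha")); // 0
    System.out.println(dict.intern("beta"));  // 1
    System.out.println(dict.intern("alpha")); // still 0, already interned
    System.out.println(dict.values());        // [alpha, beta]
    System.out.println(dict.size());          // 2
  }
}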
http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/mr/src/main/java/org/apache/mahout/vectorizer/encoders/FeatureVectorEncoder.java
----------------------------------------------------------------------
diff --git a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/FeatureVectorEncoder.java b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/FeatureVectorEncoder.java
index 12f1848..96498d7 100644
--- a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/FeatureVectorEncoder.java
+++ b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/FeatureVectorEncoder.java
@@ -17,15 +17,15 @@
package org.apache.mahout.vectorizer.encoders;
+import com.google.common.base.Charsets;
+import com.google.common.collect.Sets;
+import org.apache.mahout.math.MurmurHash;
+import org.apache.mahout.math.Vector;
+
import java.util.Collections;
-import java.util.HashSet;
import java.util.Map;
import java.util.Set;
-import org.apache.commons.io.Charsets;
-import org.apache.mahout.math.MurmurHash;
-import org.apache.mahout.math.Vector;
-
/**
* General interface for objects that record features into a feature vector.
* <p/>
@@ -257,7 +257,7 @@ public abstract class FeatureVectorEncoder {
}
Set<Integer> trace = traceDictionary.get(key);
if (trace == null) {
- trace = new HashSet<>(n);
+ trace = Sets.newHashSet(n);
traceDictionary.put(key, trace);
} else {
trace.add(n);
http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/mr/src/main/java/org/apache/mahout/vectorizer/encoders/InteractionValueEncoder.java
----------------------------------------------------------------------
diff --git a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/InteractionValueEncoder.java b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/InteractionValueEncoder.java
index bd6594a..0be8823 100644
--- a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/InteractionValueEncoder.java
+++ b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/InteractionValueEncoder.java
@@ -19,9 +19,10 @@ package org.apache.mahout.vectorizer.encoders;
import java.util.Locale;
-import org.apache.commons.io.Charsets;
import org.apache.mahout.math.Vector;
+import com.google.common.base.Charsets;
+
public class InteractionValueEncoder extends FeatureVectorEncoder {
private final FeatureVectorEncoder firstEncoder;
private final FeatureVectorEncoder secondEncoder;
@@ -87,7 +88,7 @@ public class InteractionValueEncoder extends FeatureVectorEncoder {
int n = (k + j) % data.size();
if (isTraceEnabled()) {
trace(String.format("%s:%s", new String(originalForm1, Charsets.UTF_8), new String(originalForm2,
- Charsets.UTF_8)), n);
+ Charsets.UTF_8)), n);
}
data.set(n, data.get(n) + w);
}
http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/mr/src/main/java/org/apache/mahout/vectorizer/encoders/StaticWordValueEncoder.java
----------------------------------------------------------------------
diff --git a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/StaticWordValueEncoder.java b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/StaticWordValueEncoder.java
index 826f669..6f67ef4 100644
--- a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/StaticWordValueEncoder.java
+++ b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/StaticWordValueEncoder.java
@@ -17,11 +17,11 @@
package org.apache.mahout.vectorizer.encoders;
+import com.google.common.base.Charsets;
+
import java.util.Collections;
import java.util.Map;
-import org.apache.commons.io.Charsets;
-
/**
* Encodes a categorical values with an unbounded vocabulary. Values are encoding by incrementing a
* few locations in the output vector with a weight that is either defaulted to 1 or that is looked
http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/mr/src/main/java/org/apache/mahout/vectorizer/encoders/TextValueEncoder.java
----------------------------------------------------------------------
diff --git a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/TextValueEncoder.java b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/TextValueEncoder.java
index 943fbc8..87de095 100644
--- a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/TextValueEncoder.java
+++ b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/TextValueEncoder.java
@@ -17,16 +17,16 @@
package org.apache.mahout.vectorizer.encoders;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.regex.Pattern;
-
+import com.google.common.base.Charsets;
import com.google.common.base.Splitter;
import com.google.common.collect.HashMultiset;
+import com.google.common.collect.Lists;
import com.google.common.collect.Multiset;
-import org.apache.commons.io.Charsets;
import org.apache.mahout.math.Vector;
+import java.util.Collection;
+import java.util.regex.Pattern;
+
/**
* Encodes text that is tokenized on non-alphanum separators. Each word is encoded using a
* settable encoder which is by default an StaticWordValueEncoder which gives all
@@ -98,7 +98,7 @@ public class TextValueEncoder extends FeatureVectorEncoder {
@Override
protected Iterable<Integer> hashesForProbe(byte[] originalForm, int dataSize, String name, int probe) {
- Collection<Integer> hashes = new ArrayList<>();
+ Collection<Integer> hashes = Lists.newArrayList();
for (String word : tokenize(new String(originalForm, Charsets.UTF_8))) {
hashes.add(hashForProbe(bytesForString(word), dataSize, name, probe));
}
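A hedged usage sketch of the TextValueEncoder touched above: free text is tokenized on non-alphanumeric separators and hashed into a fixed-width feature vector; the vector size and sample text are arbitrary:

import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.vectorizer.encoders.TextValueEncoder;

public class EncodeText {
  public static void main(String[] args) {
    TextValueEncoder encoder = new TextValueEncoder("body");
    Vector v = new DenseVector(100);
    encoder.addToVector("the quick brown fox jumps over the lazy dog", v);
    encoder.flush(1.0, v); // flush buffered word counts into the vector
    System.out.println(v.norm(1));
  }
}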
http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/mr/src/main/java/org/apache/mahout/vectorizer/term/TFPartialVectorReducer.java
----------------------------------------------------------------------
diff --git a/mr/src/main/java/org/apache/mahout/vectorizer/term/TFPartialVectorReducer.java b/mr/src/main/java/org/apache/mahout/vectorizer/term/TFPartialVectorReducer.java
index c878946..1496c90 100644
--- a/mr/src/main/java/org/apache/mahout/vectorizer/term/TFPartialVectorReducer.java
+++ b/mr/src/main/java/org/apache/mahout/vectorizer/term/TFPartialVectorReducer.java
@@ -17,12 +17,8 @@
package org.apache.mahout.vectorizer.term;
-import java.io.IOException;
-import java.net.URI;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-
+import com.google.common.collect.Lists;
+import com.google.common.io.Closeables;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
@@ -46,6 +42,11 @@ import org.apache.mahout.math.map.OpenObjectIntHashMap;
import org.apache.mahout.vectorizer.DictionaryVectorizer;
import org.apache.mahout.vectorizer.common.PartialVectorMerger;
+import java.io.IOException;
+import java.net.URI;
+import java.util.Iterator;
+import java.util.List;
+
/**
* Converts a document in to a sparse vector
*/
@@ -67,7 +68,7 @@ public class TFPartialVectorReducer extends Reducer<Text, StringTuple, Text, Vec
return;
}
- List<String> value = new ArrayList<>();
+ List<String> value = Lists.newArrayList();
while (it.hasNext()) {
value.addAll(it.next().getEntries());
@@ -76,8 +77,9 @@ public class TFPartialVectorReducer extends Reducer<Text, StringTuple, Text, Vec
Vector vector = new RandomAccessSparseVector(dimension, value.size()); // guess at initial size
if (maxNGramSize >= 2) {
- try (ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.iterator()), maxNGramSize)){
- sf.reset();
+ ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.iterator()), maxNGramSize);
+ sf.reset();
+ try {
do {
String term = sf.getAttribute(CharTermAttribute.class).toString();
if (!term.isEmpty() && dictionary.containsKey(term)) { // ngram
@@ -85,7 +87,10 @@ public class TFPartialVectorReducer extends Reducer<Text, StringTuple, Text, Vec
vector.setQuick(termId, vector.getQuick(termId) + 1);
}
} while (sf.incrementToken());
+
sf.end();
+ } finally {
+ Closeables.close(sf, true);
}
} else {
for (String term : value) {
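The hunk above restores an explicit reset/try/finally pattern around the ShingleFilter instead of try-with-resources. A minimal standalone sketch of that n-gram loop, with the tokenizer and input text assumed (the reducer itself feeds an IteratorTokenStream of dictionary terms):

import com.google.common.io.Closeables;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

import java.io.IOException;
import java.io.StringReader;

public class ShingleDemo {
  public static void main(String[] args) throws IOException {
    WhitespaceTokenizer tokenizer =
        new WhitespaceTokenizer(Version.LUCENE_46, new StringReader("the quick brown fox"));
    ShingleFilter sf = new ShingleFilter(tokenizer, 2); // unigrams plus bigrams
    sf.reset();
    try {
      CharTermAttribute term = sf.getAttribute(CharTermAttribute.class);
      while (sf.incrementToken()) {
        System.out.println(term.toString());
      }
      sf.end();
    } finally {
      Closeables.close(sf, true); // swallow IOException on close, as the reducer does
    }
  }
}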
http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/mr/src/main/java/org/apache/mahout/vectorizer/tfidf/TFIDFConverter.java
----------------------------------------------------------------------
diff --git a/mr/src/main/java/org/apache/mahout/vectorizer/tfidf/TFIDFConverter.java b/mr/src/main/java/org/apache/mahout/vectorizer/tfidf/TFIDFConverter.java
index 3387255..5f9d666 100644
--- a/mr/src/main/java/org/apache/mahout/vectorizer/tfidf/TFIDFConverter.java
+++ b/mr/src/main/java/org/apache/mahout/vectorizer/tfidf/TFIDFConverter.java
@@ -17,11 +17,8 @@
package org.apache.mahout.vectorizer.tfidf;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-
import com.google.common.base.Preconditions;
+import com.google.common.collect.Lists;
import com.google.common.io.Closeables;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
@@ -46,6 +43,9 @@ import org.apache.mahout.vectorizer.common.PartialVectorMerger;
import org.apache.mahout.vectorizer.term.TermDocumentCountMapper;
import org.apache.mahout.vectorizer.term.TermDocumentCountReducer;
+import java.io.IOException;
+import java.util.List;
+
/**
* This class converts a set of input vectors with term frequencies to TfIdf vectors. The Sequence file input
* should have a {@link org.apache.hadoop.io.WritableComparable} key containing and a
@@ -118,7 +118,7 @@ public final class TFIDFConverter {
"normPower must be > 1 and not infinite if log normalization is chosen", normPower);
int partialVectorIndex = 0;
- List<Path> partialVectorPaths = new ArrayList<>();
+ List<Path> partialVectorPaths = Lists.newArrayList();
List<Path> dictionaryChunks = datasetFeatures.getSecond();
for (Path dictionaryChunk : dictionaryChunks) {
Path partialVectorOutputPath = new Path(output, VECTOR_OUTPUT_FOLDER + partialVectorIndex++);
@@ -195,7 +195,7 @@ public final class TFIDFConverter {
Path dictionaryPathBase,
Configuration baseConf,
int chunkSizeInMegabytes) throws IOException {
- List<Path> chunkPaths = new ArrayList<>();
+ List<Path> chunkPaths = Lists.newArrayList();
Configuration conf = new Configuration(baseConf);
FileSystem fs = FileSystem.get(featureCountPath.toUri(), conf);
http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/mr/src/test/java/org/apache/mahout/vectorizer/encoders/TextValueEncoderTest.java
----------------------------------------------------------------------
diff --git a/mr/src/test/java/org/apache/mahout/vectorizer/encoders/TextValueEncoderTest.java b/mr/src/test/java/org/apache/mahout/vectorizer/encoders/TextValueEncoderTest.java
index 4cb6946..4446fef 100644
--- a/mr/src/test/java/org/apache/mahout/vectorizer/encoders/TextValueEncoderTest.java
+++ b/mr/src/test/java/org/apache/mahout/vectorizer/encoders/TextValueEncoderTest.java
@@ -17,15 +17,16 @@
package org.apache.mahout.vectorizer.encoders;
-import java.util.Locale;
-
import com.google.common.collect.ImmutableMap;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
+import org.apache.lucene.util.Version;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;
import org.junit.Test;
+import java.util.Locale;
+
public final class TextValueEncoderTest extends MahoutTestCase {
@Test
@@ -40,7 +41,7 @@ public final class TextValueEncoderTest extends MahoutTestCase {
// now some fancy weighting
StaticWordValueEncoder w = new StaticWordValueEncoder("text");
- w.setDictionary(ImmutableMap.of("word1", 3.0, "word2", 1.5));
+ w.setDictionary(ImmutableMap.<String, Double>of("word1", 3.0, "word2", 1.5));
enc.setWordEncoder(w);
// should set 6 locations to something
@@ -69,7 +70,7 @@ public final class TextValueEncoderTest extends MahoutTestCase {
@Test
public void testLuceneEncoding() throws Exception {
LuceneTextValueEncoder enc = new LuceneTextValueEncoder("text");
- enc.setAnalyzer(new WhitespaceAnalyzer());
+ enc.setAnalyzer(new WhitespaceAnalyzer(Version.LUCENE_46));
Vector v1 = new DenseVector(200);
enc.addToVector("test1 and more", v1);
enc.flush(1, v1);
@@ -87,8 +88,7 @@ public final class TextValueEncoderTest extends MahoutTestCase {
v1 = new DenseVector(200);
StringBuilder builder = new StringBuilder(5000);
- for (int i = 0; i < 1000; i++) {
- //lucene's internal buffer length request is 4096, so let's make sure we can handle larger size
+ for (int i = 0; i < 1000; i++) {//lucene's internal buffer length request is 4096, so let's make sure we can handle larger size
builder.append("token_").append(i).append(' ');
}
enc.addToVector(builder.toString(), v1);
http://git-wip-us.apache.org/repos/asf/mahout/blob/db624ef3/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 253f31b..80b4a20 100644
--- a/pom.xml
+++ b/pom.xml
@@ -115,8 +115,8 @@
<mfindbugs.version>2.5.2</mfindbugs.version>
<mjavadoc.version>2.9.1</mjavadoc.version>
<hbase.version>1.0.0</hbase.version>
- <lucene.version>4.10.4</lucene.version>
- <slf4j.version>1.7.12</slf4j.version>
+ <lucene.version>4.6.1</lucene.version>
+ <slf4j.version>1.7.10</slf4j.version>
<scala.compat.version>2.10</scala.compat.version>
<scala.version>2.10.4</scala.version>
<spark.version>1.1.1</spark.version>
[2/5] mahout git commit: Merge branch 'master' of github.com:andrewmusselman/mahout
Posted by ak...@apache.org.
Merge branch 'master' of github.com:andrewmusselman/mahout
Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/92bdcf8e
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/92bdcf8e
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/92bdcf8e
Branch: refs/heads/master
Commit: 92bdcf8e0f74cf8e4da858156602e658bfbe0c70
Parents: a551b15 db624ef
Author: Andrew Musselman <ak...@apache.org>
Authored: Mon Apr 6 11:11:26 2015 -0700
Committer: Andrew Musselman <ak...@apache.org>
Committed: Mon Apr 6 11:11:26 2015 -0700
----------------------------------------------------------------------
----------------------------------------------------------------------
[4/5] mahout git commit: Merge branch 'master' of https://git-wip-us.apache.org/repos/asf/mahout
Posted by ak...@apache.org.
Merge branch 'master' of https://git-wip-us.apache.org/repos/asf/mahout
Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/27ff9df0
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/27ff9df0
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/27ff9df0
Branch: refs/heads/master
Commit: 27ff9df077fd498edcb5e92946be0d5041f755dd
Parents: f63ee5f 5e41bc0
Author: Andrew Musselman <ak...@apache.org>
Authored: Tue Apr 7 16:45:19 2015 -0700
Committer: Andrew Musselman <ak...@apache.org>
Committed: Tue Apr 7 16:45:19 2015 -0700
----------------------------------------------------------------------
examples/bin/run-item-sim.sh | 38 ++++++++++++++++++++++++++++++++------
1 file changed, 32 insertions(+), 6 deletions(-)
----------------------------------------------------------------------
[3/5] mahout git commit: Merge branch 'master' of https://git-wip-us.apache.org/repos/asf/mahout
Posted by ak...@apache.org.
Merge branch 'master' of https://git-wip-us.apache.org/repos/asf/mahout
Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/f63ee5fd
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/f63ee5fd
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/f63ee5fd
Branch: refs/heads/master
Commit: f63ee5fdde0f35ecd5a851c9122f5195befc9348
Parents: 92bdcf8 670a7d2
Author: Andrew Musselman <ak...@apache.org>
Authored: Mon Apr 6 20:38:48 2015 -0700
Committer: Andrew Musselman <ak...@apache.org>
Committed: Mon Apr 6 20:38:48 2015 -0700
----------------------------------------------------------------------
.../mahout/classifier/NewsgroupHelper.java | 2 +-
integration/pom.xml | 10 +
.../mahout/text/LuceneSegmentInputFormat.java | 4 +-
.../mahout/text/LuceneSegmentInputSplit.java | 4 +-
.../mahout/text/LuceneSegmentRecordReader.java | 3 +-
.../mahout/text/LuceneStorageConfiguration.java | 4 +-
.../text/MailArchivesClusteringAnalyzer.java | 22 +-
.../text/ReadOnlyFileSystemDirectory.java | 354 -------------------
.../text/SequenceFilesFromLuceneStorage.java | 1 -
.../SequenceFilesFromLuceneStorageDriver.java | 3 +-
.../SequenceFilesFromMailArchivesMapper.java | 29 +-
.../text/wikipedia/WikipediaAnalyzer.java | 10 +-
.../mahout/utils/regex/AnalyzerTransformer.java | 2 +-
.../mahout/common/lucene/AnalyzerUtils.java | 4 +-
.../encoders/InteractionValueEncoder.java | 6 +-
.../mahout/classifier/ConfusionMatrixTest.java | 4 +-
.../classifier/df/DecisionForestTest.java | 1 -
.../apache/mahout/classifier/df/data/Utils.java | 10 +-
.../mapreduce/partial/PartialBuilderTest.java | 16 +-
.../classifier/mlp/TestNeuralNetwork.java | 11 +-
.../classifier/naivebayes/NaiveBayesTest.java | 17 +-
.../encoders/TextValueEncoderTest.java | 2 +-
pom.xml | 4 +-
23 files changed, 75 insertions(+), 448 deletions(-)
----------------------------------------------------------------------
[5/5] mahout git commit: MAHOUT-1665: Update hadoop commands in
example scripts (akm) closes apache/mahout#98
Posted by ak...@apache.org.
MAHOUT-1665: Update hadoop commands in example scripts (akm) closes apache/mahout#98
Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/daad3a4c
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/daad3a4c
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/daad3a4c
Branch: refs/heads/master
Commit: daad3a4ce618cbd05be468c4ce6e451618f3a028
Parents: 27ff9df
Author: Andrew Musselman <ak...@apache.org>
Authored: Tue Apr 7 16:56:10 2015 -0700
Committer: Andrew Musselman <ak...@apache.org>
Committed: Tue Apr 7 16:56:10 2015 -0700
----------------------------------------------------------------------
CHANGELOG | 2 +
examples/bin/README.txt | 5 ++-
examples/bin/classify-20newsgroups.sh | 25 +++++++------
examples/bin/classify-wikipedia.sh | 22 +++++------
examples/bin/cluster-reuters.sh | 47 ++++++++++++-----------
examples/bin/cluster-syntheticcontrol.sh | 10 +++--
examples/bin/factorize-movielens-1M.sh | 5 ++-
examples/bin/factorize-netflix.sh | 17 +++++----
examples/bin/run-rf.sh | 48 ++++++++++--------------
examples/bin/set-dfs-commands.sh | 54 +++++++++++++++++++++++++++
10 files changed, 147 insertions(+), 88 deletions(-)
----------------------------------------------------------------------
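The hunks that follow all apply the same change: each example script stops locating the hadoop binary itself and instead sources the new examples/bin/set-dfs-commands.sh helper, then issues filesystem operations through the exported $DFS and $DFSRM commands. A minimal sketch of that calling pattern, assuming the helper sits next to the calling script (the data path and echo text below are placeholders for illustration, not taken from the commit):

  #!/bin/bash
  START_PATH=`pwd`

  # Pull in DFS, DFSRM and HVERSION from the shared helper
  source ${START_PATH}/set-dfs-commands.sh

  WORK_DIR=/tmp/mahout-work-${USER}

  if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
    echo "Copying example data to HDFS"
    set +e
    $DFSRM ${WORK_DIR}/data        # remove any previous copy, ignore errors
    $DFS -mkdir ${WORK_DIR}        # ensure the work directory exists
    set -e
    $DFS -put ${WORK_DIR}/data ${WORK_DIR}/data
  fi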
http://git-wip-us.apache.org/repos/asf/mahout/blob/daad3a4c/CHANGELOG
----------------------------------------------------------------------
diff --git a/CHANGELOG b/CHANGELOG
index 318bfcd..d1a0c4b 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -2,6 +2,8 @@ Mahout Change Log
Release 0.10.0 - unreleased
+ MAHOUT-1665: Update hadoop commands in example scripts (akm)
+
MAHOUT-1676: Deprecate MLP, ConcatenateVectorsJob and ConcatenateVectorsReducer in the codebase (apalumbo)
MAHOUT-1622: MultithreadedBatchItemSimilarities outputs incorrect number of similarities (Jesse Daniels, Anand Avati via smarthi)
http://git-wip-us.apache.org/repos/asf/mahout/blob/daad3a4c/examples/bin/README.txt
----------------------------------------------------------------------
diff --git a/examples/bin/README.txt b/examples/bin/README.txt
index d3737b3..f47ab44 100644
--- a/examples/bin/README.txt
+++ b/examples/bin/README.txt
@@ -6,5 +6,6 @@ classify-20newsgroups.sh -- Run SGD and Bayes classifiers over the classic 20 Ne
cluster-reuters.sh -- Cluster the Reuters data set using a variety of algorithms. Downloads the data set automatically.
cluster-syntheticcontrol.sh -- Cluster the Synthetic Control data set. Downloads the data set automatically.
factorize-movielens-1m.sh -- Run the Alternating Least Squares Recommender on the Grouplens data set (size 1M).
-factorize-netflix.sh -- Run the ALS Recommender on the Netflix data set
-run-rf.sh -- Create some synthetic data, build a random forest, and test performance.
\ No newline at end of file
+factorize-netflix.sh -- (Deprecated due to lack of availability of the data set) Run the ALS Recommender on the Netflix data set.
+run-rf.sh -- Create some synthetic data, build a random forest, and test performance.
+spark-document-classifier.mscala -- A mahout-shell script which trains and tests a Naive Bayes model on the Wikipedia XML dump and defines simple methods to classify new text.
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/mahout/blob/daad3a4c/examples/bin/classify-20newsgroups.sh
----------------------------------------------------------------------
diff --git a/examples/bin/classify-20newsgroups.sh b/examples/bin/classify-20newsgroups.sh
index bc5aec4..b09e996 100755
--- a/examples/bin/classify-20newsgroups.sh
+++ b/examples/bin/classify-20newsgroups.sh
@@ -33,13 +33,8 @@ if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
fi
START_PATH=`pwd`
-if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
- HADOOP="$HADOOP_HOME/bin/hadoop"
- if [ ! -e $HADOOP ]; then
- echo "Can't find hadoop in $HADOOP, exiting"
- exit 1
- fi
-fi
+# Set commands for dfs
+source ${START_PATH}/set-dfs-commands.sh
WORK_DIR=/tmp/mahout-work-${USER}
algorithm=( cnaivebayes-MapReduce naivebayes-MapReduce cnaivebayes-Spark naivebayes-Spark sgd clean)
@@ -109,10 +104,17 @@ if ( [ "x$alg" == "xnaivebayes-MapReduce" ] || [ "x$alg" == "xcnaivebayes-MapR
if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
echo "Copying 20newsgroups data to HDFS"
set +e
- $HADOOP dfs -rmr ${WORK_DIR}/20news-all
- $HADOOP dfs -mkdir ${WORK_DIR}
+ $DFSRM ${WORK_DIR}/20news-all
+ $DFS -mkdir ${WORK_DIR}
+ $DFS -mkdir ${WORK_DIR}/20news-all
set -e
- $HADOOP dfs -put ${WORK_DIR}/20news-all ${WORK_DIR}/20news-all
+ if [ $HVERSION -eq "1" ] ; then
+ echo "Copying 20newsgroups data to Hadoop 1 HDFS"
+ $DFS -put ${WORK_DIR}/20news-all ${WORK_DIR}/20news-all
+ elif [ $HVERSION -eq "2" ] ; then
+ echo "Copying 20newsgroups data to Hadoop 2 HDFS"
+ $DFS -put ${WORK_DIR}/20news-all ${WORK_DIR}/
+ fi
fi
echo "Creating sequence files from 20newsgroups data"
@@ -183,8 +185,9 @@ elif [ "x$alg" == "xsgd" ]; then
echo "Testing on ${WORK_DIR}/20news-bydate/20news-bydate-test/ with model: /tmp/news-group.model"
./bin/mahout org.apache.mahout.classifier.sgd.TestNewsGroups --input ${WORK_DIR}/20news-bydate/20news-bydate-test/ --model /tmp/news-group.model
elif [ "x$alg" == "xclean" ]; then
- rm -rf ${WORK_DIR}
+ rm -rf $WORK_DIR
rm -rf /tmp/news-group.model
+ $DFSRM $WORK_DIR
fi
# Remove the work directory
#
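In the last hunk above, the copy into HDFS is additionally guarded by $HVERSION, presumably because the Hadoop 1 "hadoop dfs" and Hadoop 2 "hdfs dfs" clients treat the -put destination differently once the target directory has been pre-created. Condensed, the guard amounts to the following sketch (HVERSION is exported by set-dfs-commands.sh, shown at the end of this mail):

  if [ "$HVERSION" -eq 1 ] ; then
    # Hadoop 1: put to the full target path
    $DFS -put ${WORK_DIR}/20news-all ${WORK_DIR}/20news-all
  elif [ "$HVERSION" -eq 2 ] ; then
    # Hadoop 2: put into the parent directory (the per-dataset directory already exists)
    $DFS -put ${WORK_DIR}/20news-all ${WORK_DIR}/
  fi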
http://git-wip-us.apache.org/repos/asf/mahout/blob/daad3a4c/examples/bin/classify-wikipedia.sh
----------------------------------------------------------------------
diff --git a/examples/bin/classify-wikipedia.sh b/examples/bin/classify-wikipedia.sh
index 359ba70..3ff0e25 100755
--- a/examples/bin/classify-wikipedia.sh
+++ b/examples/bin/classify-wikipedia.sh
@@ -20,7 +20,7 @@
# Downloads a (partial) wikipedia dump, trains and tests a classifier.
#
# To run: change into the mahout directory and type:
-# examples/bin/classify-wiki.sh
+# examples/bin/classify-wikipedia.sh
if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
echo "This script Bayes and CBayes classifiers over the last wikipedia dump."
@@ -39,13 +39,8 @@ if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
fi
START_PATH=`pwd`
-if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
- HADOOP="$HADOOP_HOME/bin/hadoop"
- if [ ! -e $HADOOP ]; then
- echo "Can't find hadoop in $HADOOP, exiting"
- exit 1
- fi
-fi
+# Set commands for dfs
+source ${START_PATH}/set-dfs-commands.sh
WORK_DIR=/tmp/mahout-work-wiki
algorithm=( CBayes BinaryCBayes clean)
@@ -73,7 +68,7 @@ if [ "x$alg" != "xclean" ]; then
# Datasets: uncomment and run "clean" to change dataset
########################################################
########## partial small 42.5M zipped
- #curl http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles1.xml-p000000010p000010000.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
+ # curl http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles1.xml-p000000010p000010000.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
########## partial larger 256M zipped
curl http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles10.xml-p000925001p001325000.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
######### full wikipedia dump: 10G zipped
@@ -111,10 +106,10 @@ if [ "x$alg" == "xCBayes" ] || [ "x$alg" == "xBinaryCBayes" ] ; then
if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
echo "Copying wikipedia data to HDFS"
set +e
- $HADOOP dfs -rmr ${WORK_DIR}/wikixml
- $HADOOP dfs -mkdir ${WORK_DIR}
+ $DFSRM ${WORK_DIR}/wikixml
+ $DFS -mkdir ${WORK_DIR}
set -e
- $HADOOP dfs -put ${WORK_DIR}/wikixml ${WORK_DIR}/wikixml
+ $DFS -put ${WORK_DIR}/wikixml ${WORK_DIR}/wikixml
fi
echo "Creating sequence files from wikiXML"
@@ -188,6 +183,7 @@ if [ "x$alg" == "xCBayes" ] || [ "x$alg" == "xBinaryCBayes" ] ; then
fi
elif [ "x$alg" == "xclean" ]; then
- rm -rf ${WORK_DIR}
+ rm -rf $WORK_DIR
+ $DFSRM $WORK_DIR
fi
# Remove the work directory
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/mahout/blob/daad3a4c/examples/bin/cluster-reuters.sh
----------------------------------------------------------------------
diff --git a/examples/bin/cluster-reuters.sh b/examples/bin/cluster-reuters.sh
index 7200140..c32d38f 100755
--- a/examples/bin/cluster-reuters.sh
+++ b/examples/bin/cluster-reuters.sh
@@ -31,6 +31,10 @@ SCRIPT_PATH=${0%/*}
if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
cd $SCRIPT_PATH
fi
+START_PATH=`pwd`
+
+# Set commands for dfs
+source ${START_PATH}/set-dfs-commands.sh
MAHOUT="../../bin/mahout"
@@ -39,34 +43,33 @@ if [ ! -e $MAHOUT ]; then
exit 1
fi
-algorithm=( kmeans fuzzykmeans lda streamingkmeans)
+algorithm=( kmeans fuzzykmeans lda streamingkmeans clean)
if [ -n "$1" ]; then
choice=$1
else
echo "Please select a number to choose the corresponding clustering algorithm"
echo "1. ${algorithm[0]} clustering"
- echo "2. ${algorithm[1]} clustering"
+ echo "2. ${algorithm[1]} clustering (may require increased heap space on yarn)"
echo "3. ${algorithm[2]} clustering"
echo "4. ${algorithm[3]} clustering"
+ echo "5. ${algorithm[4]} -- cleans up the work area in $WORK_DIR"
read -p "Enter your choice : " choice
fi
echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]} Clustering"
-clustertype=${algorithm[$choice-1]}
+clustertype=${algorithm[$choice-1]}
WORK_DIR=/tmp/mahout-work-${USER}
-echo "creating work directory at ${WORK_DIR}"
-if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
- HADOOP="$HADOOP_HOME/bin/hadoop"
- if [ ! -e $HADOOP ]; then
- echo "Can't find hadoop in $HADOOP, exiting"
- exit 1
- fi
+if [ "x$clustertype" == "xclean" ]; then
+ rm -rf $WORK_DIR
+ $DFSRM $WORK_DIR
+ exit 1
+else
+ $DFS -mkdir -p $WORK_DIR
+ mkdir -p $WORK_DIR
+ echo "Creating work directory at ${WORK_DIR}"
fi
-
-mkdir -p ${WORK_DIR}
-
if [ ! -e ${WORK_DIR}/reuters-out-seqdir ]; then
if [ ! -e ${WORK_DIR}/reuters-out ]; then
if [ ! -e ${WORK_DIR}/reuters-sgm ]; then
@@ -88,17 +91,19 @@ if [ ! -e ${WORK_DIR}/reuters-out-seqdir ]; then
echo "Extracting..."
tar xzf ${WORK_DIR}/reuters21578.tar.gz -C ${WORK_DIR}/reuters-sgm
fi
-
echo "Extracting Reuters"
$MAHOUT org.apache.lucene.benchmark.utils.ExtractReuters ${WORK_DIR}/reuters-sgm ${WORK_DIR}/reuters-out
if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
echo "Copying Reuters data to Hadoop"
set +e
- $HADOOP dfs -rmr ${WORK_DIR}/reuters-sgm
- $HADOOP dfs -rmr ${WORK_DIR}/reuters-out
+ $DFSRM ${WORK_DIR}/reuters-sgm
+ $DFSRM ${WORK_DIR}/reuters-out
+ $DFS -mkdir ${WORK_DIR}/
+ $DFS -mkdir ${WORK_DIR}/reuters-sgm
+ $DFS -mkdir ${WORK_DIR}/reuters-out
+ $DFS -put ${WORK_DIR}/reuters-sgm ${WORK_DIR}/reuters-sgm
+ $DFS -put ${WORK_DIR}/reuters-out ${WORK_DIR}/reuters-out
set -e
- $HADOOP dfs -put ${WORK_DIR}/reuters-sgm ${WORK_DIR}/reuters-sgm
- $HADOOP dfs -put ${WORK_DIR}/reuters-out ${WORK_DIR}/reuters-out
fi
fi
echo "Converting to Sequence Files from Directory"
@@ -118,7 +123,7 @@ if [ "x$clustertype" == "xkmeans" ]; then
-x 10 -k 20 -ow --clustering \
&& \
$MAHOUT clusterdump \
- -i `hadoop dfs -ls -d ${WORK_DIR}/reuters-kmeans/clusters-*-final | awk'{print $8}'` \
+ -i `$DFS -ls -d ${WORK_DIR}/reuters-kmeans/clusters-*-final | awk '{print $8}'` \
-o ${WORK_DIR}/reuters-kmeans/clusterdump \
-d ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/dictionary.file-0 \
-dt sequencefile -b 100 -n 20 --evaluate -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure -sp 0 \
@@ -191,6 +196,4 @@ elif [ "x$clustertype" == "xstreamingkmeans" ]; then
-o ${WORK_DIR}/reuters-cluster-distance.csv \
&& \
cat ${WORK_DIR}/reuters-cluster-distance.csv
-else
- echo "unknown cluster type: $clustertype"
-fi
+fi
http://git-wip-us.apache.org/repos/asf/mahout/blob/daad3a4c/examples/bin/cluster-syntheticcontrol.sh
----------------------------------------------------------------------
diff --git a/examples/bin/cluster-syntheticcontrol.sh b/examples/bin/cluster-syntheticcontrol.sh
index 3f1229e..eab62be 100755
--- a/examples/bin/cluster-syntheticcontrol.sh
+++ b/examples/bin/cluster-syntheticcontrol.sh
@@ -45,6 +45,8 @@ if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
fi
START_PATH=`pwd`
+# Set commands for dfs
+source ${START_PATH}/set-dfs-commands.sh
WORK_DIR=/tmp/mahout-work-${USER}
@@ -64,13 +66,13 @@ if [ ! -f ${WORK_DIR}/synthetic_control.data ]; then
fi
if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ]; then
echo "Checking the health of DFS..."
- $HADOOP_HOME/bin/hadoop fs -ls
+ $DFS -ls
if [ $? -eq 0 ];then
echo "DFS is healthy... "
echo "Uploading Synthetic control data to HDFS"
- $HADOOP_HOME/bin/hadoop fs -rmr testdata
- $HADOOP_HOME/bin/hadoop fs -mkdir testdata
- $HADOOP_HOME/bin/hadoop fs -put ${WORK_DIR}/synthetic_control.data testdata
+ $DFSRM testdata
+ $DFS -mkdir testdata
+ $DFS -put ${WORK_DIR}/synthetic_control.data testdata
echo "Successfully Uploaded Synthetic control data to HDFS "
../../bin/mahout org.apache.mahout.clustering.syntheticcontrol."${clustertype}".Job
http://git-wip-us.apache.org/repos/asf/mahout/blob/daad3a4c/examples/bin/factorize-movielens-1M.sh
----------------------------------------------------------------------
diff --git a/examples/bin/factorize-movielens-1M.sh b/examples/bin/factorize-movielens-1M.sh
old mode 100644
new mode 100755
index 8c6aa68..735e425
--- a/examples/bin/factorize-movielens-1M.sh
+++ b/examples/bin/factorize-movielens-1M.sh
@@ -22,6 +22,8 @@
# from http://www.grouplens.org/node/73
#
# To run: change into the mahout directory and type:
+# export MAHOUT_LOCAL=true
+# Then:
# examples/bin/factorize-movielens-1M.sh /path/to/ratings.dat
if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
@@ -38,7 +40,8 @@ then
exit -1
fi
-MAHOUT="../../bin/mahout"
+export MAHOUT_LOCAL=true
+MAHOUT="$MAHOUT_HOME/bin/mahout"
WORK_DIR=/tmp/mahout-work-${USER}
echo "creating work directory at ${WORK_DIR}"
http://git-wip-us.apache.org/repos/asf/mahout/blob/daad3a4c/examples/bin/factorize-netflix.sh
----------------------------------------------------------------------
diff --git a/examples/bin/factorize-netflix.sh b/examples/bin/factorize-netflix.sh
old mode 100644
new mode 100755
index f0917ed..856f775
--- a/examples/bin/factorize-netflix.sh
+++ b/examples/bin/factorize-netflix.sh
@@ -28,6 +28,9 @@
# To run:
# ./factorize-netflix.sh /path/to/training_set/ /path/to/qualifying.txt /path/to/judging.txt
+echo "Note this script has been deprecated due to the lack of access to the Netflix data set."
+exit 1
+
if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
echo "This script runs the ALS Recommender on the Netflix data set."
echo "Syntax: $0 /path/to/training_set/ /path/to/qualifying.txt /path/to/judging.txt\n"
@@ -44,6 +47,11 @@ MAHOUT="../../bin/mahout"
WORK_DIR=/tmp/mahout-work-${USER}
+START_PATH=`pwd`
+
+# Set commands for dfs
+source ${START_PATH}/set-dfs-commands.sh
+
echo "Preparing data..."
$MAHOUT org.apache.mahout.cf.taste.hadoop.example.als.netflix.NetflixDatasetConverter $1 $2 $3 ${WORK_DIR}
@@ -56,19 +64,14 @@ $MAHOUT evaluateFactorization --input ${WORK_DIR}/probeSet/ratings.tsv --output
--userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ --tempDir ${WORK_DIR}/als/tmp
if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
- HADOOP="$HADOOP_HOME/bin/hadoop"
- if [ ! -e $HADOOP ]; then
- echo "Can't find hadoop in $HADOOP, exiting"
- exit 1
- fi
# print the error, should be around 0.923
echo -e "\nRMSE is:\n"
- $HADOOP fs -tail ${WORK_DIR}/als/rmse/rmse.txt
+ $DFS -tail ${WORK_DIR}/als/rmse/rmse.txt
echo -e "\n"
echo "removing work directory"
set +e
- $HADOOP fs -rmr ${WORK_DIR}
+ $DFSRM ${WORK_DIR}
else
http://git-wip-us.apache.org/repos/asf/mahout/blob/daad3a4c/examples/bin/run-rf.sh
----------------------------------------------------------------------
diff --git a/examples/bin/run-rf.sh b/examples/bin/run-rf.sh
index 17b13b9..ac4c734 100755
--- a/examples/bin/run-rf.sh
+++ b/examples/bin/run-rf.sh
@@ -24,66 +24,58 @@
#
# To run: change into the mahout directory and type:
# ./examples/bin/run-rf.sh <num-rows>
-WORK_DIR=/tmp/mahout-work-${USER}/
-input="rf-input.csv"
+WORK_DIR=/tmp/mahout-work-${USER}
+INPUT="${WORK_DIR}/input"
+mkdir -p $INPUT
+INPUT_PATH="${INPUT}/rf-input.csv"
-# Remove old files
-echo
-echo "Removing old temp files if they exist; this will mention they're not there if not."
-echo
-$HADOOP_HOME/bin/hadoop fs -rmr -skipTrash $WORK_DIR forest
-$HADOOP_HOME/bin/hadoop fs -mkdir $WORK_DIR
+# Set commands for dfs
+source ./examples/bin/set-dfs-commands.sh
# Create test data
numrows=$1
-echo
-echo "Writing random data to $input"
-./examples/bin/create-rf-data.sh $numrows $input
+echo "Writing random data to $INPUT_PATH"
+./examples/bin/create-rf-data.sh $numrows $INPUT_PATH
# Put the test file in HDFS
-$HADOOP_HOME/bin/hadoop fs -rmr -skipTrash ${WORK_DIR}
-$HADOOP_HOME/bin/hadoop fs -mkdir -p ${WORK_DIR}/input
-if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
- HADOOP="$HADOOP_HOME/bin/hadoop"
- if [ ! -e $HADOOP ]; then
- echo "Can't find hadoop in $HADOOP, exiting"
- exit 1
- fi
-fi
if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
echo "Copying random data to HDFS"
set +e
- $HADOOP dfs -rmr ${WORK_DIR}
+ $DFSRM $WORK_DIR
+ $DFS -mkdir -p $INPUT
set -e
- $HADOOP dfs -put $input ${WORK_DIR}/input/$input
+ $DFS -put $INPUT_PATH $INPUT
fi
# Split original file into train and test
echo "Creating training and holdout set with a random 60-40 split of the generated vector dataset"
./bin/mahout split \
- -i ${WORK_DIR}/input \
+ -i $INPUT \
--trainingOutput ${WORK_DIR}/train.csv \
--testOutput ${WORK_DIR}/test.csv \
--randomSelectionPct 40 --overwrite -xm sequential
# Describe input file schema
# Note: "-d 4 N L" indicates four numerical fields and one label, as built by the step above.
-./bin/mahout describe -p $WORK_DIR/input/$input -f $WORK_DIR/info -d 4 N L
+./bin/mahout describe -p $INPUT_PATH -f ${WORK_DIR}/info -d 4 N L
# Train rf model
echo
echo "Training random forest."
echo
-./bin/mahout buildforest -DXmx10000m -Dmapred.max.split.size=1000000 -d $WORK_DIR/train.csv -ds $WORK_DIR/info -sl 7 -p -t 500 -o $WORK_DIR/forest
+./bin/mahout buildforest -DXmx10000m -Dmapred.max.split.size=1000000 -d ${WORK_DIR}/train.csv -ds ${WORK_DIR}/info -sl 7 -p -t 500 -o ${WORK_DIR}/forest
# Test predictions
echo
echo "Testing predictions on test set."
echo
-./bin/mahout testforest -DXmx10000m -Dmapred.output.compress=false -i $WORK_DIR/test.csv -ds $WORK_DIR/info -m $WORK_DIR/forest -a -mr -o $WORK_DIR/predictions
+./bin/mahout testforest -DXmx10000m -Dmapred.output.compress=false -i ${WORK_DIR}/test.csv -ds ${WORK_DIR}/info -m ${WORK_DIR}/forest -a -mr -o ${WORK_DIR}/predictions
# Remove old files
-$HADOOP_HOME/bin/hadoop fs -rmr -skipTrash $WORK_DIR
-rm $input
+if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ]
+then
+ $DFSRM $WORK_DIR
+fi
+rm -r $WORK_DIR
http://git-wip-us.apache.org/repos/asf/mahout/blob/daad3a4c/examples/bin/set-dfs-commands.sh
----------------------------------------------------------------------
diff --git a/examples/bin/set-dfs-commands.sh b/examples/bin/set-dfs-commands.sh
new file mode 100755
index 0000000..0ee5fe1
--- /dev/null
+++ b/examples/bin/set-dfs-commands.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+# Requires $HADOOP_HOME to be set.
+#
+# Figures out the major version of Hadoop we're using and sets commands
+# for dfs commands
+#
+# Run by each example script.
+
+# Find a hadoop shell
+if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
+ HADOOP="${HADOOP_HOME}/bin/hadoop"
+ if [ ! -e $HADOOP ]; then
+ echo "Can't find hadoop in $HADOOP, exiting"
+ exit 1
+ fi
+fi
+
+# Check Hadoop version
+v=`${HADOOP_HOME}/bin/hadoop version | egrep "Hadoop [0-9]+.[0-9]+.[0-9]+" | cut -f 2 -d ' ' | cut -f 1 -d '.'`
+
+if [ $v -eq "1" -o $v -eq "0" ]
+then
+ echo "Discovered Hadoop v0 or v1."
+ export DFS="${HADOOP_HOME}/bin/hadoop dfs"
+ export DFSRM="$DFS -rmr -skipTrash"
+elif [ $v -eq "2" ]
+then
+ echo "Discovered Hadoop v2."
+ export DFS="${HADOOP_HOME}/bin/hdfs dfs"
+ export DFSRM="$DFS -rm -r -skipTrash"
+else
+ echo "Can't determine Hadoop version."
+ exit 1
+fi
+echo "Setting dfs command to $DFS, dfs rm to $DFSRM."
+
+export HVERSION=$v
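As a worked example of the version probe in this new helper: assuming the first line of `hadoop version` output reads something like "Hadoop 2.4.1" (the version number here is only illustrative), the pipeline reduces it to the major version:

  $ echo "Hadoop 2.4.1" | egrep "Hadoop [0-9]+.[0-9]+.[0-9]+" | cut -f 2 -d ' ' | cut -f 1 -d '.'
  2

so DFS ends up as "${HADOOP_HOME}/bin/hdfs dfs" and DFSRM as "${HADOOP_HOME}/bin/hdfs dfs -rm -r -skipTrash", which are the commands the reworked example scripts above rely on.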