You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ra...@apache.org on 2018/06/27 14:51:53 UTC
[25/51] [partial] mahout git commit: MAHOUT-2042 and MAHOUT-2045
Delete directories which were moved/no longer in use
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/text/doc/NumericFieldDocument.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/text/doc/NumericFieldDocument.java b/integration/src/test/java/org/apache/mahout/text/doc/NumericFieldDocument.java
deleted file mode 100644
index e06e8d6..0000000
--- a/integration/src/test/java/org/apache/mahout/text/doc/NumericFieldDocument.java
+++ /dev/null
@@ -1,54 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.text.doc;
-
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.IntField;
-import org.apache.lucene.document.StringField;
-import org.apache.lucene.document.TextField;
-
-/**
- * Document with numeric field.
- */
-@Deprecated
-public class NumericFieldDocument extends SingleFieldDocument {
-
- public static final String NUMERIC_FIELD = "numeric";
-
- private int numericField;
-
- public NumericFieldDocument(String id, String field, int numericField) {
- super(id, field);
- this.numericField = numericField;
- }
-
- @Override
- public Document asLuceneDocument() {
- Document document = new Document();
-
- document.add(new StringField(ID_FIELD, getId(), Field.Store.YES));
- document.add(new TextField(FIELD, getField(), Field.Store.YES));
- document.add(new IntField(NUMERIC_FIELD, numericField, Field.Store.YES));
-
- return document;
- }
-
- public int getNumericField() {
- return numericField;
- }
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/text/doc/SingleFieldDocument.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/text/doc/SingleFieldDocument.java b/integration/src/test/java/org/apache/mahout/text/doc/SingleFieldDocument.java
deleted file mode 100644
index 4636a51..0000000
--- a/integration/src/test/java/org/apache/mahout/text/doc/SingleFieldDocument.java
+++ /dev/null
@@ -1,63 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.text.doc;
-
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.StringField;
-import org.apache.lucene.document.TextField;
-
-/**
- * Used for testing lucene2seq
- */
-@Deprecated
-public class SingleFieldDocument implements TestDocument {
-
- public static final String ID_FIELD = "idField";
- public static final String FIELD = "field";
-
- private String id;
- private String field;
-
- public SingleFieldDocument(String id, String field) {
- this.id = id;
- this.field = field;
- }
-
- @Override
- public String getId() {
- return id;
- }
-
- @Override
- public String getField() {
- return field;
- }
-
- @Override
- public Document asLuceneDocument() {
- Document document = new Document();
-
- Field idField = new StringField(ID_FIELD, getId(), Field.Store.YES);
- Field field = new TextField(FIELD, getField(), Field.Store.YES);
-
- document.add(idField);
- document.add(field);
-
- return document;
- }
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/text/doc/TestDocument.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/text/doc/TestDocument.java b/integration/src/test/java/org/apache/mahout/text/doc/TestDocument.java
deleted file mode 100644
index 7243c71..0000000
--- a/integration/src/test/java/org/apache/mahout/text/doc/TestDocument.java
+++ /dev/null
@@ -1,29 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.text.doc;
-
-import org.apache.lucene.document.Document;
-@Deprecated
-public interface TestDocument {
-
- String getId();
-
- String getField();
-
- Document asLuceneDocument();
-
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/text/doc/UnstoredFieldsDocument.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/text/doc/UnstoredFieldsDocument.java b/integration/src/test/java/org/apache/mahout/text/doc/UnstoredFieldsDocument.java
deleted file mode 100644
index 6eb43f6..0000000
--- a/integration/src/test/java/org/apache/mahout/text/doc/UnstoredFieldsDocument.java
+++ /dev/null
@@ -1,43 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.text.doc;
-
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.StringField;
-
-/**
- * Used for testing lucene2seq
- */
-@Deprecated
-public class UnstoredFieldsDocument extends SingleFieldDocument {
-
- public static final String UNSTORED_FIELD = "unstored";
-
- public UnstoredFieldsDocument(String id, String field) {
- super(id, field);
- }
-
- @Override
- public Document asLuceneDocument() {
- Document document = super.asLuceneDocument();
-
- document.add(new StringField(UNSTORED_FIELD, "", Field.Store.NO));
-
- return document;
- }
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/utils/Bump125Test.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/utils/Bump125Test.java b/integration/src/test/java/org/apache/mahout/utils/Bump125Test.java
deleted file mode 100644
index 65b308f..0000000
--- a/integration/src/test/java/org/apache/mahout/utils/Bump125Test.java
+++ /dev/null
@@ -1,42 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils;
-
-import com.google.common.collect.Lists;
-
-import org.apache.mahout.common.MahoutTestCase;
-import org.junit.Test;
-
-import java.util.Iterator;
-
-public class Bump125Test extends MahoutTestCase {
- @Test
- public void testIncrement() throws Exception {
- Iterator<Integer> ref = Lists.newArrayList(1, 2, 3, 4, 5, 6, 7,
- 8, 9, 10, 12, 14, 16, 18, 20, 25, 30, 35, 40, 50, 60,
- 70, 80, 100, 120, 140, 160, 180, 200, 250, 300, 350,
- 400, 500, 600, 700, 800, 1000, 1200, 1400, 1600, 1800,
- 2000, 2500, 3000, 3500, 4000, 5000, 6000, 7000)
- .iterator();
- Bump125 b = new Bump125();
- for (int i = 0; i < 50; i++) {
- long x = b.increment();
- assertEquals(ref.next().longValue(), x);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/utils/SplitInputTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/utils/SplitInputTest.java b/integration/src/test/java/org/apache/mahout/utils/SplitInputTest.java
deleted file mode 100644
index 7ffa690..0000000
--- a/integration/src/test/java/org/apache/mahout/utils/SplitInputTest.java
+++ /dev/null
@@ -1,418 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils;
-
-import java.io.BufferedWriter;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
-import java.nio.charset.Charset;
-
-import com.google.common.io.Closeables;
-import org.apache.commons.io.Charsets;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.util.ToolRunner;
-import org.apache.mahout.classifier.ClassifierData;
-import org.apache.mahout.common.MahoutTestCase;
-import org.apache.mahout.common.Pair;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterable;
-import org.apache.mahout.math.SequentialAccessSparseVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorWritable;
-import org.apache.mahout.math.map.OpenObjectIntHashMap;
-import org.junit.Before;
-import org.junit.Test;
-
-public final class SplitInputTest extends MahoutTestCase {
-
- private OpenObjectIntHashMap<String> countMap;
- private Charset charset;
- private FileSystem fs;
- private Path tempInputFile;
- private Path tempTrainingDirectory;
- private Path tempTestDirectory;
- private Path tempMapRedOutputDirectory;
- private Path tempInputDirectory;
- private Path tempSequenceDirectory;
- private SplitInput si;
-
- @Override
- @Before
- public void setUp() throws Exception {
- Configuration conf = getConfiguration();
- fs = FileSystem.get(conf);
-
- super.setUp();
-
- countMap = new OpenObjectIntHashMap<>();
-
- charset = Charsets.UTF_8;
- tempSequenceDirectory = getTestTempFilePath("tmpsequence");
- tempInputFile = getTestTempFilePath("bayesinputfile");
- tempTrainingDirectory = getTestTempDirPath("bayestrain");
- tempTestDirectory = getTestTempDirPath("bayestest");
- tempMapRedOutputDirectory = new Path(getTestTempDirPath(), "mapRedOutput");
- tempInputDirectory = getTestTempDirPath("bayesinputdir");
-
- si = new SplitInput();
- si.setTrainingOutputDirectory(tempTrainingDirectory);
- si.setTestOutputDirectory(tempTestDirectory);
- si.setInputDirectory(tempInputDirectory);
- }
-
- private void writeMultipleInputFiles() throws IOException {
- Writer writer = null;
- String currentLabel = null;
- try {
- for (String[] entry : ClassifierData.DATA) {
- if (!entry[0].equals(currentLabel)) {
- currentLabel = entry[0];
- Closeables.close(writer, false);
-
- writer = new BufferedWriter(new OutputStreamWriter(fs.create(new Path(tempInputDirectory, currentLabel)),
- Charsets.UTF_8));
- }
- countMap.adjustOrPutValue(currentLabel, 1, 1);
- writer.write(currentLabel + '\t' + entry[1] + '\n');
- }
- }finally {
- Closeables.close(writer, false);
- }
- }
-
- private void writeSingleInputFile() throws IOException {
- Writer writer = new BufferedWriter(new OutputStreamWriter(fs.create(tempInputFile), Charsets.UTF_8));
- try {
- for (String[] entry : ClassifierData.DATA) {
- writer.write(entry[0] + '\t' + entry[1] + '\n');
- }
- } finally {
- Closeables.close(writer, true);
- }
- }
-
- @Test
- public void testSplitDirectory() throws Exception {
-
- writeMultipleInputFiles();
-
- final int testSplitSize = 1;
- si.setTestSplitSize(testSplitSize);
- si.setCallback(new SplitInput.SplitCallback() {
- @Override
- public void splitComplete(Path inputFile, int lineCount, int trainCount, int testCount, int testSplitStart) {
- int trainingLines = countMap.get(inputFile.getName()) - testSplitSize;
- assertSplit(fs, inputFile, charset, testSplitSize, trainingLines, tempTrainingDirectory, tempTestDirectory);
- }
- });
-
- si.splitDirectory(tempInputDirectory);
- }
-
- @Test
- public void testSplitFile() throws Exception {
- writeSingleInputFile();
- si.setTestSplitSize(2);
- si.setCallback(new TestCallback(2, 10));
- si.splitFile(tempInputFile);
- }
-
- @Test
- public void testSplitFileLocation() throws Exception {
- writeSingleInputFile();
- si.setTestSplitSize(2);
- si.setSplitLocation(50);
- si.setCallback(new TestCallback(2, 10));
- si.splitFile(tempInputFile);
- }
-
- @Test
- public void testSplitFilePct() throws Exception {
- writeSingleInputFile();
- si.setTestSplitPct(25);
-
- si.setCallback(new TestCallback(3, 9));
- si.splitFile(tempInputFile);
- }
-
- @Test
- public void testSplitFilePctLocation() throws Exception {
- writeSingleInputFile();
- si.setTestSplitPct(25);
- si.setSplitLocation(50);
- si.setCallback(new TestCallback(3, 9));
- si.splitFile(tempInputFile);
- }
-
- @Test
- public void testSplitFileRandomSelectionSize() throws Exception {
- writeSingleInputFile();
- si.setTestRandomSelectionSize(5);
-
- si.setCallback(new TestCallback(5, 7));
- si.splitFile(tempInputFile);
- }
-
- @Test
- public void testSplitFileRandomSelectionPct() throws Exception {
- writeSingleInputFile();
- si.setTestRandomSelectionPct(25);
-
- si.setCallback(new TestCallback(3, 9));
- si.splitFile(tempInputFile);
- }
-
- /**
- * Create a SequenceFile for testing consisting of IntWritable
- * keys and VectorWritable values
- * @param path path for test SequenceFile
- * @param testPoints number of records in test SequenceFile
- */
- private void writeVectorSequenceFile(Path path, int testPoints) throws IOException {
- Path tempSequenceFile = new Path(path, "part-00000");
- Configuration conf = getConfiguration();
- IntWritable key = new IntWritable();
- VectorWritable value = new VectorWritable();
- try (SequenceFile.Writer writer =
- SequenceFile.createWriter(fs, conf, tempSequenceFile, IntWritable.class, VectorWritable.class)) {
- for (int i = 0; i < testPoints; i++) {
- key.set(i);
- Vector v = new SequentialAccessSparseVector(4);
- v.assign(i);
- value.set(v);
- writer.append(key, value);
- }
- }
- }
-
- /**
- * Create a SequenceFile for testing consisting of Text keys and Text values
- * @param path path for test SequenceFile
- * @param testPoints number of records in test SequenceFile
- */
- private void writeTextSequenceFile(Path path, int testPoints) throws IOException {
- Path tempSequenceFile = new Path(path, "part-00000");
- Configuration conf = getConfiguration();
- Text key = new Text();
- Text value = new Text();
- try (SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, tempSequenceFile, Text.class, Text.class)){
- for (int i = 0; i < testPoints; i++) {
- key.set(Integer.toString(i));
- value.set("Line " + i);
- writer.append(key, value);
- }
- }
- }
-
- /**
- * Display contents of a SequenceFile
- * @param sequenceFilePath path to SequenceFile
- */
- private void displaySequenceFile(Path sequenceFilePath) throws IOException {
- for (Pair<?,?> record : new SequenceFileIterable<>(sequenceFilePath, true, getConfiguration())) {
- System.out.println(record.getFirst() + "\t" + record.getSecond());
- }
- }
-
- /**
- * Determine number of records in a SequenceFile
- * @param sequenceFilePath path to SequenceFile
- * @return number of records
- */
- private int getNumberRecords(Path sequenceFilePath) throws IOException {
- int numberRecords = 0;
- for (Object value : new SequenceFileValueIterable<>(sequenceFilePath, true, getConfiguration())) {
- numberRecords++;
- }
- return numberRecords;
- }
-
- /**
- * Test map reduce version of split input with Text, Text key value
- * pairs in input
- */
- @Test
- public void testSplitInputMapReduceText() throws Exception {
- writeTextSequenceFile(tempSequenceDirectory, 1000);
- testSplitInputMapReduce(1000);
- }
-
- /** Test map reduce version of split input with Text, Text key value pairs in input called from command line */
- @Test
- public void testSplitInputMapReduceTextCli() throws Exception {
- writeTextSequenceFile(tempSequenceDirectory, 1000);
- testSplitInputMapReduceCli(1000);
- }
-
- /**
- * Test map reduce version of split input with IntWritable, Vector key value
- * pairs in input
- */
- @Test
- public void testSplitInputMapReduceVector() throws Exception {
- writeVectorSequenceFile(tempSequenceDirectory, 1000);
- testSplitInputMapReduce(1000);
- }
-
- /**
- * Test map reduce version of split input with IntWritable, Vector key value
- * pairs in input called from command line
- */
- @Test
- public void testSplitInputMapReduceVectorCli() throws Exception {
- writeVectorSequenceFile(tempSequenceDirectory, 1000);
- testSplitInputMapReduceCli(1000);
- }
-
- /**
- * Test map reduce version of split input through CLI
- */
- private void testSplitInputMapReduceCli(int numPoints) throws Exception {
- int randomSelectionPct = 25;
- int keepPct = 10;
- String[] args =
- { "--method", "mapreduce", "--input", tempSequenceDirectory.toString(),
- "--mapRedOutputDir", tempMapRedOutputDirectory.toString(),
- "--randomSelectionPct", Integer.toString(randomSelectionPct),
- "--keepPct", Integer.toString(keepPct), "-ow" };
- ToolRunner.run(getConfiguration(), new SplitInput(), args);
- validateSplitInputMapReduce(numPoints, randomSelectionPct, keepPct);
- }
-
- /**
- * Test map reduce version of split input through method call
- */
- private void testSplitInputMapReduce(int numPoints) throws Exception {
- int randomSelectionPct = 25;
- si.setTestRandomSelectionPct(randomSelectionPct);
- int keepPct = 10;
- si.setKeepPct(keepPct);
- si.setMapRedOutputDirectory(tempMapRedOutputDirectory);
- si.setUseMapRed(true);
- si.splitDirectory(getConfiguration(), tempSequenceDirectory);
-
- validateSplitInputMapReduce(numPoints, randomSelectionPct, keepPct);
- }
-
- /**
- * Validate that number of test records and number of training records
- * are consistent with keepPct and randomSelectionPct
- */
- private void validateSplitInputMapReduce(int numPoints, int randomSelectionPct, int keepPct) throws IOException {
- Path testPath = new Path(tempMapRedOutputDirectory, "test-r-00000");
- Path trainingPath = new Path(tempMapRedOutputDirectory, "training-r-00000");
- int numberTestRecords = getNumberRecords(testPath);
- int numberTrainingRecords = getNumberRecords(trainingPath);
- System.out.printf("Test data: %d records\n", numberTestRecords);
- displaySequenceFile(testPath);
- System.out.printf("Training data: %d records\n", numberTrainingRecords);
- displaySequenceFile(trainingPath);
- assertEquals((randomSelectionPct / 100.0) * (keepPct / 100.0) * numPoints,
- numberTestRecords, 2);
- assertEquals(
- (1 - randomSelectionPct / 100.0) * (keepPct / 100.0) * numPoints,
- numberTrainingRecords, 2);
- }
-
- @Test
- public void testValidate() throws Exception {
- SplitInput st = new SplitInput();
- assertValidateException(st);
-
- st.setTestSplitSize(100);
- assertValidateException(st);
-
- st.setTestOutputDirectory(tempTestDirectory);
- assertValidateException(st);
-
- st.setTrainingOutputDirectory(tempTrainingDirectory);
- st.validate();
-
- st.setTestSplitPct(50);
- assertValidateException(st);
-
- st = new SplitInput();
- st.setTestRandomSelectionPct(50);
- st.setTestOutputDirectory(tempTestDirectory);
- st.setTrainingOutputDirectory(tempTrainingDirectory);
- st.validate();
-
- st.setTestSplitPct(50);
- assertValidateException(st);
-
- st = new SplitInput();
- st.setTestRandomSelectionPct(50);
- st.setTestOutputDirectory(tempTestDirectory);
- st.setTrainingOutputDirectory(tempTrainingDirectory);
- st.validate();
-
- st.setTestSplitSize(100);
- assertValidateException(st);
- }
-
- private class TestCallback implements SplitInput.SplitCallback {
- private final int testSplitSize;
- private final int trainingLines;
-
- private TestCallback(int testSplitSize, int trainingLines) {
- this.testSplitSize = testSplitSize;
- this.trainingLines = trainingLines;
- }
-
- @Override
- public void splitComplete(Path inputFile, int lineCount, int trainCount, int testCount, int testSplitStart) {
- assertSplit(fs, tempInputFile, charset, testSplitSize, trainingLines, tempTrainingDirectory, tempTestDirectory);
- }
- }
-
- private static void assertValidateException(SplitInput st) throws IOException {
- try {
- st.validate();
- fail("Expected IllegalArgumentException");
- } catch (IllegalArgumentException iae) {
- // good
- }
- }
-
- private static void assertSplit(FileSystem fs,
- Path tempInputFile,
- Charset charset,
- int testSplitSize,
- int trainingLines,
- Path tempTrainingDirectory,
- Path tempTestDirectory) {
-
- try {
- Path testFile = new Path(tempTestDirectory, tempInputFile.getName());
- //assertTrue("test file exists", testFile.isFile());
- assertEquals("test line count", testSplitSize, SplitInput.countLines(fs, testFile, charset));
-
- Path trainingFile = new Path(tempTrainingDirectory, tempInputFile.getName());
- //assertTrue("training file exists", trainingFile.isFile());
- assertEquals("training line count", trainingLines, SplitInput.countLines(fs, trainingFile, charset));
- } catch (IOException ioe) {
- fail(ioe.toString());
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/utils/email/MailProcessorTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/utils/email/MailProcessorTest.java b/integration/src/test/java/org/apache/mahout/utils/email/MailProcessorTest.java
deleted file mode 100644
index c519f85..0000000
--- a/integration/src/test/java/org/apache/mahout/utils/email/MailProcessorTest.java
+++ /dev/null
@@ -1,72 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.email;
-
-import java.io.File;
-import java.io.StringWriter;
-import java.net.URL;
-import java.util.regex.Pattern;
-
-import org.apache.commons.io.Charsets;
-import org.apache.mahout.common.MahoutTestCase;
-import org.junit.Test;
-
-public final class MailProcessorTest extends MahoutTestCase {
-
- @Test
- public void testLabel() throws Exception {
- StringWriter writer = new StringWriter();
- MailOptions options = new MailOptions();
- options.setSeparator(":::");
- options.setCharset(Charsets.UTF_8);
- options.setPatternsToMatch(new Pattern[]{
- MailProcessor.FROM_PREFIX, MailProcessor.SUBJECT_PREFIX, MailProcessor.TO_PREFIX});
- options.setInput(new File(System.getProperty("user.dir")));
- MailProcessor proc = new MailProcessor(options, "", writer);
- URL url = MailProcessorTest.class.getClassLoader().getResource("test.mbox");
- File file = new File(url.toURI());
- long count = proc.parseMboxLineByLine(file);
- assertEquals(7, count);
- }
-
- @Test
- public void testStripQuoted() throws Exception {
- StringWriter writer = new StringWriter();
- MailOptions options = new MailOptions();
- options.setSeparator(":::");
- options.setCharset(Charsets.UTF_8);
- options.setPatternsToMatch(new Pattern[]{
- MailProcessor.SUBJECT_PREFIX});
- options.setInput(new File(System.getProperty("user.dir")));
- options.setIncludeBody(true);
- MailProcessor proc = new MailProcessor(options, "", writer);
- URL url = MailProcessorTest.class.getClassLoader().getResource("test.mbox");
- File file = new File(url.toURI());
- long count = proc.parseMboxLineByLine(file);
- assertEquals(7, count);
- assertTrue(writer.getBuffer().toString().contains("> Cocoon Cron Block Configurable Clustering"));
- writer = new StringWriter();
- proc = new MailProcessor(options, "", writer);
- options.setStripQuotedText(true);
- count = proc.parseMboxLineByLine(file);
- assertEquals(7, count);
- assertFalse(writer.getBuffer().toString().contains("> Cocoon Cron Block Configurable Clustering"));
-
- }
-
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java b/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java
deleted file mode 100644
index 4fdbbbc..0000000
--- a/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java
+++ /dev/null
@@ -1,154 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.mahout.utils.nlp.collocations.llr;
-
-import java.io.IOException;
-import java.io.Reader;
-import java.io.StringReader;
-import java.nio.ByteBuffer;
-import java.nio.CharBuffer;
-import java.nio.charset.CharsetEncoder;
-
-import org.apache.commons.io.Charsets;
-import org.apache.hadoop.util.bloom.BloomFilter;
-import org.apache.hadoop.util.bloom.Filter;
-import org.apache.hadoop.util.bloom.Key;
-import org.apache.hadoop.util.hash.Hash;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
-import org.apache.lucene.analysis.shingle.ShingleFilter;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.mahout.common.MahoutTestCase;
-import org.junit.Test;
-
-public final class BloomTokenFilterTest extends MahoutTestCase {
-
- private static final CharsetEncoder encoder = Charsets.UTF_8.newEncoder();
-
- private static final String input = "The best of times the worst of times";
- private static final String[] allTokens = {
- "The", "best", "of", "times", "the", "worst", "of", "times"
- };
- private static final String[] expectedNonKeepTokens = { "best", "times", "the", "worst", "times" };
- private static final String[] expectedKeepTokens = { "The", "of", "of" };
- private static final String[] filterTokens = { "The", "of" };
- private static final String[] notFilterTokens = { "best", "worst", "the", "times"};
- private static final String[] shingleKeepTokens = {
- "The best", "best of times", "the worst", "worst of times", "of times"
- };
- private static final String[] expectedShingleTokens = {
- "The best", "best of times", "of times", "the worst", "worst of times", "of times"
- };
-
- /** test standalone filter without tokenfilter wrapping */
- @Test
- public void testFilter() throws IOException {
- Filter filter = getFilter(filterTokens);
- Key k = new Key();
- for (String s: filterTokens) {
- setKey(k,s);
- assertTrue("Key for string " + s + " should be filter member", filter.membershipTest(k));
- }
-
- for (String s: notFilterTokens) {
- setKey(k,s);
- assertFalse("Key for string " + s + " should not be filter member", filter.membershipTest(k));
- }
- }
-
- /** normal case, unfiltered analyzer */
- @Test
- public void testAnalyzer() throws IOException {
- Reader reader = new StringReader(input);
- Analyzer analyzer = new WhitespaceAnalyzer();
- TokenStream ts = analyzer.tokenStream(null, reader);
- ts.reset();
- validateTokens(allTokens, ts);
- ts.end();
- ts.close();
- }
-
- /** filtered analyzer */
- @Test
- public void testNonKeepdAnalyzer() throws IOException {
- Reader reader = new StringReader(input);
- Analyzer analyzer = new WhitespaceAnalyzer();
- TokenStream ts = analyzer.tokenStream(null, reader);
- ts.reset();
- TokenStream f = new BloomTokenFilter(getFilter(filterTokens), false /* toss matching tokens */, ts);
- validateTokens(expectedNonKeepTokens, f);
- ts.end();
- ts.close();
- }
-
- /** keep analyzer */
- @Test
- public void testKeepAnalyzer() throws IOException {
- Reader reader = new StringReader(input);
- Analyzer analyzer = new WhitespaceAnalyzer();
- TokenStream ts = analyzer.tokenStream(null, reader);
- ts.reset();
- TokenStream f = new BloomTokenFilter(getFilter(filterTokens), true /* keep matching tokens */, ts);
- validateTokens(expectedKeepTokens, f);
- ts.end();
- ts.close();
- }
-
- /** shingles, keep those matching whitelist */
- @Test
- public void testShingleFilteredAnalyzer() throws IOException {
- Reader reader = new StringReader(input);
- Analyzer analyzer = new WhitespaceAnalyzer();
- TokenStream ts = analyzer.tokenStream(null, reader);
- ts.reset();
- ShingleFilter sf = new ShingleFilter(ts, 3);
- TokenStream f = new BloomTokenFilter(getFilter(shingleKeepTokens), true, sf);
- validateTokens(expectedShingleTokens, f);
- ts.end();
- ts.close();
- }
-
- private static void setKey(Key k, String s) throws IOException {
- ByteBuffer buffer = encoder.encode(CharBuffer.wrap(s.toCharArray()));
- k.set(buffer.array(), 1.0);
- }
-
- private static void validateTokens(String[] expected, TokenStream ts) throws IOException {
- int pos = 0;
- while (ts.incrementToken()) {
- assertTrue("Analyzer produced too many tokens", pos <= expected.length);
- CharTermAttribute termAttr = ts.getAttribute(CharTermAttribute.class);
- assertEquals("Unexpected term", expected[pos++], termAttr.toString());
- }
- assertEquals("Analyzer produced too few terms", expected.length, pos);
- }
-
- private static Filter getFilter(String[] tokens) throws IOException {
- Filter filter = new BloomFilter(100,50, Hash.JENKINS_HASH);
- Key k = new Key();
- for (String s: tokens) {
- setKey(k,s);
- filter.add(k);
- }
- return filter;
- }
-
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/utils/regex/RegexMapperTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/utils/regex/RegexMapperTest.java b/integration/src/test/java/org/apache/mahout/utils/regex/RegexMapperTest.java
deleted file mode 100644
index 8ab643b..0000000
--- a/integration/src/test/java/org/apache/mahout/utils/regex/RegexMapperTest.java
+++ /dev/null
@@ -1,104 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.regex;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Mapper;
-import org.apache.mahout.common.DummyRecordWriter;
-import org.apache.mahout.common.MahoutTestCase;
-import org.junit.Test;
-
-import java.util.List;
-
-public final class RegexMapperTest extends MahoutTestCase {
-
- @Test
- public void testRegex() throws Exception {
- RegexMapper mapper = new RegexMapper();
- Configuration conf = getConfiguration();
- conf.set(RegexMapper.REGEX, "(?<=(\\?|&)q=).*?(?=&|$)");
- conf.set(RegexMapper.TRANSFORMER_CLASS, URLDecodeTransformer.class.getName());
- DummyRecordWriter<LongWritable, Text> mapWriter = new DummyRecordWriter<>();
- Mapper<LongWritable, Text, LongWritable, Text>.Context mapContext = DummyRecordWriter
- .build(mapper, conf, mapWriter);
-
- mapper.setup(mapContext);
- for (int i = 0; i < RegexUtilsTest.TEST_STRS.length; i++) {
- String testStr = RegexUtilsTest.TEST_STRS[i];
-
- LongWritable key = new LongWritable(i);
- mapper.map(key, new Text(testStr), mapContext);
- List<Text> value = mapWriter.getValue(key);
- if (!RegexUtilsTest.GOLD[i].isEmpty()) {
- assertEquals(1, value.size());
- assertEquals(RegexUtilsTest.GOLD[i], value.get(0).toString());
- }
- }
- }
-
- @Test
- public void testGroups() throws Exception {
- RegexMapper mapper = new RegexMapper();
- Configuration conf = getConfiguration();
- conf.set(RegexMapper.REGEX, "(\\d+)\\.(\\d+)\\.(\\d+)");
- conf.set(RegexMapper.TRANSFORMER_CLASS, URLDecodeTransformer.class.getName());
- conf.setStrings(RegexMapper.GROUP_MATCHERS, "1", "3");
- DummyRecordWriter<LongWritable, Text> mapWriter = new DummyRecordWriter<>();
- Mapper<LongWritable, Text, LongWritable, Text>.Context mapContext = DummyRecordWriter
- .build(mapper, conf, mapWriter);
-
- mapper.setup(mapContext);
- for (int i = 0; i < RegexUtilsTest.TEST_STRS.length; i++) {
- String testStr = RegexUtilsTest.TEST_STRS[i];
-
- LongWritable key = new LongWritable(i);
- mapper.map(key, new Text(testStr), mapContext);
- List<Text> value = mapWriter.getValue(key);
- assertEquals(1, value.size());
- assertEquals("127 0", value.get(0).toString());
- }
- }
-
- @Test
- public void testFPGFormatter() throws Exception {
- RegexMapper mapper = new RegexMapper();
- Configuration conf = getConfiguration();
- conf.set(RegexMapper.REGEX, "(?<=(\\?|&)q=).*?(?=&|$)");
- conf.set(RegexMapper.TRANSFORMER_CLASS, URLDecodeTransformer.class.getName());
- conf.set(RegexMapper.FORMATTER_CLASS, FPGFormatter.class.getName());
- DummyRecordWriter<LongWritable, Text> mapWriter = new DummyRecordWriter<>();
- Mapper<LongWritable, Text, LongWritable, Text>.Context mapContext = DummyRecordWriter
- .build(mapper, conf, mapWriter);
-
- mapper.setup(mapContext);
- RegexFormatter formatter = new FPGFormatter();
- for (int i = 0; i < RegexUtilsTest.TEST_STRS.length; i++) {
- String testStr = RegexUtilsTest.TEST_STRS[i];
-
- LongWritable key = new LongWritable(i);
- mapper.map(key, new Text(testStr), mapContext);
- List<Text> value = mapWriter.getValue(key);
- if (!RegexUtilsTest.GOLD[i].isEmpty()) {
- assertEquals(1, value.size());
- assertEquals(formatter.format(RegexUtilsTest.GOLD[i]), value.get(0).toString());
- }
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/utils/regex/RegexUtilsTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/utils/regex/RegexUtilsTest.java b/integration/src/test/java/org/apache/mahout/utils/regex/RegexUtilsTest.java
deleted file mode 100644
index 8ae10a5..0000000
--- a/integration/src/test/java/org/apache/mahout/utils/regex/RegexUtilsTest.java
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.regex;
-
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.regex.Pattern;
-
-import org.apache.mahout.common.MahoutTestCase;
-import org.junit.Test;
-
-public final class RegexUtilsTest extends MahoutTestCase {
-
- static final String[] TEST_STRS = {
- "127.0.0.1 - - [01/10/2011:00:01:51 +0000] \"GET /solr/collection1/browse?q=foo&rows=10&wt=json&hl=true&hl.fl=body&hl.fl=content",
- "127.0.0.1 - - [01/10/2011:00:20:58 +0000] \"GET /solr/collection1/browse?q=Using+Solr+Search+RDBMS&fq=%7B%21tag%3Dsource%7D%28%28source%3Alucid+AND+lucid_facet%3A%28site%29%29%29&rows=10",
- "127.0.0.1 - - [01/10/2011:00:21:21 +0000] \"GET /solr/collection1/browse?q=language+detection&start=560&rows=10 HTTP/1.1\" 200 45071",
- "127.0.0.1 - - [01/10/2011:00:21:21 +0000] \"GET /solr/collection1/browse?q=&start=560&rows=10 HTTP/1.1\" 200 45071"
- };
- static final String[] GOLD = {"foo", "Using Solr Search RDBMS", "language detection", ""};
-
- @Test
- public void testExtract() throws Exception {
- Pattern pattern = Pattern.compile("(?<=(\\?|&)q=).*?(?=&|$)");
- String line = "127.0.0.1 - - [24/05/2010:01:19:22 +0000] \"GET /solr/select?q=import statement&start=1 HTTP/1.1\" 200 37571";
- String res = RegexUtils.extract(line, pattern, Collections.<Integer>emptyList(), " ", RegexUtils.IDENTITY_TRANSFORMER);
- assertEquals(res, "import statement", res);
-
- for (int i = 0; i < TEST_STRS.length; i++) {
- String testStr = TEST_STRS[i];
- res = RegexUtils.extract(testStr, pattern, Collections.<Integer>emptyList(), " ", new URLDecodeTransformer());
- assertEquals(GOLD[i], res);
- }
-
- pattern = Pattern.compile("((?<=(\\?|&)q=)(.*?)(?=(&|$))|(?<=((\\?|&)start=))(\\d+))");
- res = RegexUtils.extract(line, pattern, Collections.<Integer>emptyList(), " ", RegexUtils.IDENTITY_TRANSFORMER);
- assertEquals(res, "import statement 1", res);
-
- pattern = Pattern.compile("(start=1) HTTP");
- Collection<Integer> groupsToKeep = new ArrayList<>();
- groupsToKeep.add(1);
- res = RegexUtils.extract(line, pattern, groupsToKeep, " ", RegexUtils.IDENTITY_TRANSFORMER);
- assertEquals(res, "start=1", res);
- }
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/utils/vectors/RandomVectorIterable.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/utils/vectors/RandomVectorIterable.java b/integration/src/test/java/org/apache/mahout/utils/vectors/RandomVectorIterable.java
deleted file mode 100644
index 2ddce14..0000000
--- a/integration/src/test/java/org/apache/mahout/utils/vectors/RandomVectorIterable.java
+++ /dev/null
@@ -1,73 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors;
-
-import java.util.Iterator;
-import java.util.Random;
-
-import com.google.common.base.Function;
-import com.google.common.collect.Iterators;
-import org.apache.mahout.common.RandomUtils;
-import org.apache.mahout.common.iterator.CountingIterator;
-import org.apache.mahout.math.DenseVector;
-import org.apache.mahout.math.RandomAccessSparseVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.function.DoubleFunction;
-
-public final class RandomVectorIterable implements Iterable<Vector> {
-
- public enum VectorType {DENSE, SPARSE}
-
- private final int numItems;
- private final VectorType type;
-
- public RandomVectorIterable() {
- this(100, VectorType.SPARSE);
- }
-
- public RandomVectorIterable(int numItems) {
- this(numItems, VectorType.SPARSE);
- }
-
- public RandomVectorIterable(int numItems, VectorType type) {
- this.numItems = numItems;
- this.type = type;
- }
-
- @Override
- public Iterator<Vector> iterator() {
- return Iterators.transform(
- new CountingIterator(numItems),
- new Function<Integer, Vector>() {
- private final Random random = RandomUtils.getRandom();
- @Override
- public Vector apply(Integer dummy) {
- Vector result =
- type == VectorType.SPARSE ? new RandomAccessSparseVector(numItems) : new DenseVector(numItems);
- result.assign(new DoubleFunction() {
- @Override
- public double apply(double ignored) {
- return random.nextDouble();
- }
- });
- return result;
- }
- });
- }
-
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/utils/vectors/VectorHelperTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/utils/vectors/VectorHelperTest.java b/integration/src/test/java/org/apache/mahout/utils/vectors/VectorHelperTest.java
deleted file mode 100644
index c55fd8d..0000000
--- a/integration/src/test/java/org/apache/mahout/utils/vectors/VectorHelperTest.java
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors;
-
-import java.util.Random;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.io.Text;
-import org.apache.mahout.common.MahoutTestCase;
-import org.apache.mahout.common.RandomUtils;
-import org.apache.mahout.math.SequentialAccessSparseVector;
-import org.apache.mahout.math.Vector;
-import org.junit.Before;
-import org.junit.Test;
-
-public final class VectorHelperTest extends MahoutTestCase {
-
- private static final int NUM_DOCS = 100;
-
- private Path inputPathOne;
- private Path inputPathTwo;
-
- private Configuration conf;
-
- @Override
- @Before
- public void setUp() throws Exception {
- super.setUp();
- conf = getConfiguration();
-
- inputPathOne = getTestTempFilePath("documents/docs-one.file");
- FileSystem fs = FileSystem.get(inputPathOne.toUri(), conf);
- try (SequenceFile.Writer writer =
- new SequenceFile.Writer(fs, conf, inputPathOne, Text.class, IntWritable.class)) {
- Random rd = RandomUtils.getRandom();
- for (int i = 0; i < NUM_DOCS; i++) {
- // Make all indices higher than dictionary size
- writer.append(new Text("Document::ID::" + i), new IntWritable(NUM_DOCS + rd.nextInt(NUM_DOCS)));
- }
- }
-
- inputPathTwo = getTestTempFilePath("documents/docs-two.file");
- fs = FileSystem.get(inputPathTwo.toUri(), conf);
- try (SequenceFile.Writer writer =
- new SequenceFile.Writer(fs, conf, inputPathTwo, Text.class, IntWritable.class)) {
- Random rd = RandomUtils.getRandom();
- for (int i = 0; i < NUM_DOCS; i++) {
- // Keep indices within number of documents
- writer.append(new Text("Document::ID::" + i), new IntWritable(rd.nextInt(NUM_DOCS)));
- }
- }
- }
-
- @Test
- public void testJsonFormatting() throws Exception {
- Vector v = new SequentialAccessSparseVector(10);
- v.set(2, 3.1);
- v.set(4, 1.0);
- v.set(6, 8.1);
- v.set(7, -100);
- v.set(9, 12.2);
- String UNUSED = "UNUSED";
- String[] dictionary = {
- UNUSED, UNUSED, "two", UNUSED, "four", UNUSED, "six", "seven", UNUSED, "nine"
- };
-
- assertEquals("sorted json form incorrect: ", "{nine:12.2,six:8.1,two:3.1}",
- VectorHelper.vectorToJson(v, dictionary, 3, true));
- assertEquals("unsorted form incorrect: ", "{two:3.1,four:1.0}",
- VectorHelper.vectorToJson(v, dictionary, 2, false));
- assertEquals("sorted json form incorrect: ", "{nine:12.2,six:8.1,two:3.1,four:1.0}",
- VectorHelper.vectorToJson(v, dictionary, 4, true));
- assertEquals("sorted json form incorrect: ", "{nine:12.2,six:8.1,two:3.1,four:1.0,seven:-100.0}",
- VectorHelper.vectorToJson(v, dictionary, 5, true));
- assertEquals("sorted json form incorrect: ", "{nine:12.2,six:8.1}",
- VectorHelper.vectorToJson(v, dictionary, 2, true));
- assertEquals("unsorted form incorrect: ", "{two:3.1,four:1.0}",
- VectorHelper.vectorToJson(v, dictionary, 2, false));
- }
-
- @Test
- public void testTopEntries() throws Exception {
- Vector v = new SequentialAccessSparseVector(10);
- v.set(2, 3.1);
- v.set(4, 1.0);
- v.set(6, 8.1);
- v.set(7, -100);
- v.set(9, 12.2);
- v.set(1, 0.0);
- v.set(3, 0.0);
- v.set(8, 2.7);
- // check if sizeOFNonZeroElementsInVector = maxEntries
- assertEquals(6, VectorHelper.topEntries(v, 6).size());
- // check if sizeOfNonZeroElementsInVector < maxEntries
- assertTrue(VectorHelper.topEntries(v, 9).size() < 9);
- // check if sizeOfNonZeroElementsInVector > maxEntries
- assertTrue(VectorHelper.topEntries(v, 5).size() < v.getNumNonZeroElements());
- }
-
- @Test
- public void testTopEntriesWhenAllZeros() throws Exception {
- Vector v = new SequentialAccessSparseVector(10);
- v.set(2, 0.0);
- v.set(4, 0.0);
- v.set(6, 0.0);
- v.set(7, 0);
- v.set(9, 0.0);
- v.set(1, 0.0);
- v.set(3, 0.0);
- v.set(8, 0.0);
- assertEquals(0, VectorHelper.topEntries(v, 6).size());
- }
-
- @Test
- public void testLoadTermDictionary() throws Exception {
- // With indices higher than dictionary size
- VectorHelper.loadTermDictionary(conf, inputPathOne.toString());
- // With dictionary size higher than indices
- VectorHelper.loadTermDictionary(conf, inputPathTwo.toString());
- }
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFTypeTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFTypeTest.java b/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFTypeTest.java
deleted file mode 100644
index 2ea8b89..0000000
--- a/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFTypeTest.java
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.arff;
-
-import org.apache.mahout.common.MahoutTestCase;
-import org.junit.Test;
-
-public final class ARFFTypeTest extends MahoutTestCase {
-
- @Test
- public void removeQuotes() {
- assertNull(ARFFType.removeQuotes(null));
- assertEquals("", ARFFType.removeQuotes("\"\""));
- assertEquals("", ARFFType.removeQuotes("''"));
- assertEquals("", ARFFType.removeQuotes(""));
- assertEquals("", ARFFType.removeQuotes(" "));
- assertEquals("single", ARFFType.removeQuotes("'single'"));
- assertEquals("double", ARFFType.removeQuotes("\"double\""));
- assertEquals("trim", ARFFType.removeQuotes(" trim "));
- }
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java b/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java
deleted file mode 100644
index 4c7f17a..0000000
--- a/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java
+++ /dev/null
@@ -1,289 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.arff;
-
-import java.io.IOException;
-import java.text.DateFormat;
-import java.text.SimpleDateFormat;
-import java.util.Date;
-import java.util.Iterator;
-import java.util.Locale;
-import java.util.Map;
-
-import com.google.common.io.Resources;
-import org.apache.commons.io.Charsets;
-import org.apache.mahout.common.MahoutTestCase;
-import org.apache.mahout.math.DenseVector;
-import org.apache.mahout.math.RandomAccessSparseVector;
-import org.apache.mahout.math.Vector;
-import org.junit.Test;
-
-public final class ARFFVectorIterableTest extends MahoutTestCase {
-
- @Test
- public void testValues() throws Exception {
- ARFFVectorIterable iterable = readModelFromResource("sample.arff");
-
- assertEquals("Mahout", iterable.getModel().getRelation());
- Map<String, Integer> bindings = iterable.getModel().getLabelBindings();
- assertNotNull(bindings);
- assertEquals(5, bindings.size());
- Iterator<Vector> iter = iterable.iterator();
- assertTrue(iter.hasNext());
- Vector next = iter.next();
- assertNotNull(next);
- assertTrue("Wrong instanceof", next instanceof DenseVector);
- assertEquals(1.0, next.get(0), EPSILON);
- assertEquals(2.0, next.get(1), EPSILON);
- assertTrue(iter.hasNext());
- next = iter.next();
- assertNotNull(next);
- assertTrue("Wrong instanceof", next instanceof DenseVector);
- assertEquals(2.0, next.get(0), EPSILON);
- assertEquals(3.0, next.get(1), EPSILON);
-
- assertTrue(iter.hasNext());
- next = iter.next();
- assertNotNull(next);
- assertTrue("Wrong instanceof", next instanceof RandomAccessSparseVector);
- assertEquals(5.0, next.get(0), EPSILON);
- assertEquals(23.0, next.get(1), EPSILON);
-
- assertFalse(iter.hasNext());
- }
-
- @Test
- public void testDense() throws Exception {
- Iterable<Vector> iterable = readModelFromResource("sample-dense.arff");
- Vector firstVector = iterable.iterator().next();
- assertEquals(1.0, firstVector.get(0), 0);
- assertEquals(65.0, firstVector.get(1), 0);
- assertEquals(1.0, firstVector.get(3), 0);
- assertEquals(1.0, firstVector.get(4), 0);
-
- int count = 0;
- for (Vector vector : iterable) {
- assertTrue("Vector is not dense", vector instanceof DenseVector);
- count++;
- }
- assertEquals(5, count);
- }
-
- @Test
- public void testSparse() throws Exception {
- Iterable<Vector> iterable = readModelFromResource("sample-sparse.arff");
-
- Vector firstVector = iterable.iterator().next();
- assertEquals(23.1, firstVector.get(1), 0);
- assertEquals(3.23, firstVector.get(2), 0);
- assertEquals(1.2, firstVector.get(3), 0);
-
- int count = 0;
- for (Vector vector : iterable) {
- assertTrue("Vector is not dense", vector instanceof RandomAccessSparseVector);
- count++;
- }
- assertEquals(9, count);
- }
-
- @Test
- public void testNonNumeric() throws Exception {
- MapBackedARFFModel model = new MapBackedARFFModel();
- ARFFVectorIterable iterable = getVectors("non-numeric-1.arff", model);
- int count = 0;
- for (Vector vector : iterable) {
- assertTrue("Vector is not dense", vector instanceof RandomAccessSparseVector);
- count++;
- }
-
- iterable = getVectors("non-numeric-1.arff", model);
- Iterator<Vector> iter = iterable.iterator();
- Vector firstVector = iter.next();
-
- assertEquals(1.0, firstVector.get(2), 0);
-
- assertEquals(10, count);
- Map<String, Map<String, Integer>> nominalMap = iterable.getModel().getNominalMap();
- assertNotNull(nominalMap);
- assertEquals(1, nominalMap.size());
- Map<String, Integer> noms = nominalMap.get("bar");
- assertNotNull("nominals for bar are null", noms);
- assertEquals(5, noms.size());
- Map<Integer, ARFFType> integerARFFTypeMap = model.getTypeMap();
- assertNotNull("Type map null", integerARFFTypeMap);
- assertEquals(5, integerARFFTypeMap.size());
- Map<String, Long> words = model.getWords();
- assertNotNull("words null", words);
- assertEquals(10, words.size());
- Map<Integer, DateFormat> integerDateFormatMap = model.getDateMap();
- assertNotNull("date format null", integerDateFormatMap);
- assertEquals(1, integerDateFormatMap.size());
- }
-
- @Test
- public void testDate() throws Exception {
- ARFFVectorIterable iterable = readModelFromResource("date.arff");
- Iterator<Vector> iter = iterable.iterator();
- Vector firstVector = iter.next();
-
- DateFormat format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.ENGLISH);
- Date date = format.parse("2001-07-04T12:08:56");
- long result = date.getTime();
- assertEquals(result, firstVector.get(1), 0);
-
- format = new SimpleDateFormat("yyyy.MM.dd G 'at' HH:mm:ss z", Locale.ENGLISH);
- date = format.parse("2001.07.04 AD at 12:08:56 PDT");
- result = date.getTime();
- assertEquals(result, firstVector.get(2), 0);
-
- format = new SimpleDateFormat("EEE, MMM d, ''yy", Locale.ENGLISH);
- date = format.parse("Wed, Jul 4, '01,4 0:08 PM, PDT");
- result = date.getTime();
- assertEquals(result, firstVector.get(3), 0);
-
- format = new SimpleDateFormat("K:mm a, z", Locale.ENGLISH);
- date = format.parse("0:08 PM, PDT");
- result = date.getTime();
- assertEquals(result, firstVector.get(4), 0);
-
- format = new SimpleDateFormat("yyyyy.MMMMM.dd GGG hh:mm aaa", Locale.ENGLISH);
- date = format.parse("02001.July.04 AD 12:08 PM");
- result = date.getTime();
- assertEquals(result, firstVector.get(5), 0);
-
- format = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss Z", Locale.ENGLISH);
- date = format.parse("Wed, 4 Jul 2001 12:08:56 -0700");
- result = date.getTime();
- assertEquals(result, firstVector.get(6), 0);
-
- }
-
- @Test
- public void testMultipleNoms() throws Exception {
- MapBackedARFFModel model = new MapBackedARFFModel();
- ARFFVectorIterable iterable = getVectors("non-numeric-1.arff", model);
- int count = 0;
- for (Vector vector : iterable) {
- assertTrue("Vector is not dense", vector instanceof RandomAccessSparseVector);
- count++;
- }
- assertEquals(10, count);
- Map<String,Map<String,Integer>> nominalMap = iterable.getModel().getNominalMap();
- assertNotNull(nominalMap);
- assertEquals(1, nominalMap.size());
- Map<String,Integer> noms = nominalMap.get("bar");
- assertNotNull("nominals for bar are null", noms);
- assertEquals(5, noms.size());
- Map<Integer,ARFFType> integerARFFTypeMap = model.getTypeMap();
- assertNotNull("Type map null", integerARFFTypeMap);
- assertEquals(5, integerARFFTypeMap.size());
- Map<String,Long> words = model.getWords();
- assertNotNull("words null", words);
- assertEquals(10, words.size());
-
- Map<Integer,DateFormat> integerDateFormatMap = model.getDateMap();
- assertNotNull("date format null", integerDateFormatMap);
- assertEquals(1, integerDateFormatMap.size());
-
-
- iterable = getVectors("non-numeric-2.arff", model);
- count = 0;
- for (Vector vector : iterable) {
- assertTrue("Vector is not dense", vector instanceof RandomAccessSparseVector);
- count++;
- }
- nominalMap = model.getNominalMap();
- assertNotNull(nominalMap);
- assertEquals(2, nominalMap.size());
- noms = nominalMap.get("test");
- assertNotNull("nominals for bar are null", noms);
- assertEquals(2, noms.size());
- }
-
- @Test
- public void testNumerics() throws Exception {
- String arff = "@RELATION numerics\n"
- + "@ATTRIBUTE theNumeric NUMERIC\n"
- + "@ATTRIBUTE theInteger INTEGER\n"
- + "@ATTRIBUTE theReal REAL\n"
- + "@DATA\n"
- + "1.0,2,3.0";
- ARFFModel model = new MapBackedARFFModel();
- ARFFVectorIterable iterable = new ARFFVectorIterable(arff, model);
- model = iterable.getModel();
- assertNotNull(model);
- assertEquals(3, model.getLabelSize());
- assertEquals(ARFFType.NUMERIC, model.getARFFType(0));
- assertEquals(ARFFType.INTEGER, model.getARFFType(1));
- assertEquals(ARFFType.REAL, model.getARFFType(2));
- Iterator<Vector> it = iterable.iterator();
- Vector vector = it.next();
- assertEquals(1.0, vector.get(0), EPSILON);
- assertEquals(2.0, vector.get(1), EPSILON);
- assertEquals(3.0, vector.get(2), EPSILON);
- }
-
- @Test
- public void testQuotes() throws Exception {
- // ARFF allows quotes on identifiers
- ARFFModel model = new MapBackedARFFModel();
- ARFFVectorIterable iterable = getVectors("quoted-id.arff", model);
- model = iterable.getModel();
- assertNotNull(model);
- assertEquals("quotes", model.getRelation());
-
- // check attribute labels
- assertEquals(4, model.getLabelSize());
- assertEquals(ARFFType.NUMERIC, model.getARFFType(0));
- assertEquals(ARFFType.INTEGER, model.getARFFType(1));
- assertEquals(ARFFType.REAL, model.getARFFType(2));
- assertEquals(ARFFType.NOMINAL, model.getARFFType(3));
-
- Map<String, Integer> labelBindings = model.getLabelBindings();
- assertTrue(labelBindings.keySet().contains("thenumeric"));
- assertTrue(labelBindings.keySet().contains("theinteger"));
- assertTrue(labelBindings.keySet().contains("thereal"));
- assertTrue(labelBindings.keySet().contains("thenominal"));
-
- // check nominal values
- Map<String, Integer> nominalMap = model.getNominalMap().get("thenominal");
- assertNotNull(nominalMap);
- assertEquals(3, nominalMap.size());
- assertTrue(nominalMap.keySet().contains("double-quote"));
- assertTrue(nominalMap.keySet().contains("single-quote"));
- assertTrue(nominalMap.keySet().contains("no-quote"));
-
- // check data values
- Iterator<Vector> it = iterable.iterator();
- Vector vector = it.next();
- assertEquals(nominalMap.get("no-quote"), vector.get(3), EPSILON);
- assertEquals(nominalMap.get("single-quote"), it.next().get(3), EPSILON);
- assertEquals(nominalMap.get("double-quote"), it.next().get(3), EPSILON);
- }
-
- static ARFFVectorIterable getVectors(String resourceName, ARFFModel model) throws IOException {
- String sample = Resources.toString(Resources.getResource(resourceName), Charsets.UTF_8);
- return new ARFFVectorIterable(sample, model);
- }
-
- private static ARFFVectorIterable readModelFromResource(String resourceName) throws IOException {
- ARFFModel model = new MapBackedARFFModel();
- return getVectors(resourceName, model);
- }
-
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/utils/vectors/arff/DriverTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/utils/vectors/arff/DriverTest.java b/integration/src/test/java/org/apache/mahout/utils/vectors/arff/DriverTest.java
deleted file mode 100644
index 7e7623e..0000000
--- a/integration/src/test/java/org/apache/mahout/utils/vectors/arff/DriverTest.java
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.utils.vectors.arff;
-
-import java.io.IOException;
-import java.io.StringWriter;
-
-import com.google.common.io.Resources;
-import org.apache.commons.io.Charsets;
-import org.apache.mahout.common.MahoutTestCase;
-import org.junit.Test;
-
-/**
- * Test case for {@link Driver}
- */
-public class DriverTest extends MahoutTestCase {
-
- @Test
- public void dictionary() throws IOException {
-
- ARFFModel model = new MapBackedARFFModel();
- ARFFVectorIterableTest.getVectors("sample-dense.arff", model);
- StringWriter writer = new StringWriter();
- Driver.writeLabelBindings(writer, model, ",");
- String expected1 = Resources.toString(Resources.getResource("expected-arff-dictionary.csv"), Charsets.UTF_8);
- String expected2 = Resources.toString(Resources.getResource("expected-arff-dictionary-2.csv"), Charsets.UTF_8);
- assertTrue(expected1.equals(writer.toString()) || expected2.equals(writer.toString()));
- }
-
-
- @Test
- public void dictionaryJSON() throws IOException {
- ARFFModel model = new MapBackedARFFModel();
- ARFFVectorIterableTest.getVectors("sample-dense.arff", model);
- StringWriter writer = new StringWriter();
- Driver.writeLabelBindingsJSON(writer, model);
- String expected1 = Resources.toString(Resources.getResource("expected-arff-schema.json"), Charsets.UTF_8);
- String expected2 = Resources.toString(Resources.getResource("expected-arff-schema-2.json"), Charsets.UTF_8);
- assertTrue(expected1.equals(writer.toString()) || expected2.equals(writer.toString()));
- }
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModelTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModelTest.java b/integration/src/test/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModelTest.java
deleted file mode 100644
index 2867640..0000000
--- a/integration/src/test/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModelTest.java
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.arff;
-
-import org.apache.mahout.common.MahoutTestCase;
-import org.junit.Test;
-
-import java.util.Map;
-
-public class MapBackedARFFModelTest extends MahoutTestCase {
-
- @Test
- public void processNominal() {
- String windy = "windy";
- String breezy = "breezy";
-
- ARFFModel model = new MapBackedARFFModel();
- model.addNominal(windy, breezy, 77);
- model.addNominal(windy, "strong", 23);
- model.addNominal(windy, "nuking", 55);
- Map<String, Map<String, Integer>> nominalMap = model.getNominalMap();
-
- assertEquals(1, nominalMap.size());
- Map<String, Integer> windyValues = nominalMap.get(windy);
- assertEquals(77, windyValues.get(breezy).intValue());
- }
-
- @Test
- public void processBadNumeric() {
- ARFFModel model = new MapBackedARFFModel();
- model.addLabel("b1shkt70694difsmmmdv0ikmoh", 77);
- model.addType(77, ARFFType.REAL);
- assertTrue(Double.isNaN(model.getValue("b1shkt70694difsmmmdv0ikmoh", 77)));
- }
-
- @Test
- public void processGoodNumeric() {
- ARFFModel model = new MapBackedARFFModel();
- model.addLabel("1234", 77);
- model.addType(77, ARFFType.INTEGER);
- assertTrue(1234 == model.getValue("1234", 77));
- model.addLabel("131.34", 78);
- model.addType(78, ARFFType.REAL);
- assertTrue(131.34 == model.getValue("131.34", 78));
- }
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIteratorTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIteratorTest.java b/integration/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIteratorTest.java
deleted file mode 100644
index e76cf70..0000000
--- a/integration/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIteratorTest.java
+++ /dev/null
@@ -1,57 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.csv;
-
-import java.io.IOException;
-import java.io.StringReader;
-import java.io.StringWriter;
-import java.util.Iterator;
-
-import org.apache.mahout.common.MahoutTestCase;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.utils.vectors.RandomVectorIterable;
-import org.apache.mahout.utils.vectors.VectorHelper;
-import org.apache.mahout.utils.vectors.io.TextualVectorWriter;
-import org.junit.Test;
-
-public class CSVVectorIteratorTest extends MahoutTestCase {
-
- @Test
- public void testCount() throws Exception {
-
- StringWriter sWriter = new StringWriter();
- try (TextualVectorWriter writer = new TextualVectorWriter(sWriter) {
- @Override
- public void write(Vector vector) throws IOException {
- String vecStr = VectorHelper.vectorToCSVString(vector, false);
- getWriter().write(vecStr);
- }
- }) {
- Iterable<Vector> iter = new RandomVectorIterable(50);
- writer.write(iter);
- }
-
- Iterator<Vector> csvIter = new CSVVectorIterator(new StringReader(sWriter.getBuffer().toString()));
- int count = 0;
- while (csvIter.hasNext()) {
- csvIter.next();
- count++;
- }
- assertEquals(50, count);
- }
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/utils/vectors/io/VectorWriterTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/utils/vectors/io/VectorWriterTest.java b/integration/src/test/java/org/apache/mahout/utils/vectors/io/VectorWriterTest.java
deleted file mode 100644
index e2f7032..0000000
--- a/integration/src/test/java/org/apache/mahout/utils/vectors/io/VectorWriterTest.java
+++ /dev/null
@@ -1,67 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.io;
-
-import java.io.StringWriter;
-import java.util.ArrayList;
-import java.util.Collection;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.SequenceFile;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.MahoutTestCase;
-import org.apache.mahout.math.DenseVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorWritable;
-import org.apache.mahout.utils.vectors.RandomVectorIterable;
-import org.junit.Test;
-
-public final class VectorWriterTest extends MahoutTestCase {
-
- @Test
- public void testSFVW() throws Exception {
- Path path = getTestTempFilePath("sfvw");
- Configuration conf = getConfiguration();
- FileSystem fs = FileSystem.get(conf);
- SequenceFile.Writer seqWriter = new SequenceFile.Writer(fs, conf, path, LongWritable.class, VectorWritable.class);
- try (SequenceFileVectorWriter writer = new SequenceFileVectorWriter(seqWriter)) {
- writer.write(new RandomVectorIterable(50));
- }
-
- long count = HadoopUtil.countRecords(path, conf);
- assertEquals(50, count);
- }
-
- @Test
- public void testTextOutputSize() throws Exception {
- StringWriter strWriter = new StringWriter();
- try (VectorWriter writer = new TextualVectorWriter(strWriter)) {
- Collection<Vector> vectors = new ArrayList<>();
- vectors.add(new DenseVector(new double[]{0.3, 1.5, 4.5}));
- vectors.add(new DenseVector(new double[]{1.3, 1.5, 3.5}));
- writer.write(vectors);
- }
- String buffer = strWriter.toString();
- assertNotNull(buffer);
- assertFalse(buffer.isEmpty());
-
- }
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java b/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java
deleted file mode 100644
index 890a14b..0000000
--- a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.lucene;
-
-
-import java.io.IOException;
-
-import com.google.common.io.Closeables;
-
-import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.FieldType;
-import org.apache.lucene.document.StringField;
-import org.apache.lucene.index.DirectoryReader;
-import org.apache.lucene.index.IndexOptions;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.store.RAMDirectory;
-import org.apache.mahout.common.MahoutTestCase;
-import org.junit.Before;
-import org.junit.Test;
-
-public class CachedTermInfoTest extends MahoutTestCase {
- private RAMDirectory directory;
- private static final String[] DOCS = {
- "a a b b c c",
- "a b a b a b a b",
- "a b a",
- "a",
- "b",
- "a",
- "a"
- };
-
- private static final String[] DOCS2 = {
- "d d d d",
- "e e e e",
- "d e d e",
- "d",
- "e",
- "d",
- "e"
- };
-
- @Before
- public void before() throws IOException {
- directory = new RAMDirectory();
-
- FieldType fieldType = new FieldType();
- fieldType.setStored(false);
- fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
- fieldType.setTokenized(true);
- fieldType.setStoreTermVectors(false);
- fieldType.setStoreTermVectorPositions(false);
- fieldType.setStoreTermVectorOffsets(false);
- fieldType.freeze();
-
- directory = createTestIndex(fieldType, directory, 0);
- }
-
- @Test
- public void test() throws Exception {
- IndexReader reader = DirectoryReader.open(directory);
- CachedTermInfo cti = new CachedTermInfo(reader, "content", 0, 100);
- assertEquals(3, cti.totalTerms("content"));
- assertNotNull(cti.getTermEntry("content", "a"));
- assertNull(cti.getTermEntry("content", "e"));
- //minDf
- cti = new CachedTermInfo(reader, "content", 3, 100);
- assertEquals(2, cti.totalTerms("content"));
- assertNotNull(cti.getTermEntry("content", "a"));
- assertNull(cti.getTermEntry("content", "c"));
- //maxDFPercent, a is in 6 of 7 docs: numDocs * maxDfPercent / 100 < 6 to exclude, 85% should suffice to exclude a
- cti = new CachedTermInfo(reader, "content", 0, 85);
- assertEquals(2, cti.totalTerms("content"));
- assertNotNull(cti.getTermEntry("content", "b"));
- assertNotNull(cti.getTermEntry("content", "c"));
- assertNull(cti.getTermEntry("content", "a"));
-
-
- }
-
- static RAMDirectory createTestIndex(FieldType fieldType,
- RAMDirectory directory,
- int startingId) throws IOException {
- IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(new WhitespaceAnalyzer()));
-
- try {
- for (int i = 0; i < DOCS.length; i++) {
- Document doc = new Document();
- Field id = new StringField("id", "doc_" + (i + startingId), Field.Store.YES);
- doc.add(id);
- Field text = new Field("content", DOCS[i], fieldType);
- doc.add(text);
- Field text2 = new Field("content2", DOCS2[i], fieldType);
- doc.add(text2);
- writer.addDocument(doc);
- }
- } finally {
- Closeables.close(writer, false);
- }
- return directory;
- }
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/DriverTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/DriverTest.java b/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/DriverTest.java
deleted file mode 100644
index 86c8305..0000000
--- a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/DriverTest.java
+++ /dev/null
@@ -1,136 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.lucene;
-
-import com.google.common.collect.Sets;
-import com.google.common.io.Closeables;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.io.Text;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.FieldType;
-import org.apache.lucene.index.IndexOptions;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.SimpleFSDirectory;
-import org.apache.mahout.common.MahoutTestCase;
-import org.junit.Before;
-import org.junit.Test;
-
-import java.io.File;
-import java.io.IOException;
-import java.nio.file.Paths;
-import java.util.Set;
-
-public class DriverTest extends MahoutTestCase {
-
- private File indexDir;
- private File outputDir;
- private Configuration conf;
-
- @Before
- @Override
- public void setUp() throws Exception {
- super.setUp();
- indexDir = getTestTempDir("intermediate");
- indexDir.delete();
- outputDir = getTestTempDir("output");
- outputDir.delete();
-
- conf = getConfiguration();
- }
-
- private Document asDocument(String line) {
- Document doc = new Document();
- doc.add(new TextFieldWithTermVectors("text", line));
- return doc;
- }
-
- static class TextFieldWithTermVectors extends Field {
-
- public static final FieldType TYPE = new FieldType();
-
- static {
- TYPE.setOmitNorms(true);
- TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
- TYPE.setStored(true);
- TYPE.setTokenized(true);
- TYPE.setStoreTermVectors(true);
- TYPE.freeze();
- }
-
- public TextFieldWithTermVectors(String name, String value) {
- super(name, value, TYPE);
- }
- }
-
- @Test
- public void sequenceFileDictionary() throws IOException {
-
- Directory index = new SimpleFSDirectory(Paths.get(indexDir.getAbsolutePath()));
- Analyzer analyzer = new StandardAnalyzer();
- IndexWriterConfig config = new IndexWriterConfig(analyzer);
- config.setCommitOnClose(true);
- final IndexWriter writer = new IndexWriter(index, config);
-
- try {
- writer.addDocument(asDocument("One Ring to rule them all"));
- writer.addDocument(asDocument("One Ring to find them,"));
- writer.addDocument(asDocument("One Ring to bring them all"));
- writer.addDocument(asDocument("and in the darkness bind them"));
- } finally {
- writer.close();
- }
-
- File seqDict = new File(outputDir, "dict.seq");
-
- Driver.main(new String[] {
- "--dir", indexDir.getAbsolutePath(),
- "--output", new File(outputDir, "out").getAbsolutePath(),
- "--field", "text",
- "--dictOut", new File(outputDir, "dict.txt").getAbsolutePath(),
- "--seqDictOut", seqDict.getAbsolutePath(),
- });
-
- SequenceFile.Reader reader = null;
- Set<String> indexTerms = Sets.newHashSet();
- try {
- reader = new SequenceFile.Reader(FileSystem.getLocal(conf), new Path(seqDict.getAbsolutePath()), conf);
- Text term = new Text();
- IntWritable termIndex = new IntWritable();
-
- while (reader.next(term, termIndex)) {
- indexTerms.add(term.toString());
- }
- } finally {
- Closeables.close(reader, true);
- }
-
- Set<String> expectedIndexTerms = Sets.newHashSet("all", "bind", "bring", "darkness", "find", "one", "ring", "rule");
-
- // should contain the same terms as expected
- assertEquals(expectedIndexTerms.size(), Sets.union(expectedIndexTerms, indexTerms).size());
- }
-}