You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ignite.apache.org by za...@apache.org on 2019/08/16 13:22:04 UTC
[ignite] branch master updated: IGNITE-10697: [ML] Add Frequency
Encoding (#6784)
This is an automated email from the ASF dual-hosted git repository.
zaleslaw pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ignite.git
The following commit(s) were added to refs/heads/master by this push:
new fb2e1e2 IGNITE-10697: [ML] Add Frequency Encoding (#6784)
fb2e1e2 is described below
commit fb2e1e28818d3d96a7a9fcc86043d28f28e76e47
Author: Alexey Zinoviev <za...@gmail.com>
AuthorDate: Fri Aug 16 16:21:55 2019 +0300
IGNITE-10697: [ML] Add Frequency Encoding (#6784)
---
.../ml/preprocessing/encoding/EncoderTrainer.java | 70 ++++++++++++-----
.../ml/preprocessing/encoding/EncoderType.java | 5 +-
.../frequency/FrequencyEncoderPreprocessor.java | 89 ++++++++++++++++++++++
.../package-info.java} | 15 +---
.../ml/preprocessing/PreprocessingTestSuite.java | 4 +-
.../preprocessing/encoding/EncoderTrainerTest.java | 31 ++++++++
.../encoding/FrequencyEncoderPreprocessorTest.java | 82 ++++++++++++++++++++
7 files changed, 264 insertions(+), 32 deletions(-)
diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/EncoderTrainer.java b/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/EncoderTrainer.java
index 2e6442d..5703ea0 100644
--- a/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/EncoderTrainer.java
+++ b/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/EncoderTrainer.java
@@ -32,6 +32,7 @@ import org.apache.ignite.ml.dataset.primitive.context.EmptyContext;
import org.apache.ignite.ml.environment.LearningEnvironmentBuilder;
import org.apache.ignite.ml.preprocessing.PreprocessingTrainer;
import org.apache.ignite.ml.preprocessing.Preprocessor;
+import org.apache.ignite.ml.preprocessing.encoding.frequency.FrequencyEncoderPreprocessor;
import org.apache.ignite.ml.preprocessing.encoding.onehotencoder.OneHotEncoderPreprocessor;
import org.apache.ignite.ml.preprocessing.encoding.stringencoder.StringEncoderPreprocessor;
import org.apache.ignite.ml.structures.LabeledVector;
@@ -71,19 +72,19 @@ public class EncoderTrainer<K, V> implements PreprocessingTrainer<K, V> {
while (upstream.hasNext()) {
UpstreamEntry<K, V> entity = upstream.next();
LabeledVector<Double> row = basePreprocessor.apply(entity.getKey(), entity.getValue());
- categoryFrequencies = calculateFrequencies(row, categoryFrequencies);
+ categoryFrequencies = updateFrequenciesForNextRow(row, categoryFrequencies);
}
return new EncoderPartitionData()
.withCategoryFrequencies(categoryFrequencies);
}
)) {
- Map<String, Integer>[] encodingValues = calculateEncodingValuesByFrequencies(dataset);
-
switch (encoderType) {
case ONE_HOT_ENCODER:
- return new OneHotEncoderPreprocessor<>(encodingValues, basePreprocessor, handledIndices);
+ return new OneHotEncoderPreprocessor<>(calculateEncodingValuesByFrequencies(dataset), basePreprocessor, handledIndices);
case STRING_ENCODER:
- return new StringEncoderPreprocessor<>(encodingValues, basePreprocessor, handledIndices);
+ return new StringEncoderPreprocessor<>(calculateEncodingValuesByFrequencies(dataset), basePreprocessor, handledIndices);
+ case FREQUENCY_ENCODER:
+ return new FrequencyEncoderPreprocessor<>(calculateEncodingFrequencies(dataset), basePreprocessor, handledIndices);
default:
throw new IllegalStateException("Define the type of the resulting prerocessor.");
}
@@ -94,14 +95,38 @@ public class EncoderTrainer<K, V> implements PreprocessingTrainer<K, V> {
}
/**
- * Calculates the encoding values values by frequencies keeping in the given dataset.
+ * Calculates encoding frequencies: each category count divided by the number of rows in the dataset.
*
- * @param dataset The dataset of frequencies for each feature aggregated in each partition.
- * @return Encoding values for each feature.
+ * NOTE: The amount of rows is calculated as sum of absolute frequencies.
+ *
+ * @param dataset Dataset.
+ * @return Encoding frequency for each feature.
*/
- private Map<String, Integer>[] calculateEncodingValuesByFrequencies(
- Dataset<EmptyContext, EncoderPartitionData> dataset) {
- Map<String, Integer>[] frequencies = dataset.compute(
+    private Map<String, Double>[] calculateEncodingFrequencies(Dataset<EmptyContext, EncoderPartitionData> dataset) {
+        Map<String, Integer>[] frequencies = calculateFrequencies(dataset);
+
+        Map<String, Double>[] res = new Map[frequencies.length];
+
+        for (int i = 0; i < frequencies.length; i++) {
+            // Same guard as in calculateEncodingValuesByFrequencies: category counts are collected
+            // for handled indices only, so the other slots are skipped and stay null in the result.
+            if (!handledIndices.contains(i))
+                continue;
+
+            // The row count for a feature is the sum of the absolute counts of all its categories.
+            int total = frequencies[i].values().stream().reduce(0, Integer::sum);
+
+            Map<String, Double> featureFrequencies = new HashMap<>();
+            frequencies[i].forEach((category, cnt) -> featureFrequencies.put(category, (double)cnt / total));
+
+            res[i] = featureFrequencies;
+        }
+
+        return res;
+    }
+
+ /**
+ * Calculates frequencies for each feature.
+ *
+ * @param dataset Dataset.
+ * @return Frequency for each feature.
+ */
+ private Map<String, Integer>[] calculateFrequencies(Dataset<EmptyContext, EncoderPartitionData> dataset) {
+ return dataset.compute(
EncoderPartitionData::categoryFrequencies,
(a, b) -> {
if (a == null)
@@ -121,8 +146,19 @@ public class EncoderTrainer<K, V> implements PreprocessingTrainer<K, V> {
return b;
}
);
+ }
+
+ /**
+ * Calculates the encoding values from the frequencies kept in the given dataset.
+ *
+ * @param dataset The dataset of frequencies for each feature aggregated in each partition.
+ * @return Encoding values for each feature.
+ */
+ private Map<String, Integer>[] calculateEncodingValuesByFrequencies(
+ Dataset<EmptyContext, EncoderPartitionData> dataset) {
+ Map<String, Integer>[] frequencies = calculateFrequencies(dataset);
- Map<String, Integer>[] res = new HashMap[frequencies.length];
+ Map<String, Integer>[] res = new Map[frequencies.length];
for (int i = 0; i < frequencies.length; i++)
if (handledIndices.contains(i))
@@ -140,10 +176,7 @@ public class EncoderTrainer<K, V> implements PreprocessingTrainer<K, V> {
private Map<String, Integer> transformFrequenciesToEncodingValues(Map<String, Integer> frequencies) {
Comparator<Map.Entry<String, Integer>> comp;
- if (encoderSortingStgy.equals(EncoderSortingStrategy.FREQUENCY_DESC))
- comp = Map.Entry.comparingByValue();
- else
- comp = Collections.reverseOrder(Map.Entry.comparingByValue());
+ comp = encoderSortingStgy == EncoderSortingStrategy.FREQUENCY_DESC ? Map.Entry.comparingByValue() : Collections.reverseOrder(Map.Entry.comparingByValue());
final HashMap<String, Integer> resMap = frequencies.entrySet()
.stream()
@@ -166,7 +199,8 @@ public class EncoderTrainer<K, V> implements PreprocessingTrainer<K, V> {
* @param categoryFrequencies Holds the frequencies of categories by values and features.
* @return Updated frequencies by values and features.
*/
- private Map<String, Integer>[] calculateFrequencies(LabeledVector row, Map<String, Integer>[] categoryFrequencies) {
+ private Map<String, Integer>[] updateFrequenciesForNextRow(LabeledVector row,
+ Map<String, Integer>[] categoryFrequencies) {
if (categoryFrequencies == null)
categoryFrequencies = initializeCategoryFrequencies(row);
else
@@ -206,7 +240,7 @@ public class EncoderTrainer<K, V> implements PreprocessingTrainer<K, V> {
* @return The array contains not null values for handled indices.
*/
@NotNull private Map<String, Integer>[] initializeCategoryFrequencies(LabeledVector row) {
- Map<String, Integer>[] categoryFrequencies = new HashMap[row.size()];
+ Map<String, Integer>[] categoryFrequencies = new Map[row.size()];
for (int i = 0; i < categoryFrequencies.length; i++)
if (handledIndices.contains(i))
diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/EncoderType.java b/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/EncoderType.java
index 79e216c..2a35958 100644
--- a/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/EncoderType.java
+++ b/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/EncoderType.java
@@ -27,5 +27,8 @@ public enum EncoderType {
ONE_HOT_ENCODER,
/** String encoder. */
- STRING_ENCODER
+ STRING_ENCODER,
+
+ /** Frequency encoder. */
+ FREQUENCY_ENCODER
}
diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/frequency/FrequencyEncoderPreprocessor.java b/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/frequency/FrequencyEncoderPreprocessor.java
new file mode 100644
index 0000000..533581e
--- /dev/null
+++ b/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/frequency/FrequencyEncoderPreprocessor.java
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.ignite.ml.preprocessing.encoding.frequency;
+
+import java.util.Map;
+import java.util.Set;
+import org.apache.ignite.ml.math.exceptions.preprocessing.UnknownCategorialFeatureValue;
+import org.apache.ignite.ml.math.primitives.vector.VectorUtils;
+import org.apache.ignite.ml.preprocessing.Preprocessor;
+import org.apache.ignite.ml.preprocessing.encoding.EncoderPreprocessor;
+import org.apache.ignite.ml.structures.LabeledVector;
+
+/**
+ * Preprocessing function that performs frequency encoding.
+ *
+ * The Frequency Encoder Preprocessor replaces each category of a handled feature with the frequency
+ * supplied for that category. When the preprocessor is produced by {@code EncoderTrainer}, that frequency
+ * is the fraction of training rows holding the category.
+ * <p>
+ * This preprocessor can transform multiple columns whose indices are handled during the training process.
+ * These indices can be defined via the {@code .withEncodedFeature(featureIndex)} call.
+ * </p>
+ * <p>
+ * NOTE: it doesn't add new columns; the encoded value is written at the same feature index
+ * of the resulting vector.
+ * </p>
+ *
+ * @param <K> Type of a key in {@code upstream} data.
+ * @param <V> Type of a value in {@code upstream} data.
+ */
+public class FrequencyEncoderPreprocessor<K, V> extends EncoderPreprocessor<K, V> {
+    /** */
+    protected static final long serialVersionUID = 6237711236382623488L;
+
+    /** Category-to-frequency mapping for each feature index; consulted for handled indices only. */
+    protected final Map<String, Double>[] encodingFrequencies;
+
+    /**
+     * Constructs a new instance of Frequency Encoder preprocessor.
+     *
+     * @param encodingFrequencies Category-to-frequency mapping for each feature index.
+     * @param basePreprocessor Base preprocessor.
+     * @param handledIndices Handled indices.
+     */
+    public FrequencyEncoderPreprocessor(Map<String, Double>[] encodingFrequencies,
+        Preprocessor<K, V> basePreprocessor, Set<Integer> handledIndices) {
+        // This encoder keeps its own double-valued mapping, so no encoding values are passed to the superclass.
+        super(null, basePreprocessor, handledIndices);
+        this.encodingFrequencies = encodingFrequencies;
+    }
+
+    /**
+     * Applies this preprocessor.
+     *
+     * @param k Key.
+     * @param v Value.
+     * @return Preprocessed row.
+     * @throws UnknownCategorialFeatureValue If a handled feature holds a value that has no known frequency.
+     */
+    @Override public LabeledVector apply(K k, V v) {
+        LabeledVector tmp = basePreprocessor.apply(k, v);
+        double[] res = new double[tmp.size()];
+
+        for (int i = 0; i < res.length; i++) {
+            Object tmpObj = tmp.getRaw(i);
+
+            if (handledIndices.contains(i)) {
+                Map<String, Double> frequencies = encodingFrequencies[i];
+
+                // A NaN is looked up under the dedicated null-value key. The lookup must use the
+                // frequencies of this class: the superclass encoding values are null for this encoder.
+                if (tmpObj.equals(Double.NaN) && frequencies.containsKey(KEY_FOR_NULL_VALUES))
+                    res[i] = frequencies.get(KEY_FOR_NULL_VALUES);
+                else if (frequencies.containsKey(tmpObj))
+                    res[i] = frequencies.get(tmpObj);
+                else
+                    throw new UnknownCategorialFeatureValue(tmpObj.toString());
+            }
+            else
+                res[i] = (double)tmpObj;
+        }
+
+        return new LabeledVector(VectorUtils.of(res), tmp.label());
+    }
+}
diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/EncoderType.java b/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/frequency/package-info.java
similarity index 73%
copy from modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/EncoderType.java
copy to modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/frequency/package-info.java
index 79e216c..2168750 100644
--- a/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/EncoderType.java
+++ b/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/frequency/package-info.java
@@ -15,17 +15,8 @@
* limitations under the License.
*/
-package org.apache.ignite.ml.preprocessing.encoding;
-
/**
- * Describes Encoder preprocessor types to define resulting model in EncoderTrainer.
- *
- * @see EncoderTrainer
+ * <!-- Package description. -->
+ * Contains frequency encoding preprocessor.
*/
-public enum EncoderType {
- /** One hot encoder. */
- ONE_HOT_ENCODER,
-
- /** String encoder. */
- STRING_ENCODER
-}
+package org.apache.ignite.ml.preprocessing.encoding.frequency;
diff --git a/modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/PreprocessingTestSuite.java b/modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/PreprocessingTestSuite.java
index 7b3d5fc..1822704 100644
--- a/modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/PreprocessingTestSuite.java
+++ b/modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/PreprocessingTestSuite.java
@@ -19,9 +19,10 @@ package org.apache.ignite.ml.preprocessing;
import org.apache.ignite.ml.preprocessing.binarization.BinarizationPreprocessorTest;
import org.apache.ignite.ml.preprocessing.binarization.BinarizationTrainerTest;
+import org.apache.ignite.ml.preprocessing.encoding.EncoderTrainerTest;
+import org.apache.ignite.ml.preprocessing.encoding.FrequencyEncoderPreprocessorTest;
import org.apache.ignite.ml.preprocessing.encoding.OneHotEncoderPreprocessorTest;
import org.apache.ignite.ml.preprocessing.encoding.StringEncoderPreprocessorTest;
-import org.apache.ignite.ml.preprocessing.encoding.EncoderTrainerTest;
import org.apache.ignite.ml.preprocessing.imputing.ImputerPreprocessorTest;
import org.apache.ignite.ml.preprocessing.imputing.ImputerTrainerTest;
import org.apache.ignite.ml.preprocessing.minmaxscaling.MinMaxScalerPreprocessorTest;
@@ -44,6 +45,7 @@ import org.junit.runners.Suite;
ImputerTrainerTest.class,
EncoderTrainerTest.class,
OneHotEncoderPreprocessorTest.class,
+ FrequencyEncoderPreprocessorTest.class,
StringEncoderPreprocessorTest.class,
NormalizationTrainerTest.class,
NormalizationPreprocessorTest.class
diff --git a/modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/encoding/EncoderTrainerTest.java b/modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/encoding/EncoderTrainerTest.java
index 6fb760e..bc75647 100644
--- a/modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/encoding/EncoderTrainerTest.java
+++ b/modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/encoding/EncoderTrainerTest.java
@@ -161,4 +161,35 @@ public class EncoderTrainerTest extends TrainerTest {
assertArrayEquals(new double[] {2.0, 0.0}, preprocessor.apply(7, new DenseVector(new Serializable[]{"Monday", "September"})).features().asArray(), 1e-8);
}
+
+    /** Tests {@code fit()} method with the frequency encoder applied to two string features. */
+    @Test
+    public void testFitOnStringCategorialFeaturesWithFrequencyEncoding() {
+        // Feature 0: Monday x3, Friday x2, Sunday x1. Feature 1: August x3, June x2, September x1.
+        Map<Integer, Vector> data = new HashMap<>();
+        data.put(1, new DenseVector(new Serializable[] {"Monday", "September"}));
+        data.put(2, new DenseVector(new Serializable[] {"Monday", "August"}));
+        data.put(3, new DenseVector(new Serializable[] {"Monday", "August"}));
+        data.put(4, new DenseVector(new Serializable[] {"Friday", "June"}));
+        data.put(5, new DenseVector(new Serializable[] {"Friday", "June"}));
+        data.put(6, new DenseVector(new Serializable[] {"Sunday", "August"}));
+
+        final Vectorizer<Integer, Vector, Integer, Double> vectorizer = new DummyVectorizer<>(0, 1);
+
+        DatasetBuilder<Integer, Vector> datasetBuilder = new LocalDatasetBuilder<>(data, parts);
+
+        EncoderTrainer<Integer, Vector> freqEncoderTrainer = new EncoderTrainer<Integer, Vector>()
+            .withEncoderType(EncoderType.FREQUENCY_ENCODER)
+            .withEncodedFeature(0)
+            .withEncodedFeature(1);
+
+        EncoderPreprocessor<Integer, Vector> preprocessor = freqEncoderTrainer.fit(
+            TestUtils.testEnvBuilder(),
+            datasetBuilder,
+            vectorizer
+        );
+
+        // Expected values are exact count / 6 fractions, so the tight tolerance of the other tests applies.
+        assertArrayEquals(new double[] {3.0 / 6, 1.0 / 6}, preprocessor.apply(7, new DenseVector(new Serializable[] {"Monday", "September"})).features().asArray(), 1e-8);
+        assertArrayEquals(new double[] {2.0 / 6, 3.0 / 6}, preprocessor.apply(7, new DenseVector(new Serializable[] {"Friday", "August"})).features().asArray(), 1e-8);
+        assertArrayEquals(new double[] {1.0 / 6, 2.0 / 6}, preprocessor.apply(7, new DenseVector(new Serializable[] {"Sunday", "June"})).features().asArray(), 1e-8);
+    }
}
diff --git a/modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/encoding/FrequencyEncoderPreprocessorTest.java b/modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/encoding/FrequencyEncoderPreprocessorTest.java
new file mode 100644
index 0000000..4d9d6d1
--- /dev/null
+++ b/modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/encoding/FrequencyEncoderPreprocessorTest.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.ignite.ml.preprocessing.encoding;
+
+import java.io.Serializable;
+import java.util.HashMap;
+import java.util.HashSet;
+import org.apache.ignite.ml.dataset.feature.extractor.Vectorizer;
+import org.apache.ignite.ml.dataset.feature.extractor.impl.DummyVectorizer;
+import org.apache.ignite.ml.math.primitives.vector.Vector;
+import org.apache.ignite.ml.math.primitives.vector.impl.DenseVector;
+import org.apache.ignite.ml.preprocessing.encoding.frequency.FrequencyEncoderPreprocessor;
+import org.junit.Test;
+
+import static org.junit.Assert.assertArrayEquals;
+
+/**
+ * Tests for {@link FrequencyEncoderPreprocessor}.
+ */
+public class FrequencyEncoderPreprocessorTest {
+    /** Checks that {@code apply()} replaces every handled category with the frequency configured for it. */
+    @Test
+    public void testApply() {
+        // Frequencies for feature 0.
+        HashMap<String, Double> idFreqs = new HashMap<>();
+        idFreqs.put("1", 0.33);
+        idFreqs.put("2", 0.66);
+
+        // Frequencies for feature 1.
+        HashMap<String, Double> cityFreqs = new HashMap<>();
+        cityFreqs.put("Moscow", 1.0);
+
+        // Frequencies for feature 2.
+        HashMap<String, Double> letterFreqs = new HashMap<>();
+        letterFreqs.put("A", 0.33);
+        letterFreqs.put("B", 0.66);
+
+        // All three features are encoded.
+        HashSet<Integer> handledIndices = new HashSet<>();
+        for (int idx = 0; idx < 3; idx++)
+            handledIndices.add(idx);
+
+        Vectorizer<Integer, Vector, Integer, Double> vectorizer = new DummyVectorizer<>(0, 1, 2);
+
+        FrequencyEncoderPreprocessor<Integer, Vector> preprocessor = new FrequencyEncoderPreprocessor<Integer, Vector>(
+            new HashMap[] {idFreqs, cityFreqs, letterFreqs},
+            vectorizer,
+            handledIndices);
+
+        Vector[] rows = new Vector[] {
+            new DenseVector(new Serializable[] {"1", "Moscow", "A"}),
+            new DenseVector(new Serializable[] {"2", "Moscow", "B"}),
+            new DenseVector(new Serializable[] {"2", "Moscow", "B"}),
+        };
+
+        double[][] expected = new double[][] {
+            {0.33, 1.0, 0.33},
+            {0.66, 1.0, 0.66},
+            {0.66, 1.0, 0.66},
+        };
+
+        for (int rowIdx = 0; rowIdx < rows.length; rowIdx++) {
+            double[] actual = preprocessor.apply(rowIdx, rows[rowIdx]).features().asArray();
+
+            assertArrayEquals(expected[rowIdx], actual, 0.1);
+        }
+    }
+}