You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ignite.apache.org by za...@apache.org on 2019/08/16 13:22:04 UTC

[ignite] branch master updated: IGNITE-10697: [ML] Add Frequency Encoding (#6784)

This is an automated email from the ASF dual-hosted git repository.

zaleslaw pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ignite.git


The following commit(s) were added to refs/heads/master by this push:
     new fb2e1e2  IGNITE-10697: [ML] Add Frequency Encoding (#6784)
fb2e1e2 is described below

commit fb2e1e28818d3d96a7a9fcc86043d28f28e76e47
Author: Alexey Zinoviev <za...@gmail.com>
AuthorDate: Fri Aug 16 16:21:55 2019 +0300

    IGNITE-10697: [ML] Add Frequency Encoding (#6784)
---
 .../ml/preprocessing/encoding/EncoderTrainer.java  | 70 ++++++++++++-----
 .../ml/preprocessing/encoding/EncoderType.java     |  5 +-
 .../frequency/FrequencyEncoderPreprocessor.java    | 89 ++++++++++++++++++++++
 .../package-info.java}                             | 15 +---
 .../ml/preprocessing/PreprocessingTestSuite.java   |  4 +-
 .../preprocessing/encoding/EncoderTrainerTest.java | 31 ++++++++
 .../encoding/FrequencyEncoderPreprocessorTest.java | 82 ++++++++++++++++++++
 7 files changed, 264 insertions(+), 32 deletions(-)

diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/EncoderTrainer.java b/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/EncoderTrainer.java
index 2e6442d..5703ea0 100644
--- a/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/EncoderTrainer.java
+++ b/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/EncoderTrainer.java
@@ -32,6 +32,7 @@ import org.apache.ignite.ml.dataset.primitive.context.EmptyContext;
 import org.apache.ignite.ml.environment.LearningEnvironmentBuilder;
 import org.apache.ignite.ml.preprocessing.PreprocessingTrainer;
 import org.apache.ignite.ml.preprocessing.Preprocessor;
+import org.apache.ignite.ml.preprocessing.encoding.frequency.FrequencyEncoderPreprocessor;
 import org.apache.ignite.ml.preprocessing.encoding.onehotencoder.OneHotEncoderPreprocessor;
 import org.apache.ignite.ml.preprocessing.encoding.stringencoder.StringEncoderPreprocessor;
 import org.apache.ignite.ml.structures.LabeledVector;
@@ -71,19 +72,19 @@ public class EncoderTrainer<K, V> implements PreprocessingTrainer<K, V> {
                 while (upstream.hasNext()) {
                     UpstreamEntry<K, V> entity = upstream.next();
                     LabeledVector<Double> row = basePreprocessor.apply(entity.getKey(), entity.getValue());
-                    categoryFrequencies = calculateFrequencies(row, categoryFrequencies);
+                    categoryFrequencies = updateFrequenciesForNextRow(row, categoryFrequencies);
                 }
                 return new EncoderPartitionData()
                     .withCategoryFrequencies(categoryFrequencies);
             }
         )) {
-            Map<String, Integer>[] encodingValues = calculateEncodingValuesByFrequencies(dataset);
-
             switch (encoderType) {
                 case ONE_HOT_ENCODER:
-                    return new OneHotEncoderPreprocessor<>(encodingValues, basePreprocessor, handledIndices);
+                    return new OneHotEncoderPreprocessor<>(calculateEncodingValuesByFrequencies(dataset), basePreprocessor, handledIndices);
                 case STRING_ENCODER:
-                    return new StringEncoderPreprocessor<>(encodingValues, basePreprocessor, handledIndices);
+                    return new StringEncoderPreprocessor<>(calculateEncodingValuesByFrequencies(dataset), basePreprocessor, handledIndices);
+                case FREQUENCY_ENCODER:
+                    return new FrequencyEncoderPreprocessor<>(calculateEncodingFrequencies(dataset), basePreprocessor, handledIndices);
                 default:
                     throw new IllegalStateException("Define the type of the resulting prerocessor.");
             }
@@ -94,14 +95,38 @@ public class EncoderTrainer<K, V> implements PreprocessingTrainer<K, V> {
     }
 
     /**
-     * Calculates the encoding values values by frequencies keeping in the given dataset.
+     * Calculates encoding frequencies as each category's count divided by the number of rows in the dataset.
      *
-     * @param dataset The dataset of frequencies for each feature aggregated in each partition.
-     * @return Encoding values for each feature.
+     * NOTE: The amount of rows is calculated as sum of absolute frequencies.
+     *
+     * @param dataset Dataset.
+     * @return Encoding frequency for each feature.
      */
-    private Map<String, Integer>[] calculateEncodingValuesByFrequencies(
-        Dataset<EmptyContext, EncoderPartitionData> dataset) {
-        Map<String, Integer>[] frequencies = dataset.compute(
+    private Map<String, Double>[] calculateEncodingFrequencies(Dataset<EmptyContext, EncoderPartitionData> dataset) {
+        Map<String, Integer>[] frequencies = calculateFrequencies(dataset);
+        Map<String, Double>[] res = new Map[frequencies.length];
+
+        for (int i = 0; i < frequencies.length; i++) {
+            // Frequency maps are only initialized for handled indices, so the other slots must be skipped.
+            if (!handledIndices.contains(i))
+                continue;
+            int total = frequencies[i].values().stream().reduce(0, Integer::sum);
+            Map<String, Double> featureRes = new HashMap<>();
+            frequencies[i].forEach((cat, cnt) -> featureRes.put(cat, (double)cnt / total));
+            res[i] = featureRes;
+        }
+
+        return res;
+    }
+
+    /**
+     * Calculates frequencies for each feature.
+     *
+     * @param dataset Dataset.
+     * @return Frequency for each feature.
+     */
+    private Map<String, Integer>[] calculateFrequencies(Dataset<EmptyContext, EncoderPartitionData> dataset) {
+        return dataset.compute(
             EncoderPartitionData::categoryFrequencies,
             (a, b) -> {
                 if (a == null)
@@ -121,8 +146,19 @@ public class EncoderTrainer<K, V> implements PreprocessingTrainer<K, V> {
                 return b;
             }
         );
+    }
+
+    /**
+     * Calculates the encoding values by frequencies kept in the given dataset.
+     *
+     * @param dataset The dataset of frequencies for each feature aggregated in each partition.
+     * @return Encoding values for each feature.
+     */
+    private Map<String, Integer>[] calculateEncodingValuesByFrequencies(
+        Dataset<EmptyContext, EncoderPartitionData> dataset) {
+        Map<String, Integer>[] frequencies = calculateFrequencies(dataset);
 
-        Map<String, Integer>[] res = new HashMap[frequencies.length];
+        Map<String, Integer>[] res = new Map[frequencies.length];
 
         for (int i = 0; i < frequencies.length; i++)
             if (handledIndices.contains(i))
@@ -140,10 +176,7 @@ public class EncoderTrainer<K, V> implements PreprocessingTrainer<K, V> {
     private Map<String, Integer> transformFrequenciesToEncodingValues(Map<String, Integer> frequencies) {
         Comparator<Map.Entry<String, Integer>> comp;
 
-        if (encoderSortingStgy.equals(EncoderSortingStrategy.FREQUENCY_DESC))
-            comp = Map.Entry.comparingByValue();
-        else
-            comp = Collections.reverseOrder(Map.Entry.comparingByValue());
+        comp = encoderSortingStgy == EncoderSortingStrategy.FREQUENCY_DESC ? Map.Entry.comparingByValue() : Collections.reverseOrder(Map.Entry.comparingByValue());
 
         final HashMap<String, Integer> resMap = frequencies.entrySet()
             .stream()
@@ -166,7 +199,8 @@ public class EncoderTrainer<K, V> implements PreprocessingTrainer<K, V> {
      * @param categoryFrequencies Holds the frequencies of categories by values and features.
      * @return Updated frequencies by values and features.
      */
-    private Map<String, Integer>[] calculateFrequencies(LabeledVector row, Map<String, Integer>[] categoryFrequencies) {
+    private Map<String, Integer>[] updateFrequenciesForNextRow(LabeledVector row,
+        Map<String, Integer>[] categoryFrequencies) {
         if (categoryFrequencies == null)
             categoryFrequencies = initializeCategoryFrequencies(row);
         else
@@ -206,7 +240,7 @@ public class EncoderTrainer<K, V> implements PreprocessingTrainer<K, V> {
      * @return The array contains not null values for handled indices.
      */
     @NotNull private Map<String, Integer>[] initializeCategoryFrequencies(LabeledVector row) {
-        Map<String, Integer>[] categoryFrequencies = new HashMap[row.size()];
+        Map<String, Integer>[] categoryFrequencies = new Map[row.size()];
 
         for (int i = 0; i < categoryFrequencies.length; i++)
             if (handledIndices.contains(i))
diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/EncoderType.java b/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/EncoderType.java
index 79e216c..2a35958 100644
--- a/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/EncoderType.java
+++ b/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/EncoderType.java
@@ -27,5 +27,8 @@ public enum EncoderType {
     ONE_HOT_ENCODER,
 
     /** String encoder. */
-    STRING_ENCODER
+    STRING_ENCODER,
+
+    /** Frequency encoder. */
+    FREQUENCY_ENCODER
 }
diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/frequency/FrequencyEncoderPreprocessor.java b/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/frequency/FrequencyEncoderPreprocessor.java
new file mode 100644
index 0000000..533581e
--- /dev/null
+++ b/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/frequency/FrequencyEncoderPreprocessor.java
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.ignite.ml.preprocessing.encoding.frequency;
+
+import java.util.Map;
+import java.util.Set;
+import org.apache.ignite.ml.math.exceptions.preprocessing.UnknownCategorialFeatureValue;
+import org.apache.ignite.ml.math.primitives.vector.VectorUtils;
+import org.apache.ignite.ml.preprocessing.Preprocessor;
+import org.apache.ignite.ml.preprocessing.encoding.EncoderPreprocessor;
+import org.apache.ignite.ml.structures.LabeledVector;
+
+/**
+ * Preprocessing function that performs frequency encoding.
+ *
+ * The Frequency Encoder Preprocessor encodes string values (categories) to double values
+ * in range [0.0, 1.0], where each value is expected to be the fraction of rows holding that category.
+ * <p>
+ * This preprocessor can transform multiple columns whose indices are handled during the training process.
+ * These indices can be defined via the {@code .withEncodedFeature(featureIndex)} call.
+ * </p>
+ * <p>NOTE: it doesn't add new columns; each encoded value keeps the position of the original feature.</p>
+ *
+ * @param <K> Type of a key in {@code upstream} data.
+ * @param <V> Type of a value in {@code upstream} data.
+ */
+public class FrequencyEncoderPreprocessor<K, V> extends EncoderPreprocessor<K, V> {
+    /** */
+    protected static final long serialVersionUID = 6237711236382623488L;
+
+    /** Encoding frequencies: one category-to-frequency map per feature index. */
+    protected final Map<String, Double>[] encodingFrequencies;
+
+    /**
+     * Constructs a new instance of Frequency Encoder preprocessor.
+     *
+     * @param encodingFrequencies Category-to-frequency maps, indexed by feature index.
+     * @param basePreprocessor Base preprocessor.
+     * @param handledIndices Handled indices.
+     */
+    public FrequencyEncoderPreprocessor(Map<String, Double>[] encodingFrequencies,
+        Preprocessor<K, V> basePreprocessor, Set<Integer> handledIndices) {
+        super(null, basePreprocessor, handledIndices); // encodingValues stays null: only encodingFrequencies is read here.
+        this.encodingFrequencies = encodingFrequencies;
+    }
+
+    /**
+     * Applies this preprocessor: handled features are replaced by their encoding frequency, others are cast to double.
+     *
+     * @param k Key.
+     * @param v Value.
+     * @return Preprocessed row.
+     * @throws UnknownCategorialFeatureValue If a handled feature holds a category missing from its frequency map.
+     */
+    @Override public LabeledVector apply(K k, V v) {
+        LabeledVector tmp = basePreprocessor.apply(k, v);
+        double[] res = new double[tmp.size()];
+
+        for (int i = 0; i < res.length; i++) {
+            Object tmpObj = tmp.getRaw(i);
+            if (handledIndices.contains(i)) {
+                if (tmpObj.equals(Double.NaN) && encodingFrequencies[i].containsKey(KEY_FOR_NULL_VALUES))
+                    res[i] = encodingFrequencies[i].get(KEY_FOR_NULL_VALUES);
+                else if (encodingFrequencies[i].containsKey(tmpObj))
+                    res[i] = encodingFrequencies[i].get(tmpObj);
+                else
+                    throw new UnknownCategorialFeatureValue(tmpObj.toString());
+            }
+            else
+                res[i] = (double)tmpObj;
+        }
+        return new LabeledVector(VectorUtils.of(res), tmp.label());
+    }
+}
diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/EncoderType.java b/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/frequency/package-info.java
similarity index 73%
copy from modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/EncoderType.java
copy to modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/frequency/package-info.java
index 79e216c..2168750 100644
--- a/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/EncoderType.java
+++ b/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/frequency/package-info.java
@@ -15,17 +15,8 @@
  * limitations under the License.
  */
 
-package org.apache.ignite.ml.preprocessing.encoding;
-
 /**
- * Describes Encoder preprocessor types to define resulting model in EncoderTrainer.
- *
- * @see EncoderTrainer
+ * <!-- Package description. -->
+ * Contains frequency encoding preprocessor.
  */
-public enum EncoderType {
-    /** One hot encoder. */
-    ONE_HOT_ENCODER,
-
-    /** String encoder. */
-    STRING_ENCODER
-}
+package org.apache.ignite.ml.preprocessing.encoding.frequency;
diff --git a/modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/PreprocessingTestSuite.java b/modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/PreprocessingTestSuite.java
index 7b3d5fc..1822704 100644
--- a/modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/PreprocessingTestSuite.java
+++ b/modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/PreprocessingTestSuite.java
@@ -19,9 +19,10 @@ package org.apache.ignite.ml.preprocessing;
 
 import org.apache.ignite.ml.preprocessing.binarization.BinarizationPreprocessorTest;
 import org.apache.ignite.ml.preprocessing.binarization.BinarizationTrainerTest;
+import org.apache.ignite.ml.preprocessing.encoding.EncoderTrainerTest;
+import org.apache.ignite.ml.preprocessing.encoding.FrequencyEncoderPreprocessorTest;
 import org.apache.ignite.ml.preprocessing.encoding.OneHotEncoderPreprocessorTest;
 import org.apache.ignite.ml.preprocessing.encoding.StringEncoderPreprocessorTest;
-import org.apache.ignite.ml.preprocessing.encoding.EncoderTrainerTest;
 import org.apache.ignite.ml.preprocessing.imputing.ImputerPreprocessorTest;
 import org.apache.ignite.ml.preprocessing.imputing.ImputerTrainerTest;
 import org.apache.ignite.ml.preprocessing.minmaxscaling.MinMaxScalerPreprocessorTest;
@@ -44,6 +45,7 @@ import org.junit.runners.Suite;
     ImputerTrainerTest.class,
     EncoderTrainerTest.class,
     OneHotEncoderPreprocessorTest.class,
+    FrequencyEncoderPreprocessorTest.class,
     StringEncoderPreprocessorTest.class,
     NormalizationTrainerTest.class,
     NormalizationPreprocessorTest.class
diff --git a/modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/encoding/EncoderTrainerTest.java b/modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/encoding/EncoderTrainerTest.java
index 6fb760e..bc75647 100644
--- a/modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/encoding/EncoderTrainerTest.java
+++ b/modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/encoding/EncoderTrainerTest.java
@@ -161,4 +161,35 @@ public class EncoderTrainerTest extends TrainerTest {
 
         assertArrayEquals(new double[] {2.0, 0.0}, preprocessor.apply(7, new DenseVector(new Serializable[]{"Monday", "September"})).features().asArray(), 1e-8);
     }
+
+    /** Tests {@code fit()} with the frequency encoder: every category is encoded as its share of the six training rows. */
+    @Test
+    public void testFitOnStringCategorialFeaturesWithFrequencyEncoding() {
+        Map<Integer, Vector> data = new HashMap<>();
+        data.put(1, new DenseVector(new Serializable[] {"Monday", "September"}));
+        data.put(2, new DenseVector(new Serializable[] {"Monday", "August"}));
+        data.put(3, new DenseVector(new Serializable[] {"Monday", "August"}));
+        data.put(4, new DenseVector(new Serializable[] {"Friday", "June"}));
+        data.put(5, new DenseVector(new Serializable[] {"Friday", "June"}));
+        data.put(6, new DenseVector(new Serializable[] {"Sunday", "August"}));
+
+        final Vectorizer<Integer, Vector, Integer, Double> vectorizer = new DummyVectorizer<>(0, 1);
+
+        DatasetBuilder<Integer, Vector> datasetBuilder = new LocalDatasetBuilder<>(data, parts);
+
+        EncoderTrainer<Integer, Vector> strEncoderTrainer = new EncoderTrainer<Integer, Vector>()
+            .withEncoderType(EncoderType.FREQUENCY_ENCODER)
+            .withEncodedFeature(0)
+            .withEncodedFeature(1);
+
+        EncoderPreprocessor<Integer, Vector> preprocessor = strEncoderTrainer.fit(
+            TestUtils.testEnvBuilder(),
+            datasetBuilder,
+            vectorizer
+        );
+
+        assertArrayEquals(new double[] {0.5, 0.166}, preprocessor.apply(7, new DenseVector(new Serializable[] {"Monday", "September"})).features().asArray(), 0.1);
+        assertArrayEquals(new double[] {0.33, 0.5}, preprocessor.apply(7, new DenseVector(new Serializable[] {"Friday", "August"})).features().asArray(), 0.1);
+        assertArrayEquals(new double[] {0.166, 0.33}, preprocessor.apply(7, new DenseVector(new Serializable[] {"Sunday", "June"})).features().asArray(), 0.1);
+    }
 }
diff --git a/modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/encoding/FrequencyEncoderPreprocessorTest.java b/modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/encoding/FrequencyEncoderPreprocessorTest.java
new file mode 100644
index 0000000..4d9d6d1
--- /dev/null
+++ b/modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/encoding/FrequencyEncoderPreprocessorTest.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.ignite.ml.preprocessing.encoding;
+
+import java.io.Serializable;
+import java.util.HashMap;
+import java.util.HashSet;
+import org.apache.ignite.ml.dataset.feature.extractor.Vectorizer;
+import org.apache.ignite.ml.dataset.feature.extractor.impl.DummyVectorizer;
+import org.apache.ignite.ml.math.primitives.vector.Vector;
+import org.apache.ignite.ml.math.primitives.vector.impl.DenseVector;
+import org.apache.ignite.ml.preprocessing.encoding.frequency.FrequencyEncoderPreprocessor;
+import org.junit.Test;
+
+import static org.junit.Assert.assertArrayEquals;
+
+/**
+ * Tests for {@link FrequencyEncoderPreprocessor}.
+ */
+public class FrequencyEncoderPreprocessorTest {
+    /** Checks that {@code apply()} maps every category of every handled column to its configured frequency. */
+    @Test
+    public void testApply() {
+        Vector[] data = new Vector[] {
+            new DenseVector(new Serializable[] {"1", "Moscow", "A"}),
+            new DenseVector(new Serializable[] {"2", "Moscow", "B"}),
+            new DenseVector(new Serializable[] {"2", "Moscow", "B"}),
+        };
+
+        Vectorizer<Integer, Vector, Integer, Double> vectorizer = new DummyVectorizer<>(0, 1, 2);
+
+        FrequencyEncoderPreprocessor<Integer, Vector> preprocessor = new FrequencyEncoderPreprocessor<Integer, Vector>(
+            new HashMap[] {
+                new HashMap() {
+                    {
+                        put("1", 0.33);
+                        put("2", 0.66);
+                    }
+                }, new HashMap() {
+                {
+                    put("Moscow", 1.0);
+                }
+            }, new HashMap() {
+                {
+                    put("A", 0.33);
+                    put("B", 0.66);
+                }
+            }},
+            vectorizer,
+            new HashSet() {
+                {
+                    add(0);
+                    add(1);
+                    add(2);
+                }
+            });
+
+        double[][] postProcessedData = new double[][] {
+            {0.33, 1.0, 0.33},
+            {0.66, 1.0, 0.66},
+            {0.66, 1.0, 0.66},
+        };
+
+        for (int i = 0; i < data.length; i++)
+            assertArrayEquals(postProcessedData[i], preprocessor.apply(i, data[i]).features().asArray(), 0.1);
+    }
+}