You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@rya.apache.org by ca...@apache.org on 2017/08/21 20:40:35 UTC

[2/3] incubator-rya git commit: RYA-250 Added data duplication detection methods to Smart URI/Entities. These use configured tolerances for each data type to decide if an Entity is considered nearly equal. Also, string terms that are considered equival

http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/b319365e/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/DuplicateDataDetector.java
----------------------------------------------------------------------
diff --git a/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/DuplicateDataDetector.java b/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/DuplicateDataDetector.java
new file mode 100644
index 0000000..220db30
--- /dev/null
+++ b/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/DuplicateDataDetector.java
@@ -0,0 +1,1066 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.rya.indexing.smarturi.duplication;
+
+import static java.util.Objects.requireNonNull;
+
+import java.math.BigDecimal;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Objects;
+import java.util.Optional;
+import java.util.Set;
+import java.util.TreeSet;
+
+import org.apache.commons.configuration.ConfigurationException;
+import org.apache.commons.lang.StringUtils;
+import org.apache.rya.api.domain.RyaType;
+import org.apache.rya.api.domain.RyaURI;
+import org.apache.rya.api.resolver.impl.DateTimeRyaTypeResolver;
+import org.apache.rya.indexing.entity.model.Entity;
+import org.apache.rya.indexing.entity.model.Property;
+import org.apache.rya.indexing.smarturi.SmartUriAdapter;
+import org.apache.rya.indexing.smarturi.SmartUriException;
+import org.apache.rya.indexing.smarturi.duplication.conf.DuplicateDataConfig;
+import org.calrissian.mango.types.exception.TypeEncodingException;
+import org.joda.time.DateTime;
+import org.openrdf.model.URI;
+import org.openrdf.model.impl.URIImpl;
+import org.openrdf.model.vocabulary.XMLSchema;
+
+import com.google.common.collect.ImmutableMap;
+
+/**
+ * Detects if two entities contain data that's nearly identical based on a set
+ * tolerance for each field's type. Two entities are considered nearly
+ * identical if all their properties are equal and/or within the specified
+ * tolerance for the property's object type. Setting all object type tolerances
+ * to 0 means that the objects need to be exactly equal to each other to be
+ * considered duplicates. Duplicate data detection can be enabled/disabled
+ * through configuration and each object type can have a tolerance based on
+ * either the difference or the percentage difference between the objects being
+ * compared.
+ */
+public class DuplicateDataDetector {
+    private final Map<URI, ApproxEqualsDetector<?>> uriMap = new HashMap<>();
+    private final Map<Class<?>, ApproxEqualsDetector<?>> classMap = new HashMap<>();
+
+    private boolean isDetectionEnabled;
+
+    /**
+     * Builds a {@link DuplicateDataDetector} from a configuration object.
+     * Every per-type tolerance, the equivalent terms map, and the enabled
+     * flag are read from the config and handed, in the same order, to the
+     * constructor that accepts each of them individually.
+     * @param duplicateDataConfig the {@link DuplicateDataConfig} that supplies
+     * the settings. It is dereferenced immediately, so it must not be
+     * {@code null}.
+     */
+    public DuplicateDataDetector(final DuplicateDataConfig duplicateDataConfig) {
+        this(
+            duplicateDataConfig.getBooleanTolerance(), duplicateDataConfig.getByteTolerance(),
+            duplicateDataConfig.getDateTolerance(), duplicateDataConfig.getDoubleTolerance(),
+            duplicateDataConfig.getFloatTolerance(), duplicateDataConfig.getIntegerTolerance(),
+            duplicateDataConfig.getLongTolerance(), duplicateDataConfig.getShortTolerance(),
+            duplicateDataConfig.getStringTolerance(), duplicateDataConfig.getUriTolerance(),
+            duplicateDataConfig.getEquivalentTermsMap(),
+            duplicateDataConfig.isDetectionEnabled()
+        );
+    }
+
+    /**
+     * Creates a new instance of {@link DuplicateDataDetector} using a freshly
+     * constructed {@link DuplicateDataConfig} as the source of its settings.
+     * @throws ConfigurationException if creating the config fails.
+     */
+    public DuplicateDataDetector() throws ConfigurationException {
+        this(new DuplicateDataConfig());
+    }
+
+    /**
+     * Creates a detector that uses one difference-based tolerance for all types.
+     * @param tolerance the tolerance to assign to all types.
+     */
+    public DuplicateDataDetector(final double tolerance) {
+        this(new Tolerance(tolerance, ToleranceType.DIFFERENCE), new LinkedHashMap<>());
+    }
+
+    /**
+     * Creates a detector, with detection enabled, that shares one tolerance
+     * across every supported data type.
+     * @param tolerance the {@link Tolerance} to assign to all types.
+     * @param equivalentTermsMap the {@link Map} of equivalent terms. (not {@code null})
+     */
+    public DuplicateDataDetector(final Tolerance tolerance, final Map<String, List<String>> equivalentTermsMap) {
+        this(tolerance, tolerance, tolerance, tolerance, tolerance, tolerance, tolerance,
+            tolerance, tolerance, tolerance, equivalentTermsMap, true);
+    }
+
+    /**
+     * Creates a new instance of {@link DuplicateDataDetector} with a separately
+     * specified tolerance for each supported data type. All of the arguments
+     * are forwarded unchanged to the private initializer.
+     * @param booleanTolerance the {@link Boolean} tolerance value or
+     * {@code null} if not specified.
+     * @param byteTolerance the {@link Byte} tolerance value or {@code null} if
+     * not specified.
+     * @param dateTolerance the tolerance value applied to both {@link Date}
+     * and {@link DateTime} values or {@code null} if not specified.
+     * @param doubleTolerance the {@link Double} tolerance value or {@code null}
+     * if not specified.
+     * @param floatTolerance the {@link Float} tolerance value or {@code null}
+     * if not specified.
+     * @param integerTolerance the {@link Integer} tolerance value or
+     * {@code null} if not specified.
+     * @param longTolerance the {@link Long} tolerance or {@code null} if unset.
+     * @param shortTolerance the {@link Short} tolerance or {@code null} if unset.
+     * @param stringTolerance the {@link String} tolerance or {@code null} if unset.
+     * @param uriTolerance the {@link URI} tolerance or {@code null} if unset.
+     * @param equivalentTermsMap the {@link Map} of terms that are considered
+     * equivalent to each other. (not {@code null})
+     * @param isDetectionEnabled {@code true} to enable detection. {@code false}
+     * to disable detection.
+     */
+    public DuplicateDataDetector(
+            final Tolerance booleanTolerance, final Tolerance byteTolerance, final Tolerance dateTolerance,
+            final Tolerance doubleTolerance, final Tolerance floatTolerance, final Tolerance integerTolerance,
+            final Tolerance longTolerance, final Tolerance shortTolerance, final Tolerance stringTolerance,
+            final Tolerance uriTolerance, final Map<String, List<String>> equivalentTermsMap,
+            final boolean isDetectionEnabled) {
+        init(
+            booleanTolerance, byteTolerance, dateTolerance, doubleTolerance,
+            floatTolerance, integerTolerance, longTolerance, shortTolerance,
+            stringTolerance, uriTolerance, equivalentTermsMap, isDetectionEnabled);
+    }
+
+    private void init(final Tolerance booleanTolerance, final Tolerance byteTolerance,
+            final Tolerance dateTolerance, final Tolerance doubleTolerance, final Tolerance floatTolerance,
+            final Tolerance integerTolerance, final Tolerance longTolerance, final Tolerance shortTolerance,
+            final Tolerance stringTolerance, final Tolerance uriTolerance, final Map<String, List<String>> equivalentTermsMap,
+            final boolean isDetectionEnabled)
+    {
+        final List<ApproxEqualsDetector<?>> detectors = new ArrayList<>();
+        detectors.add(new BooleanApproxEqualsDetector(booleanTolerance));
+        detectors.add(new ByteApproxEqualsDetector(byteTolerance));
+        detectors.add(new DateApproxEqualsDetector(dateTolerance));
+        detectors.add(new DateTimeApproxEqualsDetector(dateTolerance));
+        detectors.add(new DoubleApproxEqualsDetector(doubleTolerance));
+        detectors.add(new FloatApproxEqualsDetector(floatTolerance));
+        detectors.add(new IntegerApproxEqualsDetector(integerTolerance));
+        detectors.add(new LongApproxEqualsDetector(longTolerance));
+        detectors.add(new ShortApproxEqualsDetector(shortTolerance));
+        detectors.add(new StringApproxEqualsDetector(stringTolerance, equivalentTermsMap));
+        detectors.add(new UriApproxEqualsDetector(uriTolerance));
+
+        for (final ApproxEqualsDetector<?> approxEqualsDetector : detectors) {
+            uriMap.put(approxEqualsDetector.getXmlSchemaUri(), approxEqualsDetector);
+            classMap.put(approxEqualsDetector.getTypeClass(), approxEqualsDetector);
+        }
+
+        this.isDetectionEnabled = isDetectionEnabled;
+    }
+
+    /**
+     * @return {@code true} if duplicate detection is enabled. {@code false}
+     * if it is disabled.
+     */
+    public boolean isDetectionEnabled() {
+        return isDetectionEnabled;
+    }
+
+    /**
+     * Removes any duplicate (nearly identical) entities from the collection
+     * of entities, in place. The first entity of each group of duplicates is
+     * kept. An entity is only dropped when it duplicates an entity that is
+     * itself kept, so a chain such as A~B, B~C (but not A~C) keeps both A and C.
+     * @param entities the {@link List} of {@link Entity}s. (not {@code null})
+     * It must support removal by index.
+     * @throws SmartUriException if a pair of entities could not be compared.
+     */
+    public void removeDuplicatesFromCollection(final List<Entity> entities) throws SmartUriException {
+        requireNonNull(entities);
+        // Descending order, so removing by index never shifts an index that
+        // is still waiting to be removed.
+        final Set<Integer> indicesToRemove = new TreeSet<>((a, b) -> Integer.compare(b, a));
+        for (int i = 0; i < entities.size() - 1; i++) {
+            if (indicesToRemove.contains(i)) {
+                // Already a duplicate of an earlier entity; it is going away
+                // so it must not cause other entities to be removed.
+                continue;
+            }
+            final Entity keptEntity = entities.get(i);
+            for (int j = entities.size() - 1; j > i; j--) {
+                if (!indicesToRemove.contains(j) && compareEntities(keptEntity, entities.get(j))) {
+                    indicesToRemove.add(j);
+                }
+            }
+        }
+        for (final int index : indicesToRemove) {
+            // "index" is a primitive so this is List.remove(int), not remove(Object).
+            entities.remove(index);
+        }
+    }
+
+    /**
+     * Determines whether the entities encoded in two Smart URI's hold nearly
+     * identical data. Each URI is deserialized into an {@link Entity} and the
+     * resulting pair is handed to {@link #compareEntities(Entity, Entity)}.
+     * @param uri1 the first Smart {@link URI}. (not {@code null})
+     * @param uri2 the second Smart {@link URI}. (not {@code null})
+     * @return {@code true} if the data is nearly identical, else {@code false}.
+     * @throws SmartUriException if the URIs could not be deserialized or compared.
+     */
+    public boolean compareSmartUris(final URI uri1, final URI uri2) throws SmartUriException {
+        requireNonNull(uri1);
+        requireNonNull(uri2);
+        return compareEntities(
+            SmartUriAdapter.deserializeUriEntity(uri1), SmartUriAdapter.deserializeUriEntity(uri2));
+    }
+
+    /**
+     * Compares two entities to determine if they have nearly identical data.
+     * The entities must have the same set of explicit types, and every
+     * property of {@code entity1} must exist on {@code entity2} with a value
+     * that is approximately equal according to the detector registered for the
+     * data type of {@code entity1}'s value.
+     * @param entity1 the first {@link Entity}. (not {@code null})
+     * @param entity2 the second {@link Entity}. (not {@code null})
+     * @return {@code true} if the two entities have nearly identical data.
+     * {@code false} otherwise.
+     * @throws SmartUriException if a property's data type has no registered
+     * detector.
+     */
+    public boolean compareEntities(final Entity entity1, final Entity entity2) throws SmartUriException {
+        requireNonNull(entity1);
+        requireNonNull(entity2);
+        // The type lists must hold the same types. Checking containment in
+        // only one direction would let an entity match another whose types
+        // are merely a subset of its own.
+        final List<RyaURI> types1 = entity1.getExplicitTypeIds();
+        final List<RyaURI> types2 = entity2.getExplicitTypeIds();
+        final boolean doBothHaveSameTypes = types1.containsAll(types2) && types2.containsAll(types1);
+        if (!doBothHaveSameTypes) {
+            return false;
+        }
+
+        // NOTE(review): only entity1's properties are walked, so properties
+        // that exist solely on entity2 are never examined -- confirm that this
+        // is intended.
+        for (final Entry<RyaURI, ImmutableMap<RyaURI, Property>> entry : entity1.getProperties().entrySet()) {
+            final RyaURI typeIdUri = entry.getKey();
+            for (final Entry<RyaURI, Property> typeProperty : entry.getValue().entrySet()) {
+                final RyaURI propertyNameUri = typeProperty.getKey();
+                final Optional<Property> p2 = entity2.lookupTypeProperty(typeIdUri, propertyNameUri);
+                if (!p2.isPresent()) {
+                    // entity2 is missing a property that entity1 has.
+                    return false;
+                }
+                final RyaType value1 = typeProperty.getValue().getValue();
+                final RyaType value2 = p2.get().getValue();
+                // The detector is chosen by the data type of entity1's value.
+                final URI xmlSchemaUri1 = value1.getDataType();
+                final ApproxEqualsDetector<?> approxEqualsDetector = uriMap.get(xmlSchemaUri1);
+                if (approxEqualsDetector == null) {
+                    throw new SmartUriException("No appropriate detector found for the type: " + xmlSchemaUri1);
+                }
+                if (!approxEqualsDetector.areApproxEquals(value1.getData(), value2.getData())) {
+                    return false;
+                }
+            }
+        }
+        return true;
+    }
+
+    /**
+     * Gets the appropriate {@link ApproxEqualsDetector} for the specified
+     * class. The lookup is a plain map lookup keyed on the class itself.
+     * @param clazz the {@link Class} to find an {@link ApproxEqualsDetector}
+     * for.
+     * @return the {@link ApproxEqualsDetector} for the class or {@code null} if
+     * none could be found.
+     */
+    public ApproxEqualsDetector<?> getDetectorForType(final Class<?> clazz) {
+        return classMap.get(clazz);
+    }
+
+    private static boolean isOnlyOneNull(final Object lhs, final Object rhs) {
+        return (lhs == null) != (rhs == null);
+    }
+
+    /**
+     * Decides whether two {@link Boolean} values should be treated as
+     * approximately equal.
+     */
+    public static class BooleanApproxEqualsDetector implements ApproxEqualsDetector<Boolean> {
+        private static final Tolerance DEFAULT_TOLERANCE = new Tolerance(0.0, ToleranceType.DIFFERENCE);
+        private final Tolerance tolerance;
+
+        /**
+         * Creates a new instance of {@link BooleanApproxEqualsDetector}.
+         * @param tolerance the {@link Tolerance} or {@code null} for the default.
+         */
+        public BooleanApproxEqualsDetector(final Tolerance tolerance) {
+            this.tolerance = (tolerance == null) ? getDefaultTolerance() : tolerance;
+        }
+
+        @Override
+        public boolean areObjectsApproxEquals(final Boolean lhs, final Boolean rhs) {
+            // Any non-zero tolerance makes every pair match; a zero tolerance
+            // requires the two values to be exactly equal.
+            return tolerance.getValue() != 0 || Objects.equals(lhs, rhs);
+        }
+
+        @Override
+        public Tolerance getDefaultTolerance() {
+            return DEFAULT_TOLERANCE;
+        }
+
+        @Override
+        public Boolean convertStringToObject(final String string) throws SmartUriException {
+            return Boolean.valueOf(string);
+        }
+
+        @Override
+        public Class<?> getTypeClass() {
+            return Boolean.class;
+        }
+
+        @Override
+        public URI getXmlSchemaUri() {
+            return XMLSchema.BOOLEAN;
+        }
+    }
+
+    /**
+     * Class to detect if two bytes are considered approximately equal to each
+     * other. Percentage tolerances are measured relative to the left value.
+     */
+    public static class ByteApproxEqualsDetector implements ApproxEqualsDetector<Byte> {
+        private static final Tolerance DEFAULT_TOLERANCE = new Tolerance(0.0, ToleranceType.DIFFERENCE);
+        private final Tolerance tolerance;
+
+        /**
+         * Creates a new instance of {@link ByteApproxEqualsDetector}.
+         * @param tolerance the {@link Tolerance} or {@code null} for the default.
+         */
+        public ByteApproxEqualsDetector(final Tolerance tolerance) {
+            this.tolerance = tolerance != null ? tolerance : getDefaultTolerance();
+        }
+
+        @Override
+        public boolean areObjectsApproxEquals(final Byte lhs, final Byte rhs) {
+            if (isOnlyOneNull(lhs, rhs)) {
+                return false;
+            }
+            if (Objects.equals(lhs, rhs)) {
+                return true;
+            } else if (tolerance.getValue() == 0) {
+                // If they're not exactly equals with zero tolerance then get out
+                return false;
+            }
+            final int absDiff = Math.abs(lhs.intValue() - rhs.intValue()); // int math: no overflow
+            switch (tolerance.getToleranceType()) {
+                case PERCENTAGE:
+                    if (lhs == 0) {
+                        return false; // values are already known to differ
+                    }
+                    if (tolerance.getValue() >= 1) {
+                        return true;
+                    }
+                    // Divide by the magnitude so a negative lhs can't yield a negative ratio.
+                    return ((double) absDiff / Math.abs(lhs.intValue())) <= tolerance.getValue();
+                case DIFFERENCE:
+                default:
+                    return absDiff <= tolerance.getValue();
+            }
+        }
+
+        @Override
+        public Tolerance getDefaultTolerance() {
+            return DEFAULT_TOLERANCE;
+        }
+
+        @Override
+        public Byte convertStringToObject(final String string) throws SmartUriException {
+            return Byte.valueOf(string);
+        }
+
+        @Override
+        public Class<?> getTypeClass() {
+            return Byte.class;
+        }
+
+        @Override
+        public URI getXmlSchemaUri() {
+            return XMLSchema.BYTE;
+        }
+    }
+
+    /**
+     * Class to detect if two dates are considered approximately equal to each
+     * other. Difference tolerances are expressed in milliseconds.
+     */
+    public static class DateApproxEqualsDetector implements ApproxEqualsDetector<Date> {
+        private static final Tolerance DEFAULT_TOLERANCE = new Tolerance(500.0, ToleranceType.DIFFERENCE); // milliseconds
+        private final Tolerance tolerance;
+
+        /**
+         * Creates a new instance of {@link DateApproxEqualsDetector}.
+         * @param tolerance the {@link Tolerance} or {@code null} for the default.
+         */
+        public DateApproxEqualsDetector(final Tolerance tolerance) {
+            this.tolerance = tolerance != null ? tolerance : getDefaultTolerance();
+        }
+
+        @Override
+        public boolean areObjectsApproxEquals(final Date lhs, final Date rhs) {
+            if (isOnlyOneNull(lhs, rhs)) {
+                return false;
+            }
+            if (Objects.equals(lhs, rhs)) {
+                return true;
+            } else if (tolerance.getValue() == 0) {
+                // If they're not exactly equals with zero tolerance then get out
+                return false;
+            }
+            final long lhsTime = lhs.getTime();
+            final long rhsTime = rhs.getTime();
+            final long absDiff = Math.abs(lhsTime - rhsTime);
+            switch (tolerance.getToleranceType()) {
+                case PERCENTAGE:
+                    if (lhsTime == 0) {
+                        return lhsTime == rhsTime;
+                    }
+                    if (tolerance.getValue() >= 1) {
+                        return true;
+                    }
+                    // Pre-epoch times are negative; divide by the magnitude so the ratio stays positive.
+                    return ((double) absDiff / Math.abs(lhsTime)) <= tolerance.getValue();
+                case DIFFERENCE:
+                default:
+                    return absDiff <= tolerance.getValue();
+            }
+        }
+
+        @Override
+        public Tolerance getDefaultTolerance() {
+            return DEFAULT_TOLERANCE;
+        }
+
+        @Override
+        public Date convertStringToObject(final String string) throws SmartUriException {
+            try {
+                return DateTime.parse(string, DateTimeRyaTypeResolver.XMLDATETIME_PARSER).toDate();
+            } catch (final TypeEncodingException e) {
+                throw new SmartUriException("Exception occurred serializing data[" + string + "]", e);
+            } catch (final IllegalArgumentException e) {
+                // Joda-Time reports text it cannot parse with this exception type.
+                throw new SmartUriException("Exception occurred serializing data[" + string + "]", e);
+            }
+        }
+
+        @Override
+        public Class<?> getTypeClass() {
+            return Date.class;
+        }
+
+        @Override
+        public URI getXmlSchemaUri() {
+            return XMLSchema.DATE;
+        }
+    }
+
+    /**
+     * Class to detect if two datetimes are considered approximately equal to
+     * each other.
+     */
+    public static class DateTimeApproxEqualsDetector implements ApproxEqualsDetector<DateTime> {
+        private static final Tolerance DEFAULT_TOLERANCE = new Tolerance(500.0, ToleranceType.DIFFERENCE); // milliseconds
+        private final Tolerance tolerance;
+
+        /**
+         * Creates a new instance of {@link DateTimeApproxEqualsDetector}.
+         * @param tolerance the {@link Tolerance}.
+         */
+        public DateTimeApproxEqualsDetector(final Tolerance tolerance) {
+            this.tolerance = tolerance != null ? tolerance : getDefaultTolerance();
+        }
+
+        @Override
+        public boolean areObjectsApproxEquals(final DateTime lhs, final DateTime rhs) {
+            if (isOnlyOneNull(lhs, rhs)) {
+                return false;
+            }
+            if (Objects.equals(lhs, rhs)) {
+                // They're exactly equals so get out
+                return true;
+            } else if (tolerance.getValue() == 0) {
+                // If they're not exactly equals with zero tolerance then get out
+                return false;
+            }
+            // Check based on tolerance
+            final long lhsTime = lhs.getMillis();
+            final long rhsTime = rhs.getMillis();
+            switch (tolerance.getToleranceType()) {
+                case PERCENTAGE:
+                    if (lhsTime == 0) {
+                        return lhsTime == rhsTime;
+                    }
+                    if (tolerance.getValue() >= 1) {
+                        return true;
+                    }
+                    return ((double)Math.abs(lhsTime - rhsTime) / lhsTime) <= tolerance.getValue();
+                case DIFFERENCE:
+                default:
+                    return Math.abs(lhsTime - rhsTime) <= tolerance.getValue();
+            }
+        }
+
+        @Override
+        public Tolerance getDefaultTolerance() {
+            return DEFAULT_TOLERANCE;
+        }
+
+        @Override
+        public DateTime convertStringToObject(final String string) throws SmartUriException {
+            DateTime dateTime = null;
+            try {
+                dateTime = DateTime.parse(string, DateTimeRyaTypeResolver.XMLDATETIME_PARSER);
+            } catch (final TypeEncodingException e) {
+                throw new SmartUriException("Exception occurred serializing data[" + string + "]", e);
+            }
+            return dateTime;
+        }
+
+        @Override
+        public Class<?> getTypeClass() {
+            return DateTime.class;
+        }
+
+        @Override
+        public URI getXmlSchemaUri() {
+            return XMLSchema.DATETIME;
+        }
+    }
+
+    /**
+     * Class to detect if two doubles are considered approximately equal to each
+     * other. Percentage tolerances are measured relative to the left value.
+     */
+    public static class DoubleApproxEqualsDetector implements ApproxEqualsDetector<Double> {
+        private static final Tolerance DEFAULT_TOLERANCE = new Tolerance(0.0001, ToleranceType.PERCENTAGE);
+        private final Tolerance tolerance;
+
+        /**
+         * Creates a new instance of {@link DoubleApproxEqualsDetector}.
+         * @param tolerance the {@link Tolerance} or {@code null} for the default.
+         */
+        public DoubleApproxEqualsDetector(final Tolerance tolerance) {
+            this.tolerance = tolerance != null ? tolerance : getDefaultTolerance();
+        }
+
+        @Override
+        public boolean areObjectsApproxEquals(final Double lhs, final Double rhs) {
+            if (isOnlyOneNull(lhs, rhs)) {
+                return false;
+            }
+            if (Objects.equals(lhs, rhs)) {
+                return true;
+            } else if (tolerance.getValue() == 0) {
+                // If they're not exactly equals with zero tolerance then get out
+                return false;
+            }
+            if (lhs.isNaN() || lhs.isInfinite() || rhs.isNaN() || rhs.isInfinite()) {
+                // Not exactly equal, and BigDecimal cannot represent these values.
+                return false;
+            }
+            // Doubles can't store values like 0.1 exactly, so go through BigDecimal's
+            // String constructor to keep the arithmetic predictable.
+            final BigDecimal lhsBd = new BigDecimal(String.valueOf(lhs));
+            final BigDecimal rhsBd = new BigDecimal(String.valueOf(rhs));
+            final BigDecimal absDiff = lhsBd.subtract(rhsBd).abs();
+            switch (tolerance.getToleranceType()) {
+                case PERCENTAGE:
+                    if (lhs == 0) {
+                        // Compare numerically (0.0 vs -0.0), not by reference.
+                        return lhs.doubleValue() == rhs.doubleValue();
+                    }
+                    if (tolerance.getValue() >= 1) {
+                        return true;
+                    }
+                    try {
+                        // Divide by the magnitude so a negative lhs can't produce a negative ratio.
+                        return absDiff.divide(lhsBd.abs()).doubleValue() <= tolerance.getValue();
+                    } catch (final ArithmeticException e) {
+                        // Non-terminating decimal expansion; fall back to double arithmetic.
+                        return (Math.abs(lhs - rhs) / Math.abs(lhs)) <= tolerance.getValue();
+                    }
+                case DIFFERENCE:
+                default:
+                    return absDiff.doubleValue() <= tolerance.getValue();
+            }
+        }
+
+        @Override
+        public Tolerance getDefaultTolerance() {
+            return DEFAULT_TOLERANCE;
+        }
+
+        @Override
+        public Double convertStringToObject(final String string) throws SmartUriException {
+            return Double.valueOf(string);
+        }
+
+        @Override
+        public Class<?> getTypeClass() {
+            return Double.class;
+        }
+
+        @Override
+        public URI getXmlSchemaUri() {
+            return XMLSchema.DOUBLE;
+        }
+    }
+
+    /**
+     * Class to detect if two floats are considered approximately equal to each
+     * other. Percentage tolerances are measured relative to the left value.
+     */
+    public static class FloatApproxEqualsDetector implements ApproxEqualsDetector<Float> {
+        private static final Tolerance DEFAULT_TOLERANCE = new Tolerance(0.0001, ToleranceType.PERCENTAGE);
+        private final Tolerance tolerance;
+
+        /**
+         * Creates a new instance of {@link FloatApproxEqualsDetector}.
+         * @param tolerance the {@link Tolerance} or {@code null} for the default.
+         */
+        public FloatApproxEqualsDetector(final Tolerance tolerance) {
+            this.tolerance = tolerance != null ? tolerance : getDefaultTolerance();
+        }
+
+        @Override
+        public boolean areObjectsApproxEquals(final Float lhs, final Float rhs) {
+            if (isOnlyOneNull(lhs, rhs)) {
+                return false;
+            }
+            if (Objects.equals(lhs, rhs)) {
+                return true;
+            } else if (tolerance.getValue() == 0) {
+                // If they're not exactly equals with zero tolerance then get out
+                return false;
+            }
+            if (lhs.isNaN() || lhs.isInfinite() || rhs.isNaN() || rhs.isInfinite()) {
+                // Not exactly equal, and BigDecimal cannot represent these values.
+                return false;
+            }
+            // Floats can't store values like 0.1 exactly, so go through BigDecimal's
+            // String constructor to keep the arithmetic predictable. Results are read
+            // back as doubles so they compare cleanly against the double tolerance.
+            final BigDecimal lhsBd = new BigDecimal(String.valueOf(lhs));
+            final BigDecimal rhsBd = new BigDecimal(String.valueOf(rhs));
+            final BigDecimal absDiff = lhsBd.subtract(rhsBd).abs();
+            switch (tolerance.getToleranceType()) {
+                case PERCENTAGE:
+                    if (lhs == 0) {
+                        // Compare numerically (0.0 vs -0.0), not by reference.
+                        return lhs.floatValue() == rhs.floatValue();
+                    }
+                    if (tolerance.getValue() >= 1) {
+                        return true;
+                    }
+                    try {
+                        // Divide by the magnitude so a negative lhs can't produce a negative ratio.
+                        return absDiff.divide(lhsBd.abs()).doubleValue() <= tolerance.getValue();
+                    } catch (final ArithmeticException e) {
+                        // Non-terminating decimal expansion; fall back to floating point arithmetic.
+                        return ((double) Math.abs(lhs - rhs) / Math.abs(lhs)) <= tolerance.getValue();
+                    }
+                case DIFFERENCE:
+                default:
+                    return absDiff.doubleValue() <= tolerance.getValue();
+            }
+        }
+
+        @Override
+        public Tolerance getDefaultTolerance() {
+            return DEFAULT_TOLERANCE;
+        }
+
+        @Override
+        public Float convertStringToObject(final String string) throws SmartUriException {
+            return Float.valueOf(string);
+        }
+
+        @Override
+        public Class<?> getTypeClass() {
+            return Float.class;
+        }
+
+        @Override
+        public URI getXmlSchemaUri() {
+            return XMLSchema.FLOAT;
+        }
+    }
+
+    /**
+     * Class to detect if two integers are considered approximately equal to
+     * each other. Percentage tolerances are measured relative to the left value.
+     */
+    public static class IntegerApproxEqualsDetector implements ApproxEqualsDetector<Integer> {
+        private static final Tolerance DEFAULT_TOLERANCE = new Tolerance(1.0, ToleranceType.DIFFERENCE);
+        private final Tolerance tolerance;
+
+        /**
+         * Creates a new instance of {@link IntegerApproxEqualsDetector}.
+         * @param tolerance the {@link Tolerance} or {@code null} for the default.
+         */
+        public IntegerApproxEqualsDetector(final Tolerance tolerance) {
+            this.tolerance = tolerance != null ? tolerance : getDefaultTolerance();
+        }
+
+        @Override
+        public boolean areObjectsApproxEquals(final Integer lhs, final Integer rhs) {
+            if (isOnlyOneNull(lhs, rhs)) {
+                return false;
+            }
+            if (Objects.equals(lhs, rhs)) {
+                return true;
+            } else if (tolerance.getValue() == 0) {
+                // If they're not exactly equals with zero tolerance then get out
+                return false;
+            }
+            final long absDiff = Math.abs(lhs.longValue() - rhs.longValue()); // long: int subtraction can overflow
+            switch (tolerance.getToleranceType()) {
+                case PERCENTAGE:
+                    if (lhs == 0) {
+                        return false; // values are already known to differ
+                    }
+                    if (tolerance.getValue() >= 1) {
+                        return true;
+                    }
+                    // Divide by the magnitude so a negative lhs can't yield a negative ratio.
+                    return ((double) absDiff / Math.abs(lhs.longValue())) <= tolerance.getValue();
+                case DIFFERENCE:
+                default:
+                    return absDiff <= tolerance.getValue();
+            }
+        }
+
+        @Override
+        public Tolerance getDefaultTolerance() {
+            return DEFAULT_TOLERANCE;
+        }
+
+        @Override
+        public Integer convertStringToObject(final String string) throws SmartUriException {
+            return Integer.valueOf(string);
+        }
+
+        @Override
+        public Class<?> getTypeClass() {
+            return Integer.class;
+        }
+
+        @Override
+        public URI getXmlSchemaUri() {
+            return XMLSchema.INTEGER;
+        }
+    }
+
+    /**
+     * Class to detect if two longs are considered approximately equal to
+     * each other.
+     */
+    public static class LongApproxEqualsDetector implements ApproxEqualsDetector<Long> {
+        private static final Tolerance DEFAULT_TOLERANCE = new Tolerance(1.0, ToleranceType.DIFFERENCE);
+        private final Tolerance tolerance;
+
+        /**
+         * Creates a new instance of {@link LongApproxEqualsDetector}.
+         * @param tolerance the {@link Tolerance}.
+         */
+        public LongApproxEqualsDetector(final Tolerance tolerance) {
+            this.tolerance = tolerance != null ? tolerance : getDefaultTolerance();
+        }
+
+        @Override
+        public boolean areObjectsApproxEquals(final Long lhs, final Long rhs) {
+            if (isOnlyOneNull(lhs, rhs)) {
+                return false;
+            }
+            if (Objects.equals(lhs, rhs)) {
+                // They're exactly equals so get out
+                return true;
+            } else if (tolerance.getValue() == 0) {
+                // If they're not exactly equals with zero tolerance then get out
+                return false;
+            }
+            // Check based on tolerance
+            switch (tolerance.getToleranceType()) {
+                case PERCENTAGE:
+                    if (lhs == 0) {
+                        return lhs == rhs;
+                    }
+                    if (tolerance.getValue() >= 1) {
+                        return true;
+                    }
+                    return ((double)Math.abs(lhs - rhs) / lhs) <= tolerance.getValue();
+                case DIFFERENCE:
+                default:
+                    return Math.abs(lhs - rhs) <= tolerance.getValue();
+            }
+        }
+
+        @Override
+        public Tolerance getDefaultTolerance() {
+            return DEFAULT_TOLERANCE;
+        }
+
+        @Override
+        public Long convertStringToObject(final String string) throws SmartUriException {
+            return Long.valueOf(string);
+        }
+
+        @Override
+        public Class<?> getTypeClass() {
+            return Long.class;
+        }
+
+        @Override
+        public URI getXmlSchemaUri() {
+            return XMLSchema.LONG;
+        }
+    }
+
+    /**
+     * Class to detect if two shorts are considered approximately equal to each
+     * other.
+     */
+    public static class ShortApproxEqualsDetector implements ApproxEqualsDetector<Short> {
+        private static final Tolerance DEFAULT_TOLERANCE = new Tolerance(1.0, ToleranceType.DIFFERENCE);
+        private final Tolerance tolerance;
+
+        /**
+         * Creates a new instance of {@link ShortApproxEqualsDetector}.
+         * @param tolerance the {@link Tolerance}.
+         */
+        public ShortApproxEqualsDetector(final Tolerance tolerance) {
+            this.tolerance = tolerance != null ? tolerance : getDefaultTolerance();
+        }
+
+        @Override
+        public boolean areObjectsApproxEquals(final Short lhs, final Short rhs) {
+            if (isOnlyOneNull(lhs, rhs)) {
+                return false;
+            }
+            if (Objects.equals(lhs, rhs)) {
+                // They're exactly equals so get out
+                return true;
+            } else if (tolerance.getValue() == 0) {
+                // If they're not exactly equals with zero tolerance then get out
+                return false;
+            }
+            // Check based on tolerance
+            switch (tolerance.getToleranceType()) {
+                case PERCENTAGE:
+                    if (lhs == 0) {
+                        return lhs == rhs;
+                    }
+                    if (tolerance.getValue() >= 1) {
+                        return true;
+                    }
+                    return ((double)Math.abs(lhs - rhs) / lhs) <= tolerance.getValue();
+                case DIFFERENCE:
+                default:
+                    return Math.abs(lhs - rhs) <= tolerance.getValue();
+            }
+        }
+
+        @Override
+        public Tolerance getDefaultTolerance() {
+            return DEFAULT_TOLERANCE;
+        }
+
+        @Override
+        public Short convertStringToObject(final String string) throws SmartUriException {
+            return Short.valueOf(string);
+        }
+
+        @Override
+        public Class<?> getTypeClass() {
+            return Short.class;
+        }
+
+        @Override
+        public URI getXmlSchemaUri() {
+            return XMLSchema.SHORT;
+        }
+    }
+
+    /**
+     * Class to detect if two string are considered approximately equal to each
+     * other.
+     */
+    public static class StringApproxEqualsDetector implements ApproxEqualsDetector<String> {
+        private static final Tolerance DEFAULT_TOLERANCE = new Tolerance(0.05, ToleranceType.PERCENTAGE);
+        private final Tolerance tolerance;
+        private final Map<String, List<String>> equivalentTermsMap;
+
+        /**
+         * Creates a new instance of {@link StringApproxEqualsDetector}.
+         * @param tolerance the {@link Tolerance}.
+         */
+        public StringApproxEqualsDetector(final Tolerance tolerance, final Map<String, List<String>> equivalentTermsMap) {
+            this.tolerance = tolerance != null ? tolerance : getDefaultTolerance();
+            this.equivalentTermsMap = equivalentTermsMap;
+        }
+
+        @Override
+        public boolean areObjectsApproxEquals(final String lhs, final String rhs) {
+            if (isOnlyOneNull(lhs, rhs)) {
+                return false;
+            }
+            if (StringUtils.equalsIgnoreCase(lhs, rhs)) {
+                // They're exactly equals so get out
+                return true;
+            } else if (tolerance.getValue() == 0) {
+                // If they're not exactly equals with zero tolerance then get out
+                return false;
+            }
+
+            // Only check one-way. Terms are not bi-directionally equivalent
+            // unless specified.
+            final List<String> lhsTermEquivalents = equivalentTermsMap.get(lhs);
+            if (lhsTermEquivalents != null && lhsTermEquivalents.contains(rhs)) {
+                return true;
+            }
+            final int distance = StringUtils.getLevenshteinDistance(lhs, rhs);
+            // Check based on tolerance
+            switch (tolerance.getToleranceType()) {
+                case PERCENTAGE:
+                    if (lhs.length() == 0) {
+                        return lhs.length() == rhs.length();
+                    }
+                    if (tolerance.getValue() >= 1) {
+                        return true;
+                    }
+                    return ((double)distance / lhs.length()) <= tolerance.getValue();
+                case DIFFERENCE:
+                default:
+                    return distance <= tolerance.getValue();
+            }
+        }
+
+        @Override
+        public Tolerance getDefaultTolerance() {
+            return DEFAULT_TOLERANCE;
+        }
+
+        @Override
+        public String convertStringToObject(final String string) throws SmartUriException {
+            return string;
+        }
+
+        @Override
+        public Class<?> getTypeClass() {
+            return String.class;
+        }
+
+        @Override
+        public URI getXmlSchemaUri() {
+            return XMLSchema.STRING;
+        }
+    }
+
+    /**
+     * Detector that decides whether two URIs are close enough to be treated as
+     * the same value. The URIs are compared through their string forms: first
+     * ignoring case and then by Levenshtein distance against the tolerance.
+     */
+    public static class UriApproxEqualsDetector implements ApproxEqualsDetector<URI> {
+        private static final Tolerance DEFAULT_TOLERANCE = new Tolerance(1.0, ToleranceType.DIFFERENCE);
+        private final Tolerance tolerance;
+
+        /**
+         * Creates a new instance of {@link UriApproxEqualsDetector}.
+         * @param tolerance the {@link Tolerance} to apply. The default
+         * tolerance is substituted when this is {@code null}.
+         */
+        public UriApproxEqualsDetector(final Tolerance tolerance) {
+            if (tolerance == null) {
+                this.tolerance = getDefaultTolerance();
+            } else {
+                this.tolerance = tolerance;
+            }
+        }
+
+        @Override
+        public boolean areObjectsApproxEquals(final URI lhs, final URI rhs) {
+            if (isOnlyOneNull(lhs, rhs)) {
+                return false;
+            }
+            if (Objects.equals(lhs, rhs)) {
+                return true;
+            }
+
+            final String lhsText = lhs.stringValue();
+            final String rhsText = rhs.stringValue();
+            if (StringUtils.equalsIgnoreCase(lhsText, rhsText)) {
+                // Same text apart from letter case
+                return true;
+            }
+            if (tolerance.getValue() == 0) {
+                // Zero tolerance leaves no room for any difference
+                return false;
+            }
+
+            final int editDistance = StringUtils.getLevenshteinDistance(lhsText, rhsText);
+            // Interpret the edit distance according to the tolerance type
+            switch (tolerance.getToleranceType()) {
+                case PERCENTAGE:
+                    final int baseLength = lhsText.length();
+                    if (baseLength == 0) {
+                        return baseLength == rhsText.length();
+                    }
+                    if (tolerance.getValue() >= 1) {
+                        return true;
+                    }
+                    final double ratio = (double) editDistance / baseLength;
+                    return ratio <= tolerance.getValue();
+                case DIFFERENCE:
+                default:
+                    return editDistance <= tolerance.getValue();
+            }
+        }
+
+        @Override
+        public Tolerance getDefaultTolerance() {
+            return DEFAULT_TOLERANCE;
+        }
+
+        @Override
+        public URI convertStringToObject(final String string) throws SmartUriException {
+            return new URIImpl(string);
+        }
+
+        @Override
+        public Class<?> getTypeClass() {
+            return URI.class;
+        }
+
+        @Override
+        public URI getXmlSchemaUri() {
+            return XMLSchema.ANYURI;
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/b319365e/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/EntityNearDuplicateException.java
----------------------------------------------------------------------
diff --git a/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/EntityNearDuplicateException.java b/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/EntityNearDuplicateException.java
new file mode 100644
index 0000000..8bdf54f
--- /dev/null
+++ b/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/EntityNearDuplicateException.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.rya.indexing.smarturi.duplication;
+
+import org.apache.rya.indexing.entity.model.Entity;
+import org.apache.rya.indexing.entity.storage.EntityStorage.EntityStorageException;
+
+/**
+ * An {@link Entity} could not be created because another entity is a nearly
+ * identical duplicate based on the configured tolerances.
+ */
+public class EntityNearDuplicateException extends EntityStorageException {
+    private static final long serialVersionUID = 1L;
+
+    /**
+     * Creates a new instance of {@link EntityNearDuplicateException}.
+     * @param message the message to be displayed by the exception.
+     */
+    public EntityNearDuplicateException(final String message) {
+        super(message);
+    }
+
+    /**
+     * Creates a new instance of {@link EntityNearDuplicateException}.
+     * @param message the message to be displayed by the exception.
+     * @param throwable the source {@link Throwable} cause of the exception.
+     */
+    public EntityNearDuplicateException(final String message, final Throwable throwable) {
+        super(message, throwable);
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/b319365e/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/Tolerance.java
----------------------------------------------------------------------
diff --git a/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/Tolerance.java b/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/Tolerance.java
new file mode 100644
index 0000000..772522c
--- /dev/null
+++ b/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/Tolerance.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.rya.indexing.smarturi.duplication;
+
+import static java.util.Objects.requireNonNull;
+
+import java.text.NumberFormat;
+
+/**
+ * A tolerance value paired with the {@link ToleranceType} that determines how
+ * the value is to be interpreted.
+ */
+public class Tolerance {
+    private final Double value;
+    private final ToleranceType toleranceType;
+
+    /**
+     * Creates a new instance of {@link Tolerance}.
+     * @param value the tolerance value. (not {@code null})
+     * @param toleranceType the {@link ToleranceType}. (not {@code null})
+     * @throws NullPointerException if either argument is {@code null}.
+     */
+    public Tolerance(final Double value, final ToleranceType toleranceType) {
+        this.value = requireNonNull(value);
+        this.toleranceType = requireNonNull(toleranceType);
+    }
+
+    /**
+     * @return the tolerance value.
+     */
+    public Double getValue() {
+        return value;
+    }
+
+    /**
+     * @return the {@link ToleranceType}.
+     */
+    public ToleranceType getToleranceType() {
+        return toleranceType;
+    }
+
+    /**
+     * @return the value formatted as a percentage (using the default locale's
+     * percent format) for {@link ToleranceType#PERCENTAGE}, or the plain
+     * value for {@link ToleranceType#DIFFERENCE}.
+     */
+    @Override
+    public String toString() {
+        switch (toleranceType) {
+            case PERCENTAGE:
+                return NumberFormat.getPercentInstance().format(value);
+            case DIFFERENCE:
+                return value.toString();
+            default:
+                return "Unknown Tolerance Type with value: " + value.toString();
+        }
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/b319365e/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/ToleranceType.java
----------------------------------------------------------------------
diff --git a/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/ToleranceType.java b/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/ToleranceType.java
new file mode 100644
index 0000000..29faff1
--- /dev/null
+++ b/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/ToleranceType.java
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.rya.indexing.smarturi.duplication;
+
+import org.apache.commons.lang3.StringUtils;
+
+/**
+ * The types of methods available to use for calculating tolerance.
+ */
+public enum ToleranceType {
+    /**
+     * Indicates that the difference between two values must be within the
+     * specified tolerance value to be accepted.
+     */
+    DIFFERENCE,
+    /**
+     * Indicates that the difference between two values divided by the original
+     * value must fall within the specified tolerance percentage value to be
+     * accepted.
+     */
+    PERCENTAGE;
+
+    /**
+     * Returns the {@link ToleranceType} that matches the specified name.
+     * The lookup is done with {@link #valueOf(String)}, so the name must match
+     * a constant exactly, including case.
+     * @param name the name to find.
+     * @return the {@link ToleranceType} or {@code null} if {@code name} is
+     * {@code null}, empty, or only whitespace.
+     * @throws IllegalArgumentException if {@code name} is not blank and does
+     * not match any constant.
+     */
+    public static ToleranceType getToleranceTypeByName(final String name) {
+        if (StringUtils.isNotBlank(name)) {
+            return ToleranceType.valueOf(name);
+        }
+        return null;
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/b319365e/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/conf/DuplicateDataConfig.java
----------------------------------------------------------------------
diff --git a/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/conf/DuplicateDataConfig.java b/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/conf/DuplicateDataConfig.java
new file mode 100644
index 0000000..98f65c7
--- /dev/null
+++ b/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/conf/DuplicateDataConfig.java
@@ -0,0 +1,337 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.rya.indexing.smarturi.duplication.conf;
+
+import static java.util.Objects.requireNonNull;
+
+import java.text.NumberFormat;
+import java.text.ParseException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.configuration.ConfigurationException;
+import org.apache.commons.configuration.XMLConfiguration;
+import org.apache.rya.indexing.smarturi.duplication.Tolerance;
+import org.apache.rya.indexing.smarturi.duplication.ToleranceType;
+
+/**
+ * Configuration options for data duplication.
+ */
+public class DuplicateDataConfig {
+    public static final String DEFAULT_CONFIG_FILE_PATH = "conf/duplicate_data_detection_config.xml";
+
+    private Tolerance booleanTolerance;
+    private Tolerance byteTolerance;
+    private Tolerance dateTolerance;
+    private Tolerance doubleTolerance;
+    private Tolerance floatTolerance;
+    private Tolerance integerTolerance;
+    private Tolerance longTolerance;
+    private Tolerance shortTolerance;
+    private Tolerance stringTolerance;
+    private Tolerance uriTolerance;
+
+    private Map<String, List<String>> equivalentTermsMap;
+
+    private boolean isDetectionEnabled;
+
+    /**
+     * Creates a new instance of {@link DuplicateDataConfig} from the XML file
+     * at {@link #DEFAULT_CONFIG_FILE_PATH}.
+     * @throws ConfigurationException if the configuration could not be
+     * created or one of its tolerance values is invalid.
+     */
+    public DuplicateDataConfig() throws ConfigurationException {
+        this(new XMLConfiguration(DEFAULT_CONFIG_FILE_PATH));
+    }
+
+    /**
+     * Creates a new instance of {@link DuplicateDataConfig}.
+     * @param xmlFileLocation the config's XML file location. (not {@code null})
+     * @throws ConfigurationException if the configuration could not be
+     * created or one of its tolerance values is invalid.
+     */
+    public DuplicateDataConfig(final String xmlFileLocation) throws ConfigurationException {
+        this(new XMLConfiguration(requireNonNull(xmlFileLocation)));
+    }
+
+    /**
+     * Creates a new instance of {@link DuplicateDataConfig} by reading the
+     * tolerances, term mappings, and detection flag out of an XML
+     * configuration.
+     * @param xmlConfig the {@link XMLConfiguration} file. (not {@code null})
+     * @throws ConfigurationException if one of the configured tolerance values
+     * is invalid.
+     */
+    public DuplicateDataConfig(final XMLConfiguration xmlConfig) throws ConfigurationException {
+        requireNonNull(xmlConfig);
+
+        // Arguments are evaluated left to right, so the settings are read in
+        // the order they are listed here.
+        init(
+            parseTolerance("tolerances.booleanTolerance", xmlConfig),
+            parseTolerance("tolerances.byteTolerance", xmlConfig),
+            parseTolerance("tolerances.dateTolerance", xmlConfig),
+            parseTolerance("tolerances.doubleTolerance", xmlConfig),
+            parseTolerance("tolerances.floatTolerance", xmlConfig),
+            parseTolerance("tolerances.integerTolerance", xmlConfig),
+            parseTolerance("tolerances.longTolerance", xmlConfig),
+            parseTolerance("tolerances.shortTolerance", xmlConfig),
+            parseTolerance("tolerances.stringTolerance", xmlConfig),
+            parseTolerance("tolerances.uriTolerance", xmlConfig),
+            parseEquivalentTermsMap(xmlConfig),
+            xmlConfig.getBoolean("enableDetection", false));
+    }
+
+    /**
+     * Creates a new instance of {@link DuplicateDataConfig}.
+     * @param booleanTolerance the {@link Boolean} tolerance value or
+     * {@code null} if not specified.
+     * @param byteTolerance the {@link Byte} tolerance value or {@code null} if
+     * not specified.
+     * @param dateTolerance the {@code Date} tolerance value or {@code null} if
+     * not specified.
+     * @param doubleTolerance the {@link Double} tolerance value or {@code null}
+     * if not specified.
+     * @param floatTolerance the {@link Float} tolerance value or {@code null}
+     * if not specified.
+     * @param integerTolerance the {@link Integer} tolerance value or
+     * {@code null} if not specified.
+     * @param longTolerance the {@link Long} tolerance value or {@code null} if
+     * not specified.
+     * @param shortTolerance the {@link Short} tolerance value or {@code null}
+     * if not specified.
+     * @param stringTolerance the {@link String} tolerance value or {@code null}
+     * if not specified.
+     * @param uriTolerance the {@code URI} tolerance value or {@code null} if
+     * not specified.
+     * @param equivalentTermsMap the {@link Map} of terms that are considered
+     * equivalent to each other. (not {@code null})
+     * @param isDetectionEnabled {@code true} to enable detection. {@code false}
+     * to disable detection.
+     * @throws NullPointerException if {@code equivalentTermsMap} is
+     * {@code null}.
+     */
+    public DuplicateDataConfig(final Tolerance booleanTolerance, final Tolerance byteTolerance,
+        final Tolerance dateTolerance, final Tolerance doubleTolerance, final Tolerance floatTolerance,
+        final Tolerance integerTolerance, final Tolerance longTolerance, final Tolerance shortTolerance,
+        final Tolerance stringTolerance, final Tolerance uriTolerance, final Map<String, List<String>> equivalentTermsMap,
+        final boolean isDetectionEnabled)
+    {
+        init(booleanTolerance, byteTolerance, dateTolerance, doubleTolerance, floatTolerance, integerTolerance, longTolerance, shortTolerance, stringTolerance, uriTolerance, equivalentTermsMap, isDetectionEnabled);
+    }
+
+    /**
+     * Stores the supplied settings in this instance's fields. Shared by the
+     * constructors. Any tolerance may be {@code null}; only
+     * {@code equivalentTermsMap} is required to be non-{@code null}.
+     */
+    private void init(final Tolerance booleanTolerance, final Tolerance byteTolerance,
+        final Tolerance dateTolerance, final Tolerance doubleTolerance, final Tolerance floatTolerance,
+        final Tolerance integerTolerance, final Tolerance longTolerance, final Tolerance shortTolerance,
+        final Tolerance stringTolerance, final Tolerance uriTolerance, final Map<String, List<String>> equivalentTermsMap,
+        final boolean isDetectionEnabled)
+    {
+        this.booleanTolerance = booleanTolerance;
+        this.byteTolerance = byteTolerance;
+        this.dateTolerance= dateTolerance;
+        this.doubleTolerance = doubleTolerance;
+        this.floatTolerance = floatTolerance;
+        this.integerTolerance = integerTolerance;
+        this.longTolerance = longTolerance;
+        this.shortTolerance = shortTolerance;
+        this.stringTolerance = stringTolerance;
+        this.uriTolerance = uriTolerance;
+        // The map reference is stored as-is (no copy is made).
+        this.equivalentTermsMap = requireNonNull(equivalentTermsMap);
+        this.isDetectionEnabled = isDetectionEnabled;
+    }
+
+    /**
+     * Reads the {@code <key>.type} and {@code <key>.value} settings and
+     * builds the matching {@link Tolerance}.
+     * @param key the configuration key prefix of the tolerance.
+     * @param xmlConfig the {@link XMLConfiguration} to read from.
+     * @return the {@link Tolerance} or {@code null} if the type is blank or
+     * the value is not specified.
+     * @throws ConfigurationException if the value is negative, a percentage
+     * is greater than 1 (100%), or a value containing "%" can not be parsed.
+     */
+    private static Tolerance parseTolerance(final String key, final XMLConfiguration xmlConfig) throws ConfigurationException {
+        final String type = xmlConfig.getString(key + ".type", null);
+        // Returns null for a blank type; an unrecognized, non-blank type name
+        // makes the lookup throw IllegalArgumentException.
+        final ToleranceType toleranceType = ToleranceType.getToleranceTypeByName(type);
+        Double doubleValue = null;
+        if (toleranceType != null) {
+            switch (toleranceType) {
+                case PERCENTAGE:
+                    // A percentage may be written either as "5%" or as the
+                    // fraction 0.05.
+                    final String value = xmlConfig.getString(key + ".value", null);
+                    if (value != null && value.contains("%")) {
+                        try {
+                            // NOTE(review): the percent format of the default
+                            // locale is used here - confirm that is intended
+                            // for config files.
+                            final Number number = NumberFormat.getPercentInstance().parse(value);
+                            doubleValue = number.doubleValue();
+                        } catch (final ParseException e) {
+                            throw new ConfigurationException(e);
+                        }
+                    } else {
+                        doubleValue = xmlConfig.getDouble(key + ".value", null);
+                    }
+                    if (doubleValue != null) {
+                        // Zero is accepted; only negative values are rejected.
+                        if (doubleValue < 0) {
+                            throw new ConfigurationException("The " + toleranceType + " tolerance type for \"" + key + "\" must be a positive value. Found this value: " + doubleValue);
+                        }
+                        if (doubleValue > 1) {
+                            throw new ConfigurationException("The " + toleranceType + " tolerance type for \"" + key + "\" can NOT be greater than 100%. Found this value: " + doubleValue);
+                        }
+                    }
+                    break;
+                case DIFFERENCE:
+                    doubleValue = xmlConfig.getDouble(key + ".value", null);
+                    // Zero is accepted; only negative values are rejected.
+                    if (doubleValue != null && doubleValue < 0) {
+                        throw new ConfigurationException("The " + toleranceType + " tolerance type for \"" + key + "\" must be a positive value. Found this value: " + doubleValue);
+                    }
+                    break;
+                default:
+                    throw new ConfigurationException("Unknown Tolerance Type specified in config for <" + type + ">: " + toleranceType);
+            }
+            if (doubleValue != null) {
+                return new Tolerance(doubleValue, toleranceType);
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Builds the map of terms to their equivalent terms from every
+     * {@code termMappings.termMapping} element in the configuration.
+     * @param xmlConfig the {@link XMLConfiguration} to read from.
+     * @return the (possibly empty) {@link Map} of equivalent terms in the
+     * order they were read.
+     */
+    private static Map<String, List<String>> parseEquivalentTermsMap(final XMLConfiguration xmlConfig) {
+        final Map<String, List<String>> termsByKey = new LinkedHashMap<>();
+        final Object termProperty = xmlConfig.getProperty("termMappings.termMapping.term");
+        if (termProperty == null) {
+            // No term mappings were configured
+            return termsByKey;
+        }
+        if (!(termProperty instanceof Collection)) {
+            // A single mapping is addressed without an index
+            parseTermMapping("termMappings.termMapping", xmlConfig, termsByKey);
+            return termsByKey;
+        }
+        // Several mappings: address each one by its index
+        final int mappingCount = ((Collection<?>) termProperty).size();
+        int index = 0;
+        while (index < mappingCount) {
+            parseTermMapping("termMappings.termMapping(" + index + ")", xmlConfig, termsByKey);
+            index++;
+        }
+        return termsByKey;
+    }
+
+    /**
+     * Reads one term mapping element and, if it has both a term and at least
+     * one equivalent, adds it to the supplied map.
+     * @param termElement the configuration key of the term mapping element.
+     * @param xmlConfig the {@link XMLConfiguration} to read from.
+     * @param equivalentTermsMap the {@link Map} the parsed mapping is added
+     * to.
+     */
+    private static void parseTermMapping(final String termElement, final XMLConfiguration xmlConfig, final Map<String, List<String>> equivalentTermsMap) {
+        final String term = xmlConfig.getString(termElement + ".term");
+        if (term == null) {
+            // Nothing to map the equivalents to
+            return;
+        }
+        final String equivalentKey = termElement + ".equivalents.equivalent";
+        // getProperty() must be used to detect multiple values. getString()
+        // always yields a single String, so a Collection check on its result
+        // could never succeed and only the first equivalent would be kept.
+        final Object equivalentProp = xmlConfig.getProperty(equivalentKey);
+        final List<String> equivalents = new ArrayList<>();
+        if (equivalentProp instanceof Collection) {
+            final int equivalentSize = ((Collection<?>) equivalentProp).size();
+            for (int j = 0; j < equivalentSize; j++) {
+                final String equivalent = xmlConfig.getString(equivalentKey + "(" + j + ")");
+                if (equivalent != null) {
+                    equivalents.add(equivalent);
+                }
+            }
+        } else {
+            final String equivalent = xmlConfig.getString(equivalentKey);
+            if (equivalent != null) {
+                equivalents.add(equivalent);
+            }
+        }
+        if (!equivalents.isEmpty()) {
+            equivalentTermsMap.put(term, equivalents);
+        }
+    }
+
+    /**
+     * Gets the tolerance configured for {@link Boolean} values.
+     * @return the {@link Boolean} {@link Tolerance}, or {@code null} if not
+     * specified.
+     */
+    public Tolerance getBooleanTolerance() {
+        return this.booleanTolerance;
+    }
+
+    /**
+     * Gets the tolerance configured for {@link Byte} values.
+     * @return the {@link Byte} {@link Tolerance}, or {@code null} if not
+     * specified.
+     */
+    public Tolerance getByteTolerance() {
+        return this.byteTolerance;
+    }
+
+    /**
+     * Gets the tolerance configured for {@link Date} values.
+     * @return the {@link Date} {@link Tolerance}, or {@code null} if not
+     * specified.
+     */
+    public Tolerance getDateTolerance() {
+        return this.dateTolerance;
+    }
+
+    /**
+     * Gets the tolerance configured for {@link Double} values.
+     * @return the {@link Double} {@link Tolerance}, or {@code null} if not
+     * specified.
+     */
+    public Tolerance getDoubleTolerance() {
+        return this.doubleTolerance;
+    }
+
+    /**
+     * Gets the tolerance configured for {@link Float} values.
+     * @return the {@link Float} {@link Tolerance}, or {@code null} if not
+     * specified.
+     */
+    public Tolerance getFloatTolerance() {
+        return this.floatTolerance;
+    }
+
+    /**
+     * Gets the tolerance configured for {@link Integer} values.
+     * @return the {@link Integer} {@link Tolerance}, or {@code null} if not
+     * specified.
+     */
+    public Tolerance getIntegerTolerance() {
+        return this.integerTolerance;
+    }
+
+    /**
+     * Gets the tolerance configured for {@link Long} values.
+     * @return the {@link Long} {@link Tolerance}, or {@code null} if not
+     * specified.
+     */
+    public Tolerance getLongTolerance() {
+        return this.longTolerance;
+    }
+
+    /**
+     * Gets the tolerance configured for {@link Short} values.
+     * @return the {@link Short} {@link Tolerance}, or {@code null} if not
+     * specified.
+     */
+    public Tolerance getShortTolerance() {
+        return this.shortTolerance;
+    }
+
+    /**
+     * Gets the tolerance configured for {@link String} values.
+     * @return the {@link String} {@link Tolerance}, or {@code null} if not
+     * specified.
+     */
+    public Tolerance getStringTolerance() {
+        return this.stringTolerance;
+    }
+
+    /**
+     * Gets the tolerance configured for {@link URI} values.
+     * @return the {@link URI} {@link Tolerance}, or {@code null} if not
+     * specified.
+     */
+    public Tolerance getUriTolerance() {
+        return this.uriTolerance;
+    }
+
+    /**
+     * Gets the mapping of terms to the terms treated as equivalent to them.
+     * @return the {@link Map} whose keys are terms and whose values are the
+     * {@link List}s of terms considered equivalent to each key.
+     */
+    public Map<String, List<String>> getEquivalentTermsMap() {
+        return this.equivalentTermsMap;
+    }
+
+    /**
+     * Indicates whether detection is turned on.
+     * @return {@code true} if detection is enabled. {@code false} if
+     * detection is disabled.
+     */
+    public boolean isDetectionEnabled() {
+        return this.isDetectionEnabled;
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/b319365e/extras/indexing/src/test/java/org/apache/rya/indexing/mongo/MongoDbSmartUriTest.java
----------------------------------------------------------------------
diff --git a/extras/indexing/src/test/java/org/apache/rya/indexing/mongo/MongoDbSmartUriTest.java b/extras/indexing/src/test/java/org/apache/rya/indexing/mongo/MongoDbSmartUriTest.java
index 60efbed..dff271f 100644
--- a/extras/indexing/src/test/java/org/apache/rya/indexing/mongo/MongoDbSmartUriTest.java
+++ b/extras/indexing/src/test/java/org/apache/rya/indexing/mongo/MongoDbSmartUriTest.java
@@ -245,7 +245,6 @@ public class MongoDbSmartUriTest {
         final Entity resultEntity = SmartUriAdapter.deserializeUriEntity(smartUri);
         System.out.println(resultEntity);
         assertEquals(BOB_ENTITY.getSubject(), resultEntity.getSubject());
-        //assertTrue(Paths.get(BOB_ENTITY.getSubject().getData()).equals(Paths.get(resultEntity.getSubject().getData())));
     }
 
     @Test