You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@rya.apache.org by ca...@apache.org on 2017/08/21 20:40:35 UTC
[2/3] incubator-rya git commit: RYA-250 Added data duplication
detection methods to Smart URI/Entities. These use configured tolerances for
each data type to decide if an Entity is considered nearly equal. Also,
string terms that are considered equival
http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/b319365e/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/DuplicateDataDetector.java
----------------------------------------------------------------------
diff --git a/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/DuplicateDataDetector.java b/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/DuplicateDataDetector.java
new file mode 100644
index 0000000..220db30
--- /dev/null
+++ b/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/DuplicateDataDetector.java
@@ -0,0 +1,1066 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.rya.indexing.smarturi.duplication;
+
+import static java.util.Objects.requireNonNull;
+
+import java.math.BigDecimal;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Objects;
+import java.util.Optional;
+import java.util.Set;
+import java.util.TreeSet;
+
+import org.apache.commons.configuration.ConfigurationException;
+import org.apache.commons.lang.StringUtils;
+import org.apache.rya.api.domain.RyaType;
+import org.apache.rya.api.domain.RyaURI;
+import org.apache.rya.api.resolver.impl.DateTimeRyaTypeResolver;
+import org.apache.rya.indexing.entity.model.Entity;
+import org.apache.rya.indexing.entity.model.Property;
+import org.apache.rya.indexing.smarturi.SmartUriAdapter;
+import org.apache.rya.indexing.smarturi.SmartUriException;
+import org.apache.rya.indexing.smarturi.duplication.conf.DuplicateDataConfig;
+import org.calrissian.mango.types.exception.TypeEncodingException;
+import org.joda.time.DateTime;
+import org.openrdf.model.URI;
+import org.openrdf.model.impl.URIImpl;
+import org.openrdf.model.vocabulary.XMLSchema;
+
+import com.google.common.collect.ImmutableMap;
+
+/**
+ * Detects if two entities contain data that's nearly identical based on a set
+ * tolerance for each field's type. Two entities are considered nearly
+ * identical if all their properties are equal and/or within the specified
+ * tolerance for the property's object type. Setting all object type tolerances
+ * to 0 means that the objects need to be exactly equal to each other to be
+ * considered duplicates. Duplicate data detection can be enabled/disabled
+ * through configuration and each object type can have a tolerance based on
+ * either the difference or the percentage difference between the objects being
+ * compared.
+ */
+public class DuplicateDataDetector {
+ private final Map<URI, ApproxEqualsDetector<?>> uriMap = new HashMap<>();
+ private final Map<Class<?>, ApproxEqualsDetector<?>> classMap = new HashMap<>();
+
+ private boolean isDetectionEnabled;
+
+ /**
+ * Creates a new instance of {@link DuplicateDataDetector} using the
+ * per-type tolerances, the equivalent terms map, and the detection enabled
+ * flag read from the supplied configuration.
+ * @param duplicateDataConfig the {@link DuplicateDataConfig} to read the
+ * settings from. (not {@code null})
+ */
+ public DuplicateDataDetector(final DuplicateDataConfig duplicateDataConfig) {
+ this(duplicateDataConfig.getBooleanTolerance(),
+ duplicateDataConfig.getByteTolerance(),
+ duplicateDataConfig.getDateTolerance(),
+ duplicateDataConfig.getDoubleTolerance(),
+ duplicateDataConfig.getFloatTolerance(),
+ duplicateDataConfig.getIntegerTolerance(),
+ duplicateDataConfig.getLongTolerance(),
+ duplicateDataConfig.getShortTolerance(),
+ duplicateDataConfig.getStringTolerance(),
+ duplicateDataConfig.getUriTolerance(),
+ duplicateDataConfig.getEquivalentTermsMap(),
+ duplicateDataConfig.isDetectionEnabled()
+ );
+ }
+
+ /**
+ * Creates a new instance of {@link DuplicateDataDetector} with the values
+ * from a default-constructed {@link DuplicateDataConfig}.
+ * @throws ConfigurationException if thrown while creating the
+ * {@link DuplicateDataConfig}.
+ */
+ public DuplicateDataDetector() throws ConfigurationException {
+ this(new DuplicateDataConfig());
+ }
+
+ /**
+ * Creates a new instance of {@link DuplicateDataDetector} that applies the
+ * same {@link ToleranceType#DIFFERENCE} tolerance to every type and uses
+ * an empty equivalent terms map.
+ * @param tolerance the difference tolerance value to assign to all types.
+ */
+ public DuplicateDataDetector(final double tolerance) {
+ this(new Tolerance(tolerance, ToleranceType.DIFFERENCE), new LinkedHashMap<>());
+ }
+
+ /**
+ * Creates a new instance of {@link DuplicateDataDetector} that applies the
+ * same tolerance to every type and has detection enabled.
+ * @param tolerance the {@link Tolerance} to assign to all types.
+ * @param equivalentTermsMap the {@link Map} of terms that are considered
+ * equivalent to each other. (not {@code null})
+ */
+ public DuplicateDataDetector(final Tolerance tolerance, final Map<String, List<String>> equivalentTermsMap) {
+ this(tolerance, tolerance, tolerance, tolerance, tolerance,
+ tolerance, tolerance, tolerance, tolerance, tolerance , equivalentTermsMap, true);
+ }
+
+ /**
+ * Creates a new instance of {@link DuplicateDataDetector} with an
+ * individual tolerance for each supported type. All arguments are passed
+ * straight through to {@code init}.
+ * @param booleanTolerance the {@link Boolean} tolerance value or
+ * {@code null} if not specified.
+ * @param byteTolerance the {@link Byte} tolerance value or {@code null} if
+ * not specified.
+ * @param dateTolerance the {@link Date} tolerance value or {@code null} if
+ * not specified.
+ * @param doubleTolerance the {@link Double} tolerance value or {@code null}
+ * if not specified.
+ * @param floatTolerance the {@link Float} tolerance value or {@code null}
+ * if not specified.
+ * @param integerTolerance the {@link Integer} tolerance value or
+ * {@code null} if not specified.
+ * @param longTolerance the {@link Long} tolerance value or {@code null} if
+ * not specified.
+ * @param shortTolerance the {@link Short} tolerance value or {@code null}
+ * if not specified.
+ * @param stringTolerance the {@link String} tolerance value or {@code null}
+ * if not specified.
+ * @param uriTolerance the {@link URI} tolerance value or {@code null} if
+ * not specified.
+ * @param equivalentTermsMap the {@link Map} of terms that are considered
+ * equivalent to each other. (not {@code null})
+ * @param isDetectionEnabled {@code true} to enable detection. {@code false}
+ * to disable detection.
+ */
+ public DuplicateDataDetector(final Tolerance booleanTolerance, final Tolerance byteTolerance,
+ final Tolerance dateTolerance, final Tolerance doubleTolerance, final Tolerance floatTolerance,
+ final Tolerance integerTolerance, final Tolerance longTolerance, final Tolerance shortTolerance,
+ final Tolerance stringTolerance, final Tolerance uriTolerance, final Map<String, List<String>> equivalentTermsMap,
+ final boolean isDetectionEnabled)
+ {
+ init(booleanTolerance, byteTolerance, dateTolerance, doubleTolerance, floatTolerance,
+ integerTolerance, longTolerance, shortTolerance, stringTolerance, uriTolerance, equivalentTermsMap, isDetectionEnabled);
+ }
+
+ private void init(final Tolerance booleanTolerance, final Tolerance byteTolerance,
+ final Tolerance dateTolerance, final Tolerance doubleTolerance, final Tolerance floatTolerance,
+ final Tolerance integerTolerance, final Tolerance longTolerance, final Tolerance shortTolerance,
+ final Tolerance stringTolerance, final Tolerance uriTolerance, final Map<String, List<String>> equivalentTermsMap,
+ final boolean isDetectionEnabled)
+ {
+ final List<ApproxEqualsDetector<?>> detectors = new ArrayList<>();
+ detectors.add(new BooleanApproxEqualsDetector(booleanTolerance));
+ detectors.add(new ByteApproxEqualsDetector(byteTolerance));
+ detectors.add(new DateApproxEqualsDetector(dateTolerance));
+ detectors.add(new DateTimeApproxEqualsDetector(dateTolerance));
+ detectors.add(new DoubleApproxEqualsDetector(doubleTolerance));
+ detectors.add(new FloatApproxEqualsDetector(floatTolerance));
+ detectors.add(new IntegerApproxEqualsDetector(integerTolerance));
+ detectors.add(new LongApproxEqualsDetector(longTolerance));
+ detectors.add(new ShortApproxEqualsDetector(shortTolerance));
+ detectors.add(new StringApproxEqualsDetector(stringTolerance, equivalentTermsMap));
+ detectors.add(new UriApproxEqualsDetector(uriTolerance));
+
+ for (final ApproxEqualsDetector<?> approxEqualsDetector : detectors) {
+ uriMap.put(approxEqualsDetector.getXmlSchemaUri(), approxEqualsDetector);
+ classMap.put(approxEqualsDetector.getTypeClass(), approxEqualsDetector);
+ }
+
+ this.isDetectionEnabled = isDetectionEnabled;
+ }
+
+ /**
+ * @return {@code true} if duplicate data detection is enabled.
+ * {@code false} if it is disabled.
+ */
+ public boolean isDetectionEnabled() {
+ return isDetectionEnabled;
+ }
+
+ /**
+  * Removes any duplicate (nearly identical) entities from the collection
+  * of entities. The list is modified in place. For each group of
+  * duplicates the entity with the lowest index is kept. An entity that has
+  * already been marked as a duplicate is never used as the reference for
+  * further comparisons, so an entity is only removed when it is nearly
+  * identical to an entity that stays in the list.
+  * @param entities the {@link List} of {@link Entity}s. (not {@code null})
+  * @throws SmartUriException if two entities could not be compared.
+  */
+ public void removeDuplicatesFromCollection(final List<Entity> entities) throws SmartUriException {
+     requireNonNull(entities);
+     // Use a Sorted Set in reverse order to hold the indices so they can
+     // be removed from the back of the list without shifting the ones
+     // that are still pending.
+     final Set<Integer> indicesToRemove = new TreeSet<>((a, b) -> Integer.compare(b, a));
+     final int size = entities.size();
+     // Compare all entities to each other while avoiding making the same
+     // comparisons again and not comparing an entity to itself.
+     for (int i = 0; i < size - 1; i++) {
+         if (indicesToRemove.contains(i)) {
+             // Already a duplicate of an earlier entity. Near equality is
+             // not transitive so it must not cause further removals.
+             continue;
+         }
+         final Entity entity1 = entities.get(i);
+         for (int j = size - 1; j > i; j--) {
+             if (indicesToRemove.contains(j)) {
+                 // Already marked, no need to compare it again.
+                 continue;
+             }
+             if (compareEntities(entity1, entities.get(j))) {
+                 indicesToRemove.add(j);
+             }
+         }
+     }
+     // Indices are iterated in descending order. The primitive loop
+     // variable selects List.remove(int) rather than remove(Object).
+     for (final int index : indicesToRemove) {
+         entities.remove(index);
+     }
+ }
+
+ /**
+  * Deserializes two Smart URIs into entities and checks whether those
+  * entities hold nearly identical data.
+  * @param uri1 the first Smart {@link URI}. (not {@code null})
+  * @param uri2 the second Smart {@link URI}. (not {@code null})
+  * @return {@code true} if the two Smart URI's have nearly identical data.
+  * {@code false} otherwise.
+  * @throws SmartUriException
+  */
+ public boolean compareSmartUris(final URI uri1, final URI uri2) throws SmartUriException {
+     // Validate both arguments before any deserialization is attempted.
+     requireNonNull(uri1);
+     requireNonNull(uri2);
+     return compareEntities(
+         SmartUriAdapter.deserializeUriEntity(uri1),
+         SmartUriAdapter.deserializeUriEntity(uri2));
+ }
+
+ /**
+  * Compares two entities to determine if they have nearly identical data.
+  * The entities must have the same explicit type ids, and every property
+  * of {@code entity1} must exist in {@code entity2} with a value that the
+  * detector for the property's data type considers approximately equal.
+  * Properties that only exist in {@code entity2} are not examined.
+  * @param entity1 the first {@link Entity}. (not {@code null})
+  * @param entity2 the second {@link Entity}. (not {@code null})
+  * @return {@code true} if the two entities have nearly identical data.
+  * {@code false} otherwise.
+  * @throws SmartUriException if no detector is registered for the data
+  * type of one of {@code entity1}'s property values.
+  */
+ public boolean compareEntities(final Entity entity1, final Entity entity2) throws SmartUriException {
+     requireNonNull(entity1);
+     requireNonNull(entity2);
+
+     final List<RyaURI> types1 = entity1.getExplicitTypeIds();
+     final List<RyaURI> types2 = entity2.getExplicitTypeIds();
+     // Check both directions. A one-way containsAll would accept an
+     // entity whose types are only a subset of the other's.
+     final boolean doBothHaveSameTypes = types1.containsAll(types2) && types2.containsAll(types1);
+     if (!doBothHaveSameTypes) {
+         return false;
+     }
+
+     for (final Entry<RyaURI, ImmutableMap<RyaURI, Property>> entry : entity1.getProperties().entrySet()) {
+         final RyaURI typeIdUri = entry.getKey();
+         for (final Entry<RyaURI, Property> typeProperty : entry.getValue().entrySet()) {
+             final RyaURI propertyNameUri = typeProperty.getKey();
+             final Optional<Property> p2 = entity2.lookupTypeProperty(typeIdUri, propertyNameUri);
+             if (!p2.isPresent()) {
+                 // entity2 is missing a property that entity1 has.
+                 return false;
+             }
+             final RyaType value1 = typeProperty.getValue().getValue();
+             final RyaType value2 = p2.get().getValue();
+             // NOTE(review): the detector is chosen from value1's data
+             // type only; value2's data type is never checked.
+             final URI xmlSchemaUri1 = value1.getDataType();
+             final ApproxEqualsDetector<?> approxEqualsDetector = uriMap.get(xmlSchemaUri1);
+             if (approxEqualsDetector == null) {
+                 throw new SmartUriException("No appropriate detector found for the type: " + xmlSchemaUri1);
+             }
+             if (!approxEqualsDetector.areApproxEquals(value1.getData(), value2.getData())) {
+                 return false;
+             }
+         }
+     }
+     return true;
+ }
+
+ /**
+ * Gets the {@link ApproxEqualsDetector} that was registered for the
+ * specified class. The lookup is an exact match on the class key.
+ * @param clazz the {@link Class} to find an {@link ApproxEqualsDetector}
+ * for.
+ * @return the {@link ApproxEqualsDetector} for the class or {@code null} if
+ * none could be found.
+ */
+ public ApproxEqualsDetector<?> getDetectorForType(final Class<?> clazz) {
+ return classMap.get(clazz);
+ }
+
+ private static boolean isOnlyOneNull(final Object lhs, final Object rhs) {
+ return (lhs == null && rhs != null) || (lhs != null && rhs == null);
+ }
+
+ /**
+  * Detector that decides whether two {@link Boolean} values count as
+  * approximately equal.
+  */
+ public static class BooleanApproxEqualsDetector implements ApproxEqualsDetector<Boolean> {
+     private static final Tolerance DEFAULT_TOLERANCE = new Tolerance(0.0, ToleranceType.DIFFERENCE);
+     private final Tolerance tolerance;
+
+     /**
+      * Creates a new instance of {@link BooleanApproxEqualsDetector}.
+      * @param tolerance the {@link Tolerance} to apply, or {@code null} to
+      * fall back to the default tolerance.
+      */
+     public BooleanApproxEqualsDetector(final Tolerance tolerance) {
+         if (tolerance != null) {
+             this.tolerance = tolerance;
+         } else {
+             this.tolerance = getDefaultTolerance();
+         }
+     }
+
+     @Override
+     public boolean areObjectsApproxEquals(final Boolean lhs, final Boolean rhs) {
+         if (tolerance.getValue() == 0) {
+             // No tolerance: only exact equality counts.
+             return Objects.equals(lhs, rhs);
+         }
+         // Any non-zero tolerance makes every pair of booleans match.
+         return true;
+     }
+
+     @Override
+     public Tolerance getDefaultTolerance() {
+         return DEFAULT_TOLERANCE;
+     }
+
+     @Override
+     public Boolean convertStringToObject(final String string) throws SmartUriException {
+         return Boolean.valueOf(string);
+     }
+
+     @Override
+     public Class<?> getTypeClass() {
+         return Boolean.class;
+     }
+
+     @Override
+     public URI getXmlSchemaUri() {
+         return XMLSchema.BOOLEAN;
+     }
+ }
+
+ /**
+ * Class to detect if two bytes are considered approximately equal to each
+ * other.
+ */
+ public static class ByteApproxEqualsDetector implements ApproxEqualsDetector<Byte> {
+ private static final Tolerance DEFAULT_TOLERANCE = new Tolerance(0.0, ToleranceType.DIFFERENCE);
+ private final Tolerance tolerance;
+
+ /**
+ * Creates a new instance of {@link ByteApproxEqualsDetector}.
+ * @param tolerance the {@link Tolerance}.
+ */
+ public ByteApproxEqualsDetector(final Tolerance tolerance) {
+ this.tolerance = tolerance != null ? tolerance : getDefaultTolerance();
+ }
+
+ @Override
+ public boolean areObjectsApproxEquals(final Byte lhs, final Byte rhs) {
+ if (isOnlyOneNull(lhs, rhs)) {
+ return false;
+ }
+ if (Objects.equals(lhs, rhs)) {
+ // They're exactly equals so get out
+ return true;
+ } else if (tolerance.getValue() == 0) {
+ // If they're not exactly equals with zero tolerance then get out
+ return false;
+ }
+ // Check based on tolerance
+ switch (tolerance.getToleranceType()) {
+ case PERCENTAGE:
+ if (lhs == 0) {
+ return lhs == rhs;
+ }
+ if (tolerance.getValue() >= 1) {
+ return true;
+ }
+ return ((double)Math.abs(lhs - rhs) / lhs) <= tolerance.getValue();
+ case DIFFERENCE:
+ default:
+ return Math.abs(lhs - rhs) <= tolerance.getValue();
+ }
+ }
+
+ @Override
+ public Tolerance getDefaultTolerance() {
+ return DEFAULT_TOLERANCE;
+ }
+
+ @Override
+ public Byte convertStringToObject(final String string) throws SmartUriException {
+ return Byte.valueOf(string);
+ }
+
+ @Override
+ public Class<?> getTypeClass() {
+ return Byte.class;
+ }
+
+ @Override
+ public URI getXmlSchemaUri() {
+ return XMLSchema.BYTE;
+ }
+ }
+
+ /**
+  * Class to detect if two dates are considered approximately equal to each
+  * other. Dates are compared by their epoch millisecond values.
+  */
+ public static class DateApproxEqualsDetector implements ApproxEqualsDetector<Date> {
+     private static final Tolerance DEFAULT_TOLERANCE = new Tolerance(500.0, ToleranceType.DIFFERENCE); // milliseconds
+     private final Tolerance tolerance;
+
+     /**
+      * Creates a new instance of {@link DateApproxEqualsDetector}.
+      * @param tolerance the {@link Tolerance}, or {@code null} to use the
+      * default tolerance.
+      */
+     public DateApproxEqualsDetector(final Tolerance tolerance) {
+         this.tolerance = tolerance != null ? tolerance : getDefaultTolerance();
+     }
+
+     /**
+      * @param lhs the left-hand date; percentages are relative to it.
+      * @param rhs the right-hand date.
+      * @return {@code true} if both dates are equal (or both
+      * {@code null}) or differ by no more than the tolerance.
+      */
+     @Override
+     public boolean areObjectsApproxEquals(final Date lhs, final Date rhs) {
+         if (isOnlyOneNull(lhs, rhs)) {
+             return false;
+         }
+         if (Objects.equals(lhs, rhs)) {
+             // They're exactly equals so get out
+             return true;
+         } else if (tolerance.getValue() == 0) {
+             // If they're not exactly equals with zero tolerance then get out
+             return false;
+         }
+         // Check based on tolerance
+         final long lhsTime = lhs.getTime();
+         final long rhsTime = rhs.getTime();
+         final long absDiff = Math.abs(lhsTime - rhsTime);
+         switch (tolerance.getToleranceType()) {
+             case PERCENTAGE:
+                 if (lhsTime == 0) {
+                     // The dates differ, so rhs cannot be within a
+                     // percentage of the epoch itself.
+                     return false;
+                 }
+                 if (tolerance.getValue() >= 1) {
+                     return true;
+                 }
+                 // Divide by the magnitude; times before the epoch are
+                 // negative and would otherwise always pass.
+                 return ((double) absDiff / Math.abs((double) lhsTime)) <= tolerance.getValue();
+             case DIFFERENCE:
+             default:
+                 return absDiff <= tolerance.getValue();
+         }
+     }
+
+     @Override
+     public Tolerance getDefaultTolerance() {
+         return DEFAULT_TOLERANCE;
+     }
+
+     /**
+      * Parses the string with the XML datetime parser and converts the
+      * result to a {@link Date}.
+      * @throws SmartUriException if the string could not be parsed.
+      */
+     @Override
+     public Date convertStringToObject(final String string) throws SmartUriException {
+         final DateTime dateTime;
+         try {
+             dateTime = DateTime.parse(string, DateTimeRyaTypeResolver.XMLDATETIME_PARSER);
+         } catch (final TypeEncodingException e) {
+             throw new SmartUriException("Exception occurred serializing data[" + string + "]", e);
+         } catch (final IllegalArgumentException e) {
+             // Joda-Time reports unparsable text with this exception.
+             throw new SmartUriException("Exception occurred serializing data[" + string + "]", e);
+         }
+         return dateTime.toDate();
+     }
+
+     @Override
+     public Class<?> getTypeClass() {
+         return Date.class;
+     }
+
+     @Override
+     public URI getXmlSchemaUri() {
+         return XMLSchema.DATE;
+     }
+ }
+
+ /**
+  * Class to detect if two datetimes are considered approximately equal to
+  * each other. Datetimes are compared by their epoch millisecond values.
+  */
+ public static class DateTimeApproxEqualsDetector implements ApproxEqualsDetector<DateTime> {
+     private static final Tolerance DEFAULT_TOLERANCE = new Tolerance(500.0, ToleranceType.DIFFERENCE); // milliseconds
+     private final Tolerance tolerance;
+
+     /**
+      * Creates a new instance of {@link DateTimeApproxEqualsDetector}.
+      * @param tolerance the {@link Tolerance}, or {@code null} to use the
+      * default tolerance.
+      */
+     public DateTimeApproxEqualsDetector(final Tolerance tolerance) {
+         this.tolerance = tolerance != null ? tolerance : getDefaultTolerance();
+     }
+
+     /**
+      * @param lhs the left-hand datetime; percentages are relative to it.
+      * @param rhs the right-hand datetime.
+      * @return {@code true} if both datetimes are equal (or both
+      * {@code null}) or differ by no more than the tolerance.
+      */
+     @Override
+     public boolean areObjectsApproxEquals(final DateTime lhs, final DateTime rhs) {
+         if (isOnlyOneNull(lhs, rhs)) {
+             return false;
+         }
+         if (Objects.equals(lhs, rhs)) {
+             // They're exactly equals so get out
+             return true;
+         } else if (tolerance.getValue() == 0) {
+             // If they're not exactly equals with zero tolerance then get out
+             return false;
+         }
+         // Check based on tolerance
+         final long lhsTime = lhs.getMillis();
+         final long rhsTime = rhs.getMillis();
+         final long absDiff = Math.abs(lhsTime - rhsTime);
+         switch (tolerance.getToleranceType()) {
+             case PERCENTAGE:
+                 if (lhsTime == 0) {
+                     // The datetimes differ, so rhs cannot be within a
+                     // percentage of the epoch itself.
+                     return false;
+                 }
+                 if (tolerance.getValue() >= 1) {
+                     return true;
+                 }
+                 // Divide by the magnitude; times before the epoch are
+                 // negative and would otherwise always pass.
+                 return ((double) absDiff / Math.abs((double) lhsTime)) <= tolerance.getValue();
+             case DIFFERENCE:
+             default:
+                 return absDiff <= tolerance.getValue();
+         }
+     }
+
+     @Override
+     public Tolerance getDefaultTolerance() {
+         return DEFAULT_TOLERANCE;
+     }
+
+     /**
+      * Parses the string with the XML datetime parser.
+      * @throws SmartUriException if the string could not be parsed.
+      */
+     @Override
+     public DateTime convertStringToObject(final String string) throws SmartUriException {
+         try {
+             return DateTime.parse(string, DateTimeRyaTypeResolver.XMLDATETIME_PARSER);
+         } catch (final TypeEncodingException e) {
+             throw new SmartUriException("Exception occurred serializing data[" + string + "]", e);
+         } catch (final IllegalArgumentException e) {
+             // Joda-Time reports unparsable text with this exception.
+             throw new SmartUriException("Exception occurred serializing data[" + string + "]", e);
+         }
+     }
+
+     @Override
+     public Class<?> getTypeClass() {
+         return DateTime.class;
+     }
+
+     @Override
+     public URI getXmlSchemaUri() {
+         return XMLSchema.DATETIME;
+     }
+ }
+
+ /**
+  * Class to detect if two doubles are considered approximately equal to each
+  * other, either by absolute difference or by a percentage of the
+  * magnitude of the left-hand value.
+  */
+ public static class DoubleApproxEqualsDetector implements ApproxEqualsDetector<Double> {
+     private static final Tolerance DEFAULT_TOLERANCE = new Tolerance(0.0001, ToleranceType.PERCENTAGE);
+     private final Tolerance tolerance;
+
+     /**
+      * Creates a new instance of {@link DoubleApproxEqualsDetector}.
+      * @param tolerance the {@link Tolerance}, or {@code null} to use the
+      * default tolerance.
+      */
+     public DoubleApproxEqualsDetector(final Tolerance tolerance) {
+         this.tolerance = tolerance != null ? tolerance : getDefaultTolerance();
+     }
+
+     /**
+      * @param lhs the left-hand value; percentages are relative to it.
+      * @param rhs the right-hand value.
+      * @return {@code true} if both values are equal (or both
+      * {@code null}) or differ by no more than the tolerance. NaN and
+      * infinite values only ever match on exact equality.
+      */
+     @Override
+     public boolean areObjectsApproxEquals(final Double lhs, final Double rhs) {
+         if (isOnlyOneNull(lhs, rhs)) {
+             return false;
+         }
+         if (Objects.equals(lhs, rhs)) {
+             // They're exactly equals so get out
+             return true;
+         } else if (tolerance.getValue() == 0) {
+             // If they're not exactly equals with zero tolerance then get out
+             return false;
+         }
+         final double left = lhs;
+         final double right = rhs;
+         if (!Double.isFinite(left) || !Double.isFinite(right)) {
+             // NaN and infinities have no meaningful distance, and
+             // BigDecimal cannot represent them.
+             return false;
+         }
+         // Doubles can be unpredictable with how they store a value
+         // like 0.1. So use BigDecimal with its String constructor
+         // to make things more predictable.
+         final BigDecimal lhsBd = new BigDecimal(String.valueOf(left));
+         final BigDecimal rhsBd = new BigDecimal(String.valueOf(right));
+         final BigDecimal absDiff = lhsBd.subtract(rhsBd).abs();
+         switch (tolerance.getToleranceType()) {
+             case PERCENTAGE:
+                 if (left == 0) {
+                     // The values differ, and nothing but zero is within
+                     // a percentage of zero.
+                     return false;
+                 }
+                 if (tolerance.getValue() >= 1) {
+                     return true;
+                 }
+                 try {
+                     // Divide by the magnitude; a negative lhs would
+                     // otherwise give a ratio that always passes.
+                     final BigDecimal percent = absDiff.divide(lhsBd.abs());
+                     return percent.doubleValue() <= tolerance.getValue();
+                 } catch (final ArithmeticException e) {
+                     // BigDecimal quotient did not have a terminating
+                     // decimal expansion. So, try without BigDecimal.
+                     return (Math.abs(left - right) / Math.abs(left)) <= tolerance.getValue();
+                 }
+             case DIFFERENCE:
+             default:
+                 return absDiff.doubleValue() <= tolerance.getValue();
+         }
+     }
+
+     @Override
+     public Tolerance getDefaultTolerance() {
+         return DEFAULT_TOLERANCE;
+     }
+
+     @Override
+     public Double convertStringToObject(final String string) throws SmartUriException {
+         return Double.valueOf(string);
+     }
+
+     @Override
+     public Class<?> getTypeClass() {
+         return Double.class;
+     }
+
+     @Override
+     public URI getXmlSchemaUri() {
+         return XMLSchema.DOUBLE;
+     }
+ }
+
+ /**
+ * Class to detect if two floats are considered approximately equal to each
+ * other.
+ */
+ public static class FloatApproxEqualsDetector implements ApproxEqualsDetector<Float> {
+ private static final Tolerance DEFAULT_TOLERANCE = new Tolerance(0.0001, ToleranceType.PERCENTAGE);
+ private final Tolerance tolerance;
+
+ /**
+ * Creates a new instance of {@link FloatApproxEqualsDetector}.
+ * @param tolerance the {@link Tolerance}.
+ */
+ public FloatApproxEqualsDetector(final Tolerance tolerance) {
+ this.tolerance = tolerance != null ? tolerance : getDefaultTolerance();
+ }
+
+ @Override
+ public boolean areObjectsApproxEquals(final Float lhs, final Float rhs) {
+ if (isOnlyOneNull(lhs, rhs)) {
+ return false;
+ }
+ if (Objects.equals(lhs, rhs)) {
+ // They're exactly equals so get out
+ return true;
+ } else if (tolerance.getValue() == 0) {
+ // If they're not exactly equals with zero tolerance then get out
+ return false;
+ }
+ // Check based on tolerance
+ // Floats can be unpredictable with how they store a value
+ // like 0.1. So use BigDecimal with its String constructor
+ // to make things more predictable.
+ final BigDecimal lhsBd = new BigDecimal(String.valueOf(lhs));
+ final BigDecimal rhsBd = new BigDecimal(String.valueOf(rhs));
+ switch (tolerance.getToleranceType()) {
+ case PERCENTAGE:
+ if (lhs == 0) {
+ return lhs == rhs;
+ }
+ if (tolerance.getValue() >= 1) {
+ return true;
+ }
+ final BigDecimal absDiff = lhsBd.subtract(rhsBd).abs();
+ try {
+ final BigDecimal percent = absDiff.divide(lhsBd);
+ return percent.floatValue() <= tolerance.getValue();
+ } catch (final ArithmeticException e) {
+ // BigDecimal quotient did not have a terminating
+ // decimal expansion. So, try without BigDecimal.
+ return ((double)Math.abs(lhs - rhs) / lhs) <= tolerance.getValue();
+ }
+ case DIFFERENCE:
+ default:
+ final BigDecimal absDiff1 = lhsBd.subtract(rhsBd).abs();
+ return absDiff1.floatValue() <= tolerance.getValue();
+ //return Math.abs(lhs - rhs) <= tolerance.getValue();
+ }
+ }
+
+ @Override
+ public Tolerance getDefaultTolerance() {
+ return DEFAULT_TOLERANCE;
+ }
+
+ @Override
+ public Float convertStringToObject(final String string) throws SmartUriException {
+ return Float.valueOf(string);
+ }
+
+ @Override
+ public Class<?> getTypeClass() {
+ return Float.class;
+ }
+
+ @Override
+ public URI getXmlSchemaUri() {
+ return XMLSchema.FLOAT;
+ }
+ }
+
+ /**
+  * Class to detect if two integers are considered approximately equal to
+  * each other, either by absolute difference or by a percentage of the
+  * magnitude of the left-hand value.
+  */
+ public static class IntegerApproxEqualsDetector implements ApproxEqualsDetector<Integer> {
+     private static final Tolerance DEFAULT_TOLERANCE = new Tolerance(1.0, ToleranceType.DIFFERENCE);
+     private final Tolerance tolerance;
+
+     /**
+      * Creates a new instance of {@link IntegerApproxEqualsDetector}.
+      * @param tolerance the {@link Tolerance}, or {@code null} to use the
+      * default tolerance.
+      */
+     public IntegerApproxEqualsDetector(final Tolerance tolerance) {
+         this.tolerance = tolerance != null ? tolerance : getDefaultTolerance();
+     }
+
+     /**
+      * @param lhs the left-hand value; percentages are relative to it.
+      * @param rhs the right-hand value.
+      * @return {@code true} if both values are equal (or both
+      * {@code null}) or differ by no more than the tolerance.
+      */
+     @Override
+     public boolean areObjectsApproxEquals(final Integer lhs, final Integer rhs) {
+         if (isOnlyOneNull(lhs, rhs)) {
+             return false;
+         }
+         if (Objects.equals(lhs, rhs)) {
+             // They're exactly equals so get out
+             return true;
+         } else if (tolerance.getValue() == 0) {
+             // If they're not exactly equals with zero tolerance then get out
+             return false;
+         }
+         // Widen to long so the subtraction and Math.abs cannot
+         // overflow for operands near the int limits.
+         final long left = lhs;
+         final long right = rhs;
+         final long absDiff = Math.abs(left - right);
+         switch (tolerance.getToleranceType()) {
+             case PERCENTAGE:
+                 if (left == 0) {
+                     // The values differ, and nothing but zero is within
+                     // a percentage of zero.
+                     return false;
+                 }
+                 if (tolerance.getValue() >= 1) {
+                     return true;
+                 }
+                 // Divide by the magnitude; a negative lhs would otherwise
+                 // give a negative ratio that always passes.
+                 return ((double) absDiff / Math.abs(left)) <= tolerance.getValue();
+             case DIFFERENCE:
+             default:
+                 return absDiff <= tolerance.getValue();
+         }
+     }
+
+     @Override
+     public Tolerance getDefaultTolerance() {
+         return DEFAULT_TOLERANCE;
+     }
+
+     @Override
+     public Integer convertStringToObject(final String string) throws SmartUriException {
+         return Integer.valueOf(string);
+     }
+
+     @Override
+     public Class<?> getTypeClass() {
+         return Integer.class;
+     }
+
+     @Override
+     public URI getXmlSchemaUri() {
+         return XMLSchema.INTEGER;
+     }
+ }
+
+ /**
+  * Class to detect if two longs are considered approximately equal to
+  * each other, either by absolute difference or by a percentage of the
+  * magnitude of the left-hand value.
+  */
+ public static class LongApproxEqualsDetector implements ApproxEqualsDetector<Long> {
+     private static final Tolerance DEFAULT_TOLERANCE = new Tolerance(1.0, ToleranceType.DIFFERENCE);
+     private final Tolerance tolerance;
+
+     /**
+      * Creates a new instance of {@link LongApproxEqualsDetector}.
+      * @param tolerance the {@link Tolerance}, or {@code null} to use the
+      * default tolerance.
+      */
+     public LongApproxEqualsDetector(final Tolerance tolerance) {
+         this.tolerance = tolerance != null ? tolerance : getDefaultTolerance();
+     }
+
+     /**
+      * @param lhs the left-hand value; percentages are relative to it.
+      * @param rhs the right-hand value.
+      * @return {@code true} if both values are equal (or both
+      * {@code null}) or differ by no more than the tolerance.
+      */
+     @Override
+     public boolean areObjectsApproxEquals(final Long lhs, final Long rhs) {
+         if (isOnlyOneNull(lhs, rhs)) {
+             return false;
+         }
+         if (Objects.equals(lhs, rhs)) {
+             // They're exactly equals so get out
+             return true;
+         } else if (tolerance.getValue() == 0) {
+             // If they're not exactly equals with zero tolerance then get out
+             return false;
+         }
+         final long left = lhs;
+         final long right = rhs;
+         // Subtract with BigDecimal: a plain long subtraction overflows
+         // when the operands are far apart.
+         final double absDiff = BigDecimal.valueOf(left)
+             .subtract(BigDecimal.valueOf(right)).abs().doubleValue();
+         switch (tolerance.getToleranceType()) {
+             case PERCENTAGE:
+                 if (left == 0) {
+                     // The values differ, and nothing but zero is within
+                     // a percentage of zero.
+                     return false;
+                 }
+                 if (tolerance.getValue() >= 1) {
+                     return true;
+                 }
+                 // Divide by the magnitude; a negative lhs would otherwise
+                 // give a negative ratio that always passes.
+                 return (absDiff / Math.abs((double) left)) <= tolerance.getValue();
+             case DIFFERENCE:
+             default:
+                 return absDiff <= tolerance.getValue();
+         }
+     }
+
+     @Override
+     public Tolerance getDefaultTolerance() {
+         return DEFAULT_TOLERANCE;
+     }
+
+     @Override
+     public Long convertStringToObject(final String string) throws SmartUriException {
+         return Long.valueOf(string);
+     }
+
+     @Override
+     public Class<?> getTypeClass() {
+         return Long.class;
+     }
+
+     @Override
+     public URI getXmlSchemaUri() {
+         return XMLSchema.LONG;
+     }
+ }
+
+ /**
+ * Class to detect if two shorts are considered approximately equal to each
+ * other.
+ */
+ public static class ShortApproxEqualsDetector implements ApproxEqualsDetector<Short> {
+ private static final Tolerance DEFAULT_TOLERANCE = new Tolerance(1.0, ToleranceType.DIFFERENCE);
+ private final Tolerance tolerance;
+
+ /**
+ * Creates a new instance of {@link ShortApproxEqualsDetector}.
+ * @param tolerance the {@link Tolerance}.
+ */
+ public ShortApproxEqualsDetector(final Tolerance tolerance) {
+ this.tolerance = tolerance != null ? tolerance : getDefaultTolerance();
+ }
+
+ @Override
+ public boolean areObjectsApproxEquals(final Short lhs, final Short rhs) {
+ if (isOnlyOneNull(lhs, rhs)) {
+ return false;
+ }
+ if (Objects.equals(lhs, rhs)) {
+ // They're exactly equals so get out
+ return true;
+ } else if (tolerance.getValue() == 0) {
+ // If they're not exactly equals with zero tolerance then get out
+ return false;
+ }
+ // Check based on tolerance
+ switch (tolerance.getToleranceType()) {
+ case PERCENTAGE:
+ if (lhs == 0) {
+ return lhs == rhs;
+ }
+ if (tolerance.getValue() >= 1) {
+ return true;
+ }
+ return ((double)Math.abs(lhs - rhs) / lhs) <= tolerance.getValue();
+ case DIFFERENCE:
+ default:
+ return Math.abs(lhs - rhs) <= tolerance.getValue();
+ }
+ }
+
+ @Override
+ public Tolerance getDefaultTolerance() {
+ return DEFAULT_TOLERANCE;
+ }
+
+ @Override
+ public Short convertStringToObject(final String string) throws SmartUriException {
+ return Short.valueOf(string);
+ }
+
+ @Override
+ public Class<?> getTypeClass() {
+ return Short.class;
+ }
+
+ @Override
+ public URI getXmlSchemaUri() {
+ return XMLSchema.SHORT;
+ }
+ }
+
+ /**
+  * Class to detect if two strings are considered approximately equal to
+  * each other. Strings match when they are equal ignoring case, when the
+  * right-hand term is listed as an equivalent of the left-hand term, or
+  * when their Levenshtein distance is within the tolerance.
+  */
+ public static class StringApproxEqualsDetector implements ApproxEqualsDetector<String> {
+     private static final Tolerance DEFAULT_TOLERANCE = new Tolerance(0.05, ToleranceType.PERCENTAGE);
+     private final Tolerance tolerance;
+     private final Map<String, List<String>> equivalentTermsMap;
+
+     /**
+      * Creates a new instance of {@link StringApproxEqualsDetector}.
+      * @param tolerance the {@link Tolerance}, or {@code null} to use the
+      * default tolerance.
+      * @param equivalentTermsMap the {@link Map} of terms that are
+      * considered equivalent to each other. A {@code null} map is treated
+      * as having no equivalent terms.
+      */
+     public StringApproxEqualsDetector(final Tolerance tolerance, final Map<String, List<String>> equivalentTermsMap) {
+         this.tolerance = tolerance != null ? tolerance : getDefaultTolerance();
+         // Fall back to an empty map so later lookups cannot hit a null.
+         this.equivalentTermsMap = equivalentTermsMap != null
+             ? equivalentTermsMap
+             : new HashMap<String, List<String>>();
+     }
+
+     /**
+      * @param lhs the left-hand string; percentages are relative to its
+      * length and equivalent terms are looked up under it.
+      * @param rhs the right-hand string.
+      * @return {@code true} if the strings are considered approximately
+      * equal. With a zero tolerance only a case-insensitive match counts
+      * and the equivalent terms are not consulted.
+      */
+     @Override
+     public boolean areObjectsApproxEquals(final String lhs, final String rhs) {
+         if (isOnlyOneNull(lhs, rhs)) {
+             return false;
+         }
+         if (StringUtils.equalsIgnoreCase(lhs, rhs)) {
+             // They're exactly equals so get out
+             return true;
+         } else if (tolerance.getValue() == 0) {
+             // If they're not exactly equals with zero tolerance then get out
+             return false;
+         }
+
+         // Only check one-way. Terms are not bi-directionally equivalent
+         // unless specified.
+         final List<String> lhsTermEquivalents = equivalentTermsMap.get(lhs);
+         if (lhsTermEquivalents != null && lhsTermEquivalents.contains(rhs)) {
+             return true;
+         }
+         // Check based on tolerance
+         switch (tolerance.getToleranceType()) {
+             case PERCENTAGE:
+                 if (lhs.length() == 0) {
+                     // rhs is non-empty here, so it cannot be within a
+                     // percentage of an empty string.
+                     return false;
+                 }
+                 if (tolerance.getValue() >= 1) {
+                     return true;
+                 }
+                 // Computed only when the shortcuts above do not decide.
+                 return ((double) StringUtils.getLevenshteinDistance(lhs, rhs) / lhs.length()) <= tolerance.getValue();
+             case DIFFERENCE:
+             default:
+                 return StringUtils.getLevenshteinDistance(lhs, rhs) <= tolerance.getValue();
+         }
+     }
+
+     @Override
+     public Tolerance getDefaultTolerance() {
+         return DEFAULT_TOLERANCE;
+     }
+
+     @Override
+     public String convertStringToObject(final String string) throws SmartUriException {
+         return string;
+     }
+
+     @Override
+     public Class<?> getTypeClass() {
+         return String.class;
+     }
+
+     @Override
+     public URI getXmlSchemaUri() {
+         return XMLSchema.STRING;
+     }
+ }
+
+ /**
+  * Detector that decides whether two {@link URI}s count as approximately
+  * equal, based on the Levenshtein distance between their string values.
+  */
+ public static class UriApproxEqualsDetector implements ApproxEqualsDetector<URI> {
+     private static final Tolerance DEFAULT_TOLERANCE = new Tolerance(1.0, ToleranceType.DIFFERENCE);
+     private final Tolerance tolerance;
+
+     /**
+      * Creates a new instance of {@link UriApproxEqualsDetector}.
+      * @param tolerance the {@link Tolerance} to apply, or {@code null} to
+      * fall back to the default tolerance.
+      */
+     public UriApproxEqualsDetector(final Tolerance tolerance) {
+         if (tolerance != null) {
+             this.tolerance = tolerance;
+         } else {
+             this.tolerance = getDefaultTolerance();
+         }
+     }
+
+     @Override
+     public boolean areObjectsApproxEquals(final URI lhs, final URI rhs) {
+         if (isOnlyOneNull(lhs, rhs)) {
+             return false;
+         }
+         if (Objects.equals(lhs, rhs)) {
+             return true;
+         }
+         final String left = lhs.stringValue();
+         final String right = rhs.stringValue();
+         if (StringUtils.equalsIgnoreCase(left, right)) {
+             // Same text apart from case.
+             return true;
+         }
+         if (tolerance.getValue() == 0) {
+             // Zero tolerance admits nothing beyond the matches above.
+             return false;
+         }
+         final int editDistance = StringUtils.getLevenshteinDistance(left, right);
+         // Judge the edit distance against the configured tolerance.
+         switch (tolerance.getToleranceType()) {
+             case PERCENTAGE: {
+                 final int leftLength = left.length();
+                 if (leftLength == 0) {
+                     return right.length() == 0;
+                 }
+                 if (tolerance.getValue() >= 1) {
+                     return true;
+                 }
+                 return ((double) editDistance / leftLength) <= tolerance.getValue();
+             }
+             case DIFFERENCE:
+             default:
+                 return editDistance <= tolerance.getValue();
+         }
+     }
+
+     @Override
+     public Tolerance getDefaultTolerance() {
+         return DEFAULT_TOLERANCE;
+     }
+
+     @Override
+     public URI convertStringToObject(final String string) throws SmartUriException {
+         return new URIImpl(string);
+     }
+
+     @Override
+     public Class<?> getTypeClass() {
+         return URI.class;
+     }
+
+     @Override
+     public URI getXmlSchemaUri() {
+         return XMLSchema.ANYURI;
+     }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/b319365e/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/EntityNearDuplicateException.java
----------------------------------------------------------------------
diff --git a/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/EntityNearDuplicateException.java b/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/EntityNearDuplicateException.java
new file mode 100644
index 0000000..8bdf54f
--- /dev/null
+++ b/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/EntityNearDuplicateException.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.rya.indexing.smarturi.duplication;
+
+import org.apache.rya.indexing.entity.model.Entity;
+import org.apache.rya.indexing.entity.storage.EntityStorage.EntityStorageException;
+
+/**
+ * An {@link Entity} could not be created because another entity is a nearly
+ * identical duplicate based on the configured tolerances.
+ */
+public class EntityNearDuplicateException extends EntityStorageException {
+ private static final long serialVersionUID = 1L;
+
+ /**
+ * Creates a new instance of {@link EntityNearDuplicateException}.
+ * @param message the message to be displayed by the exception.
+ */
+ public EntityNearDuplicateException(final String message) {
+ super(message);
+ }
+
+ /**
+ * Creates a new instance of {@link EntityNearDuplicateException}.
+ * @param message the message to be displayed by the exception.
+ * @param throwable the source {@link Throwable} cause of the exception.
+ */
+ public EntityNearDuplicateException(final String message, final Throwable throwable) {
+ super(message, throwable);
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/b319365e/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/Tolerance.java
----------------------------------------------------------------------
diff --git a/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/Tolerance.java b/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/Tolerance.java
new file mode 100644
index 0000000..772522c
--- /dev/null
+++ b/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/Tolerance.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.rya.indexing.smarturi.duplication;
+
+import static java.util.Objects.requireNonNull;
+
+import java.text.NumberFormat;
+
+/**
+ * A tolerance value paired with the {@link ToleranceType} that determines how
+ * the value is interpreted.
+ */
+public class Tolerance {
+ private final Double value;
+ private final ToleranceType toleranceType;
+
+ /**
+ * Creates a new instance of {@link Tolerance}.
+ * @param value the tolerance value. (not {@code null})
+ * @param toleranceType the {@link ToleranceType}. (not {@code null})
+ */
+ public Tolerance(final Double value, final ToleranceType toleranceType) {
+ // Validate both arguments (value first) before storing either of them.
+ requireNonNull(value);
+ requireNonNull(toleranceType);
+ this.value = value;
+ this.toleranceType = toleranceType;
+ }
+
+ /**
+ * @return the tolerance value.
+ */
+ public Double getValue() {
+ return value;
+ }
+
+ /**
+ * @return the {@link ToleranceType}.
+ */
+ public ToleranceType getToleranceType() {
+ return toleranceType;
+ }
+
+ /**
+ * @return the value formatted as a percentage (default locale) for
+ * {@link ToleranceType#PERCENTAGE}, the plain number for
+ * {@link ToleranceType#DIFFERENCE}, or a message flagging the type as
+ * unknown otherwise.
+ */
+ @Override
+ public String toString() {
+ if (toleranceType == ToleranceType.PERCENTAGE) {
+ final NumberFormat percentFormat = NumberFormat.getPercentInstance();
+ return percentFormat.format(value);
+ }
+ final String rawValue = value.toString();
+ if (toleranceType == ToleranceType.DIFFERENCE) {
+ return rawValue;
+ }
+ return "Unknown Tolerance Type with value: " + rawValue;
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/b319365e/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/ToleranceType.java
----------------------------------------------------------------------
diff --git a/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/ToleranceType.java b/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/ToleranceType.java
new file mode 100644
index 0000000..29faff1
--- /dev/null
+++ b/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/ToleranceType.java
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.rya.indexing.smarturi.duplication;
+
+import org.apache.commons.lang3.StringUtils;
+
+/**
+ * The strategies available for deciding whether two values are within
+ * tolerance of each other.
+ */
+public enum ToleranceType {
+ /**
+ * Indicates that the difference between two values must be within the
+ * specified tolerance value to be accepted.
+ */
+ DIFFERENCE,
+ /**
+ * Indicates that the difference between two values divided by the original
+ * value must fall within the specified tolerance percentage value to be
+ * accepted.
+ */
+ PERCENTAGE;
+
+ /**
+ * Returns the {@link ToleranceType} whose constant name is exactly
+ * {@code name} (the match is case-sensitive).
+ * @param name the name to find.
+ * @return the {@link ToleranceType} or {@code null} if {@code name} is
+ * {@code null}, empty or only whitespace.
+ * @throws IllegalArgumentException if {@code name} is not blank and does
+ * not match any constant.
+ */
+ public static ToleranceType getToleranceTypeByName(final String name) {
+ // Nothing to look up.
+ if (StringUtils.isBlank(name)) {
+ return null;
+ }
+ // valueOf() throws for names that match no constant.
+ return ToleranceType.valueOf(name);
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/b319365e/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/conf/DuplicateDataConfig.java
----------------------------------------------------------------------
diff --git a/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/conf/DuplicateDataConfig.java b/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/conf/DuplicateDataConfig.java
new file mode 100644
index 0000000..98f65c7
--- /dev/null
+++ b/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/conf/DuplicateDataConfig.java
@@ -0,0 +1,337 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.rya.indexing.smarturi.duplication.conf;
+
+import static java.util.Objects.requireNonNull;
+
+import java.text.NumberFormat;
+import java.text.ParseException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.configuration.ConfigurationException;
+import org.apache.commons.configuration.XMLConfiguration;
+import org.apache.rya.indexing.smarturi.duplication.Tolerance;
+import org.apache.rya.indexing.smarturi.duplication.ToleranceType;
+
+/**
+ * Configuration options for data duplication.
+ */
+public class DuplicateDataConfig {
+ public static final String DEFAULT_CONFIG_FILE_PATH = "conf/duplicate_data_detection_config.xml";
+
+ private Tolerance booleanTolerance;
+ private Tolerance byteTolerance;
+ private Tolerance dateTolerance;
+ private Tolerance doubleTolerance;
+ private Tolerance floatTolerance;
+ private Tolerance integerTolerance;
+ private Tolerance longTolerance;
+ private Tolerance shortTolerance;
+ private Tolerance stringTolerance;
+ private Tolerance uriTolerance;
+
+ private Map<String, List<String>> equivalentTermsMap;
+
+ private boolean isDetectionEnabled;
+
+ /**
+ * Creates a new instance of {@link DuplicateDataConfig} from the XML file
+ * at {@link #DEFAULT_CONFIG_FILE_PATH}.
+ * @throws ConfigurationException if thrown while loading the file or while
+ * parsing its tolerance entries.
+ */
+ public DuplicateDataConfig() throws ConfigurationException {
+ this(new XMLConfiguration(DEFAULT_CONFIG_FILE_PATH));
+ }
+
+ /**
+ * Creates a new instance of {@link DuplicateDataConfig}.
+ * @param xmlFileLocation the config's XML file location. (not {@code null})
+ * @throws ConfigurationException if thrown while loading the file or while
+ * parsing its tolerance entries.
+ */
+ public DuplicateDataConfig(final String xmlFileLocation) throws ConfigurationException {
+ this(new XMLConfiguration(requireNonNull(xmlFileLocation)));
+ }
+
+ /**
+ * Creates a new instance of {@link DuplicateDataConfig} from an already
+ * loaded configuration. Each tolerance is read from its
+ * {@code tolerances.<type>Tolerance} element, the equivalent terms from the
+ * {@code termMappings} element, and detection is enabled only if
+ * {@code enableDetection} is set to {@code true} (it defaults to
+ * {@code false}).
+ * @param xmlConfig the {@link XMLConfiguration} file. (not {@code null})
+ * @throws ConfigurationException if a tolerance entry cannot be parsed or
+ * holds a value outside its allowed range.
+ */
+ public DuplicateDataConfig(final XMLConfiguration xmlConfig) throws ConfigurationException {
+ requireNonNull(xmlConfig);
+
+ final Tolerance booleanTolerance = parseTolerance("tolerances.booleanTolerance", xmlConfig);
+ final Tolerance byteTolerance = parseTolerance("tolerances.byteTolerance", xmlConfig);
+ final Tolerance dateTolerance = parseTolerance("tolerances.dateTolerance", xmlConfig);
+ final Tolerance doubleTolerance = parseTolerance("tolerances.doubleTolerance", xmlConfig);
+ final Tolerance floatTolerance = parseTolerance("tolerances.floatTolerance", xmlConfig);
+ final Tolerance integerTolerance = parseTolerance("tolerances.integerTolerance", xmlConfig);
+ final Tolerance longTolerance = parseTolerance("tolerances.longTolerance", xmlConfig);
+ final Tolerance shortTolerance = parseTolerance("tolerances.shortTolerance", xmlConfig);
+ final Tolerance stringTolerance = parseTolerance("tolerances.stringTolerance", xmlConfig);
+ final Tolerance uriTolerance = parseTolerance("tolerances.uriTolerance", xmlConfig);
+
+ final Map<String, List<String>> equivalentTermsMap = parseEquivalentTermsMap(xmlConfig);
+
+ final boolean isDetectionEnabled = xmlConfig.getBoolean("enableDetection", false);
+ init(booleanTolerance, byteTolerance, dateTolerance, doubleTolerance, floatTolerance, integerTolerance, longTolerance, shortTolerance, stringTolerance, uriTolerance, equivalentTermsMap, isDetectionEnabled);
+ }
+
+ /**
+ * Creates a new instance of {@link DuplicateDataConfig}.
+ * @param booleanTolerance the {@link Boolean} tolerance value or
+ * {@code null} if not specified.
+ * @param byteTolerance the {@link Byte} tolerance value or {@code null} if
+ * not specified.
+ * @param dateTolerance the {@code Date} tolerance value or {@code null} if
+ * not specified.
+ * @param doubleTolerance the {@link Double} tolerance value or {@code null}
+ * if not specified.
+ * @param floatTolerance the {@link Float} tolerance value or {@code null}
+ * if not specified.
+ * @param integerTolerance the {@link Integer} tolerance value or
+ * {@code null} if not specified.
+ * @param longTolerance the {@link Long} tolerance value or {@code null} if
+ * not specified.
+ * @param shortTolerance the {@link Short} tolerance value or {@code null}
+ * if not specified.
+ * @param stringTolerance the {@link String} tolerance value or {@code null}
+ * if not specified.
+ * @param uriTolerance the {@code URI} tolerance value or {@code null} if
+ * not specified.
+ * @param equivalentTermsMap the {@link Map} of terms that are considered
+ * equivalent to each other. (not {@code null})
+ * @param isDetectionEnabled {@code true} to enable detection. {@code false}
+ * to disable detection.
+ * @throws NullPointerException if {@code equivalentTermsMap} is
+ * {@code null}.
+ */
+ public DuplicateDataConfig(final Tolerance booleanTolerance, final Tolerance byteTolerance,
+ final Tolerance dateTolerance, final Tolerance doubleTolerance, final Tolerance floatTolerance,
+ final Tolerance integerTolerance, final Tolerance longTolerance, final Tolerance shortTolerance,
+ final Tolerance stringTolerance, final Tolerance uriTolerance, final Map<String, List<String>> equivalentTermsMap,
+ final boolean isDetectionEnabled)
+ {
+ init(booleanTolerance, byteTolerance, dateTolerance, doubleTolerance, floatTolerance, integerTolerance, longTolerance, shortTolerance, stringTolerance, uriTolerance, equivalentTermsMap, isDetectionEnabled);
+ }
+
+ /**
+ * Shared initializer used by the constructors: stores every argument in
+ * the field of the same name. The tolerances are stored as given (they may
+ * be {@code null}); only {@code equivalentTermsMap} is checked for
+ * {@code null}.
+ */
+ private void init(final Tolerance booleanTolerance, final Tolerance byteTolerance,
+ final Tolerance dateTolerance, final Tolerance doubleTolerance, final Tolerance floatTolerance,
+ final Tolerance integerTolerance, final Tolerance longTolerance, final Tolerance shortTolerance,
+ final Tolerance stringTolerance, final Tolerance uriTolerance, final Map<String, List<String>> equivalentTermsMap,
+ final boolean isDetectionEnabled)
+ {
+ this.booleanTolerance = booleanTolerance;
+ this.byteTolerance = byteTolerance;
+ this.dateTolerance= dateTolerance;
+ this.doubleTolerance = doubleTolerance;
+ this.floatTolerance = floatTolerance;
+ this.integerTolerance = integerTolerance;
+ this.longTolerance = longTolerance;
+ this.shortTolerance = shortTolerance;
+ this.stringTolerance = stringTolerance;
+ this.uriTolerance = uriTolerance;
+ // The map is the only argument that must be present.
+ this.equivalentTermsMap = requireNonNull(equivalentTermsMap);
+ this.isDetectionEnabled = isDetectionEnabled;
+ }
+
+ /**
+ * Reads the tolerance configured under {@code key}. The type is taken from
+ * {@code key + ".type"} and the value from {@code key + ".value"}. A
+ * {@code PERCENTAGE} value may be written as a fraction (e.g.
+ * {@code 0.25}) or with a percent sign (e.g. {@code 25%}) and must lie
+ * between 0 and 1 (0% and 100%) inclusive. A {@code DIFFERENCE} value must
+ * not be negative.
+ * @param key the config key of the tolerance element.
+ * @param xmlConfig the {@link XMLConfiguration} to read from.
+ * @return the parsed {@link Tolerance} or {@code null} if no type or no
+ * value is configured for the key.
+ * @throws ConfigurationException if the type name is not a known
+ * {@link ToleranceType}, the percent string cannot be parsed, or the value
+ * is outside the allowed range.
+ */
+ private static Tolerance parseTolerance(final String key, final XMLConfiguration xmlConfig) throws ConfigurationException {
+ final String type = xmlConfig.getString(key + ".type", null);
+ final ToleranceType toleranceType;
+ try {
+ toleranceType = ToleranceType.getToleranceTypeByName(type);
+ } catch (final IllegalArgumentException e) {
+ // The enum lookup rejects names that match no constant. Report that
+ // as a configuration problem that names the offending key.
+ throw new ConfigurationException("Unknown Tolerance Type specified in config for \"" + key + "\": " + type, e);
+ }
+ Double doubleValue = null;
+ if (toleranceType != null) {
+ switch (toleranceType) {
+ case PERCENTAGE:
+ final String value = xmlConfig.getString(key + ".value", null);
+ if (value != null && value.contains("%")) {
+ try {
+ // Parse with a fixed locale so that "25%" is read the same
+ // way whatever the JVM default locale is.
+ final Number number = NumberFormat.getPercentInstance(java.util.Locale.ROOT).parse(value.trim());
+ doubleValue = number.doubleValue();
+ } catch (final ParseException e) {
+ throw new ConfigurationException(e);
+ }
+ } else {
+ doubleValue = xmlConfig.getDouble(key + ".value", null);
+ }
+ if (doubleValue != null) {
+ if (doubleValue < 0) {
+ throw new ConfigurationException("The " + toleranceType + " tolerance type for \"" + key + "\" must be a positive value. Found this value: " + doubleValue);
+ }
+ if (doubleValue > 1) {
+ throw new ConfigurationException("The " + toleranceType + " tolerance type for \"" + key + "\" can NOT be greater than 100%. Found this value: " + doubleValue);
+ }
+ }
+ break;
+ case DIFFERENCE:
+ doubleValue = xmlConfig.getDouble(key + ".value", null);
+ if (doubleValue != null && doubleValue < 0) {
+ throw new ConfigurationException("The " + toleranceType + " tolerance type for \"" + key + "\" must be a positive value. Found this value: " + doubleValue);
+ }
+ break;
+ default:
+ throw new ConfigurationException("Unknown Tolerance Type specified in config for <" + type + ">: " + toleranceType);
+ }
+ if (doubleValue != null) {
+ return new Tolerance(doubleValue, toleranceType);
+ }
+ }
+ // No type, or a type without a value: the tolerance is not specified.
+ return null;
+ }
+
+ /**
+ * Builds the map of equivalent terms from the {@code termMappings} section
+ * of the config. Every {@code termMapping} element is handed to
+ * {@code parseTermMapping}; the returned map keeps the order in which the
+ * entries were added.
+ * @param xmlConfig the {@link XMLConfiguration} to read from.
+ * @return the equivalent terms map. Empty if no {@code term} is configured.
+ */
+ private static Map<String, List<String>> parseEquivalentTermsMap(final XMLConfiguration xmlConfig) {
+ final Map<String, List<String>> equivalentTermsMap = new LinkedHashMap<>();
+ final Object prop = xmlConfig.getProperty("termMappings.termMapping.term");
+ if (prop != null) {
+ if (prop instanceof Collection) {
+ // Several terms were found: address each termMapping by its index.
+ final int size = ((Collection<?>) prop).size();
+ for (int i = 0; i < size; i++) {
+ final String termElement = "termMappings.termMapping(" + i + ")";
+ parseTermMapping(termElement, xmlConfig, equivalentTermsMap);
+ }
+ } else {
+ // A single term was found: no index is needed.
+ final String termElement = "termMappings.termMapping";
+ parseTermMapping(termElement, xmlConfig, equivalentTermsMap);
+ }
+ }
+ return equivalentTermsMap;
+ }
+
+ /**
+ * Reads one {@code termMapping} element and, when it holds a term with at
+ * least one equivalent, adds the term and all of its equivalents to the
+ * map.
+ * @param termElement the config key of the {@code termMapping} element.
+ * @param xmlConfig the {@link XMLConfiguration} to read from.
+ * @param equivalentTermsMap the map the parsed entry is added to.
+ */
+ private static void parseTermMapping(final String termElement, final XMLConfiguration xmlConfig, final Map<String, List<String>> equivalentTermsMap) {
+ final String term = xmlConfig.getString(termElement + ".term");
+ final String equivalentKey = termElement + ".equivalents.equivalent";
+ // getProperty() is required here. getString() always yields a String,
+ // so the Collection check below could never succeed and at most one
+ // equivalent would be stored per term.
+ final Object equivalentProp = xmlConfig.getProperty(equivalentKey);
+ final List<String> equivalents = new ArrayList<>();
+ if (equivalentProp instanceof Collection) {
+ // Several equivalents: read each one by its index.
+ final int equivalentSize = ((Collection<?>) equivalentProp).size();
+ for (int j = 0; j < equivalentSize; j++) {
+ final String equivalent = xmlConfig.getString(equivalentKey + "(" + j + ")");
+ if (equivalent != null) {
+ equivalents.add(equivalent);
+ }
+ }
+ } else {
+ // Zero or one equivalent.
+ final String equivalent = xmlConfig.getString(equivalentKey);
+ if (equivalent != null) {
+ equivalents.add(equivalent);
+ }
+ }
+ if (term != null && !equivalents.isEmpty()) {
+ equivalentTermsMap.put(term, equivalents);
+ }
+ }
+
+ /**
+ * @return the {@link Boolean} tolerance value or {@code null} if not
+ * specified.
+ */
+ public Tolerance getBooleanTolerance() {
+ return booleanTolerance;
+ }
+
+ /**
+ * @return the {@link Byte} tolerance value or {@code null} if not
+ * specified.
+ */
+ public Tolerance getByteTolerance() {
+ return byteTolerance;
+ }
+
+ /**
+ * @return the {@code Date} tolerance value or {@code null} if not
+ * specified.
+ */
+ public Tolerance getDateTolerance() {
+ return dateTolerance;
+ }
+
+ /**
+ * @return the {@link Double} tolerance value or {@code null} if not
+ * specified.
+ */
+ public Tolerance getDoubleTolerance() {
+ return doubleTolerance;
+ }
+
+ /**
+ * @return the {@link Float} tolerance value or {@code null} if not
+ * specified.
+ */
+ public Tolerance getFloatTolerance() {
+ return floatTolerance;
+ }
+
+ /**
+ * @return the {@link Integer} tolerance value or {@code null} if not
+ * specified.
+ */
+ public Tolerance getIntegerTolerance() {
+ return integerTolerance;
+ }
+
+ /**
+ * @return the {@link Long} tolerance value or {@code null} if not
+ * specified.
+ */
+ public Tolerance getLongTolerance() {
+ return longTolerance;
+ }
+
+ /**
+ * @return the {@link Short} tolerance value or {@code null} if not
+ * specified.
+ */
+ public Tolerance getShortTolerance() {
+ return shortTolerance;
+ }
+
+ /**
+ * @return the {@link String} tolerance value or {@code null} if not
+ * specified.
+ */
+ public Tolerance getStringTolerance() {
+ return stringTolerance;
+ }
+
+ /**
+ * @return the {@code URI} tolerance value or {@code null} if not specified.
+ */
+ public Tolerance getUriTolerance() {
+ return uriTolerance;
+ }
+
+ /**
+ * @return the {@link Map} of terms that are considered equivalent to each
+ * other.
+ */
+ public Map<String, List<String>> getEquivalentTermsMap() {
+ return equivalentTermsMap;
+ }
+
+ /**
+ * @return {@code true} to enable detection. {@code false} to disable
+ * detection.
+ */
+ public boolean isDetectionEnabled() {
+ return isDetectionEnabled;
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/b319365e/extras/indexing/src/test/java/org/apache/rya/indexing/mongo/MongoDbSmartUriTest.java
----------------------------------------------------------------------
diff --git a/extras/indexing/src/test/java/org/apache/rya/indexing/mongo/MongoDbSmartUriTest.java b/extras/indexing/src/test/java/org/apache/rya/indexing/mongo/MongoDbSmartUriTest.java
index 60efbed..dff271f 100644
--- a/extras/indexing/src/test/java/org/apache/rya/indexing/mongo/MongoDbSmartUriTest.java
+++ b/extras/indexing/src/test/java/org/apache/rya/indexing/mongo/MongoDbSmartUriTest.java
@@ -245,7 +245,6 @@ public class MongoDbSmartUriTest {
final Entity resultEntity = SmartUriAdapter.deserializeUriEntity(smartUri);
System.out.println(resultEntity);
assertEquals(BOB_ENTITY.getSubject(), resultEntity.getSubject());
- //assertTrue(Paths.get(BOB_ENTITY.getSubject().getData()).equals(Paths.get(resultEntity.getSubject().getData())));
}
@Test