You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oak-commits@jackrabbit.apache.org by ma...@apache.org on 2018/09/21 14:15:15 UTC

svn commit: r1841593 - in /jackrabbit/oak/branches/1.8: ./ oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/ oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/ oak-lucene/src/main/java/org/apache/jackra...

Author: mattryan
Date: Fri Sep 21 14:15:14 2018
New Revision: 1841593

URL: http://svn.apache.org/viewvc?rev=1841593&view=rev
Log:
OAK-7575 - Search over similar feature vectors (backported to 1.8)

Added:
    jackrabbit/oak/branches/1.8/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/
      - copied from r1834326, jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/
    jackrabbit/oak/branches/1.8/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/
      - copied from r1834326, jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/
    jackrabbit/oak/branches/1.8/oak-lucene/src/test/resources/org/apache/jackrabbit/oak/query/fvs.csv
      - copied unchanged from r1834326, jackrabbit/oak/trunk/oak-lucene/src/test/resources/org/apache/jackrabbit/oak/query/fvs.csv
Modified:
    jackrabbit/oak/branches/1.8/   (props changed)
    jackrabbit/oak/branches/1.8/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/FieldFactory.java
    jackrabbit/oak/branches/1.8/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/FieldNames.java
    jackrabbit/oak/branches/1.8/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexDefinition.java
    jackrabbit/oak/branches/1.8/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneDocumentMaker.java
    jackrabbit/oak/branches/1.8/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexConstants.java
    jackrabbit/oak/branches/1.8/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java
    jackrabbit/oak/branches/1.8/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/PropertyDefinition.java
    jackrabbit/oak/branches/1.8/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/IndexDefinitionBuilder.java
    jackrabbit/oak/branches/1.8/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/writer/IndexWriterUtils.java
    jackrabbit/oak/branches/1.8/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java

Propchange: jackrabbit/oak/branches/1.8/
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Fri Sep 21 14:15:14 2018
@@ -1,3 +1,3 @@
 /jackrabbit/oak/branches/1.0:1665962
-/jackrabbit/oak/trunk
 ,1831163,1831190,1831374,1831560,1831689,1832258,1832376,1832379,1832535,1833308,1833347,1833833,1834112,1834117,1834287,1834291,1834302,1834336,1834428,1834610,1834648-1834649,1834681,1834823,1834857-1834858,1835060,1835518,1835521,1835635,1835642,1835780,1835819,1836121,1836487,1836493,1837057,1837274,1837296,1837326,1837475,1837503,1837547,1837569,1837600,1837657,1837718,1837998,1838076,1838637,1839549,1839570,1839637,1839746,1840024,1840455,1840574
+/jackrabbit/oak/trunk:1820660-1820661,1820729,1820734,1820859,1820861,1820878,1820888,1820947,1821027,1821130,1821140-1821141,1821178,1821237,1821240,1821249,1821258,1821325,1821358,1821361-1821362,1821370,1821375,1821393,1821477,1821487,1821516,1821617,1821663,1821665,1821668,1821681,1821847,1821975-1821983,1822121,1822201,1822207,1822527,1822723,1822808,1822850,1822934,1823135,1823163,1823169,1823172,1823655,1823669,1824196,1824198,1824253,1824255,1824896,1824962,1825065,1825362,1825381,1825442,1825448,1825466,1825470-1825471,1825475,1825523,1825525,1825561,1825619-1825621,1825651,1825654,1825992,1826079,1826090,1826096,1826216,1826237,1826338,1826516,1826532,1826551,1826560,1826638,1826640,1826730,1826932,1826957,1827423,1827472,1827486,1827977,1828349,1828439,1828502,1828529,1828948,1829527,1829534,1829546,1829569,1829587,1829665,1829854,1829864,1829978,1829985,1829987,1829998,1830019,1830048,1830160,1830171,1830197,1830209,1830239,1830347,1830748,1830911,1830923,1831157-1831158
 ,1831163,1831190,1831374,1831560,1831689,1832258,1832376,1832379,1832535,1833308,1833347,1833833,1834112,1834117,1834287,1834291,1834302,1834326,1834336,1834428,1834610,1834648-1834649,1834681,1834823,1834857-1834858,1835060,1835518,1835521,1835635,1835642,1835780,1835819,1836121,1836487,1836493,1837057,1837274,1837296,1837326,1837475,1837503,1837547,1837569,1837600,1837657,1837718,1837998,1838076,1838637,1839549,1839570,1839637,1839746,1840024,1840455,1840574
 /jackrabbit/trunk:1345480

Modified: jackrabbit/oak/branches/1.8/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/FieldFactory.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/branches/1.8/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/FieldFactory.java?rev=1841593&r1=1841592&r2=1841593&view=diff
==============================================================================
--- jackrabbit/oak/branches/1.8/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/FieldFactory.java (original)
+++ jackrabbit/oak/branches/1.8/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/FieldFactory.java Fri Sep 21 14:15:14 2018
@@ -16,15 +16,22 @@
  */
 package org.apache.jackrabbit.oak.plugins.index.lucene;
 
+import java.io.IOException;
+import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Collection;
 
 import com.google.common.primitives.Ints;
+import org.apache.jackrabbit.oak.api.Blob;
 import org.apache.jackrabbit.oak.api.Type;
 import org.apache.jackrabbit.oak.commons.PathUtils;
+import org.apache.jackrabbit.oak.plugins.index.lucene.binary.BlobByteSource;
+import org.apache.jackrabbit.oak.plugins.index.lucene.util.fv.SimSearchUtils;
 import org.apache.jackrabbit.util.ISO8601;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.IntField;
+import org.apache.lucene.document.StoredField;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.document.TextField;
 
@@ -103,6 +110,34 @@ public final class FieldFactory {
         return new StringField(name, value, NO);
     }
 
+    static Collection<Field> newSimilarityFields(String name, Blob value) throws IOException {
+        Collection<Field> fields = new ArrayList<>(1);
+        byte[] bytes = new BlobByteSource(value).read();
+//        fields.add(newBinarySimilarityField(name, bytes));
+        fields.add(newSimilarityField(name, bytes));
+        return fields;
+    }
+
+    static Collection<Field> newSimilarityFields(String name, String value) {
+        Collection<Field> fields = new ArrayList<>(1);
+//        byte[] bytes = SimSearchUtils.toByteArray(value);
+//        fields.add(newBinarySimilarityField(name, bytes));
+        fields.add(newSimilarityField(name, value));
+        return fields;
+    }
+
+    private static Field newSimilarityField(String name, byte[] bytes) {
+        return newSimilarityField(name, SimSearchUtils.toDoubleString(bytes));
+    }
+
+    private static Field newSimilarityField(String name, String value) {
+        return new TextField(FieldNames.createSimilarityFieldName(name), value, Field.Store.YES);
+    }
+
+    private static StoredField newBinarySimilarityField(String name, byte[] bytes) {
+        return new StoredField(FieldNames.createBinSimilarityFieldName(name), bytes);
+    }
+
     public static Field newFulltextField(String value) {
         return newFulltextField(value, false);
     }

Modified: jackrabbit/oak/branches/1.8/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/FieldNames.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/branches/1.8/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/FieldNames.java?rev=1841593&r1=1841592&r2=1841593&view=diff
==============================================================================
--- jackrabbit/oak/branches/1.8/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/FieldNames.java (original)
+++ jackrabbit/oak/branches/1.8/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/FieldNames.java Fri Sep 21 14:15:14 2018
@@ -70,6 +70,16 @@ public final class FieldNames {
     public static final String ANALYZED_FIELD_PREFIX = "full:";
 
     /**
+     * Prefix for all field names that contain the similarity search indexed tokens.
+     */
+    private static final String SIMILARITY_PREFIX = "sim:";
+
+    /**
+     * Prefix for all field names that contain the similarity search binary values.
+     */
+    private static final String SIMILARITY_BINARY_PREFIX = "simbin:";
+
+    /**
      * Prefix used for storing fulltext of relative node
      */
     public static final String FULLTEXT_RELATIVE_NODE = "fullnode:";
@@ -138,4 +148,12 @@ public final class FieldNames {
                 && !field.startsWith(":")
                 && !field.endsWith("_facet");
     }
+
+    public static String createBinSimilarityFieldName(String name) {
+        return SIMILARITY_BINARY_PREFIX + name;
+    }
+
+    public static String createSimilarityFieldName(String name) {
+        return SIMILARITY_PREFIX + name;
+    }
 }

Modified: jackrabbit/oak/branches/1.8/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexDefinition.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/branches/1.8/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexDefinition.java?rev=1841593&r1=1841592&r2=1841593&view=diff
==============================================================================
--- jackrabbit/oak/branches/1.8/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexDefinition.java (original)
+++ jackrabbit/oak/branches/1.8/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexDefinition.java Fri Sep 21 14:15:14 2018
@@ -911,6 +911,7 @@ public final class IndexDefinition imple
         private final List<PropertyDefinition> notNullCheckEnabledProperties;
         private final List<PropertyDefinition> nodeScopeAnalyzedProps;
         private final List<PropertyDefinition> syncProps;
+        private final List<PropertyDefinition> similarityProperties;
         private final boolean indexesAllNodesOfMatchingType;
         private final boolean nodeNameIndexed;
 
@@ -925,6 +926,7 @@ public final class IndexDefinition imple
         final Aggregate propAggregate;
 
 
+
         IndexingRule(String nodeTypeName, NodeState config) {
             this.nodeTypeName = nodeTypeName;
             this.baseNodeType = nodeTypeName;
@@ -938,9 +940,10 @@ public final class IndexDefinition imple
             List<PropertyDefinition> existentProperties = newArrayList();
             List<PropertyDefinition> nodeScopeAnalyzedProps = newArrayList();
             List<PropertyDefinition> syncProps = newArrayList();
+            List<PropertyDefinition> similarityProperties = newArrayList();
             List<Aggregate.Include> propIncludes = newArrayList();
             this.propConfigs = collectPropConfigs(config, namePatterns, propIncludes, nonExistentProperties,
-                    existentProperties, nodeScopeAnalyzedProps, functionRestrictions, syncProps);
+                    existentProperties, nodeScopeAnalyzedProps, functionRestrictions, syncProps, similarityProperties);
             this.propAggregate = new Aggregate(nodeTypeName, propIncludes);
             this.aggregate = combine(propAggregate, nodeTypeName);
 
@@ -949,6 +952,7 @@ public final class IndexDefinition imple
             this.nullCheckEnabledProperties = ImmutableList.copyOf(nonExistentProperties);
             this.functionRestrictions = ImmutableList.copyOf(functionRestrictions);
             this.notNullCheckEnabledProperties = ImmutableList.copyOf(existentProperties);
+            this.similarityProperties = ImmutableList.copyOf(similarityProperties);
             this.fulltextEnabled = aggregate.hasNodeAggregates() || hasAnyFullTextEnabledProperty();
             this.nodeFullTextIndexed = aggregate.hasNodeAggregates() || anyNodeScopeIndexedProperty();
             this.propertyIndexEnabled = hasAnyPropertyIndexConfigured();
@@ -985,6 +989,7 @@ public final class IndexDefinition imple
             this.indexesAllNodesOfMatchingType = areAlMatchingNodeByTypeIndexed();
             this.nodeNameIndexed = original.nodeNameIndexed;
             this.syncProps = original.syncProps;
+            this.similarityProperties = original.similarityProperties;
         }
 
         /**
@@ -1032,6 +1037,10 @@ public final class IndexDefinition imple
             return nodeScopeAnalyzedProps;
         }
 
+        public List<PropertyDefinition> getSimilarityProperties() {
+            return similarityProperties;
+        }
+
         @Override
         public String toString() {
             String str = "IndexRule: "+ nodeTypeName;
@@ -1153,7 +1162,8 @@ public final class IndexDefinition imple
                                                                    List<PropertyDefinition> existentProperties,
                                                                    List<PropertyDefinition> nodeScopeAnalyzedProps,
                                                                    List<PropertyDefinition> functionRestrictions,
-                                                                   List<PropertyDefinition> syncProps) {
+                                                                   List<PropertyDefinition> syncProps,
+                                                                   List<PropertyDefinition> similarityProperties) {
             Map<String, PropertyDefinition> propDefns = newHashMap();
             NodeState propNode = config.getChildNode(LuceneIndexConstants.PROP_NODE);
 
@@ -1232,6 +1242,9 @@ public final class IndexDefinition imple
                     if (pd.sync) {
                         syncProps.add(pd);
                     }
+                    if (pd.useInSimilarity) {
+                        similarityProperties.add(pd);
+                    }
                 }
             }
             ensureNodeTypeIndexingIsConsistent(propDefns, syncProps);

Modified: jackrabbit/oak/branches/1.8/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneDocumentMaker.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/branches/1.8/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneDocumentMaker.java?rev=1841593&r1=1841592&r2=1841593&view=diff
==============================================================================
--- jackrabbit/oak/branches/1.8/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneDocumentMaker.java (original)
+++ jackrabbit/oak/branches/1.8/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneDocumentMaker.java Fri Sep 21 14:15:14 2018
@@ -62,6 +62,7 @@ import static org.apache.jackrabbit.oak.
 import static org.apache.jackrabbit.oak.plugins.index.lucene.FieldFactory.newFulltextField;
 import static org.apache.jackrabbit.oak.plugins.index.lucene.FieldFactory.newPathField;
 import static org.apache.jackrabbit.oak.plugins.index.lucene.FieldFactory.newPropertyField;
+import static org.apache.jackrabbit.oak.plugins.index.lucene.FieldFactory.newSimilarityFields;
 import static org.apache.jackrabbit.oak.plugins.index.lucene.util.ConfigUtil.getPrimaryTypeName;
 
 public class LuceneDocumentMaker {
@@ -253,7 +254,15 @@ public class LuceneDocumentMaker {
         boolean includeTypeForFullText = indexingRule.includePropertyType(property.getType().tag());
 
         boolean dirty = false;
-        if (Type.BINARY.tag() == property.getType().tag()
+        if (Type.BINARY.tag() == property.getType().tag() && pd.useInSimilarity) {
+            try {
+                log.trace("indexing similarity binaries for {}", pd.name);
+                fields.addAll(newSimilarityFields(pd.name, property.getValue(Type.BINARY)));
+                dirty = true;
+            } catch (Exception e) {
+                log.error("could not index similarity field for property {} and definition {}", property, pd);
+            }
+        } else if (Type.BINARY.tag() == property.getType().tag()
                 && includeTypeForFullText) {
             fields.addAll(newBinary(property, state, null, path + "@" + pname));
             dirty = true;
@@ -285,10 +294,17 @@ public class LuceneDocumentMaker {
                     if (pd.nodeScopeIndex) {
                         Field field = newFulltextField(value);
                         fields.add(field);
+                        if (pd.useInSimilarity) {
+                            log.trace("indexing similarity strings for {}", pd.name);
+                            fields.addAll(newSimilarityFields(pd.name, value)); // fallback for when feature vectors are written in string typed properties
+                       }
                     }
+
+
                     dirty = true;
                 }
             }
+
             if (pd.facet && isFacetingEnabled()) {
                 dirty |= addFacetFields(fields, property, pname, pd);
             }

Modified: jackrabbit/oak/branches/1.8/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexConstants.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/branches/1.8/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexConstants.java?rev=1841593&r1=1841592&r2=1841593&view=diff
==============================================================================
--- jackrabbit/oak/branches/1.8/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexConstants.java (original)
+++ jackrabbit/oak/branches/1.8/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexConstants.java Fri Sep 21 14:15:14 2018
@@ -303,6 +303,11 @@ public interface LuceneIndexConstants {
     String PROP_USE_IN_SPELLCHECK = "useInSpellcheck";
 
     /**
+     * Whether to use the values of this property for similarity search
+     */
+    String PROP_USE_IN_SIMILARITY = "useInSimilarity";
+
+    /**
      * Property definition config indicating that null check support should be
      * enabled for this property
      */

Modified: jackrabbit/oak/branches/1.8/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/branches/1.8/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java?rev=1841593&r1=1841592&r2=1841593&view=diff
==============================================================================
--- jackrabbit/oak/branches/1.8/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java (original)
+++ jackrabbit/oak/branches/1.8/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java Fri Sep 21 14:15:14 2018
@@ -18,10 +18,6 @@
  */
 package org.apache.jackrabbit.oak.plugins.index.lucene;
 
-import javax.annotation.CheckForNull;
-import javax.annotation.Nonnull;
-import javax.annotation.Nullable;
-import javax.jcr.PropertyType;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -34,6 +30,12 @@ import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.atomic.AtomicReference;
 
+import javax.annotation.CheckForNull;
+import javax.annotation.Nonnull;
+import javax.annotation.Nullable;
+import javax.jcr.PropertyType;
+
+import com.google.common.base.Joiner;
 import com.google.common.collect.AbstractIterator;
 import com.google.common.collect.FluentIterable;
 import com.google.common.collect.Iterables;
@@ -49,6 +51,8 @@ import org.apache.jackrabbit.oak.commons
 import org.apache.jackrabbit.oak.commons.PerfLogger;
 import org.apache.jackrabbit.oak.commons.json.JsopBuilder;
 import org.apache.jackrabbit.oak.commons.json.JsopWriter;
+import org.apache.jackrabbit.oak.plugins.index.Cursors;
+import org.apache.jackrabbit.oak.plugins.index.Cursors.PathCursor;
 import org.apache.jackrabbit.oak.plugins.index.lucene.IndexDefinition.IndexingRule;
 import org.apache.jackrabbit.oak.plugins.index.lucene.IndexPlanner.PlanResult;
 import org.apache.jackrabbit.oak.plugins.index.lucene.IndexPlanner.PropertyIndexResult;
@@ -60,16 +64,9 @@ import org.apache.jackrabbit.oak.plugins
 import org.apache.jackrabbit.oak.plugins.index.lucene.util.PathStoredFieldVisitor;
 import org.apache.jackrabbit.oak.plugins.index.lucene.util.SpellcheckHelper;
 import org.apache.jackrabbit.oak.plugins.index.lucene.util.SuggestHelper;
+import org.apache.jackrabbit.oak.plugins.index.lucene.util.fv.SimSearchUtils;
 import org.apache.jackrabbit.oak.plugins.memory.PropertyValues;
-import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextAnd;
-import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextContains;
-import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextExpression;
-import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextOr;
-import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextTerm;
-import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextVisitor;
 import org.apache.jackrabbit.oak.spi.query.Cursor;
-import org.apache.jackrabbit.oak.plugins.index.Cursors;
-import org.apache.jackrabbit.oak.plugins.index.Cursors.PathCursor;
 import org.apache.jackrabbit.oak.spi.query.Filter;
 import org.apache.jackrabbit.oak.spi.query.Filter.PropertyRestriction;
 import org.apache.jackrabbit.oak.spi.query.IndexRow;
@@ -77,6 +74,12 @@ import org.apache.jackrabbit.oak.spi.que
 import org.apache.jackrabbit.oak.spi.query.QueryIndex;
 import org.apache.jackrabbit.oak.spi.query.QueryIndex.AdvanceFulltextQueryIndex;
 import org.apache.jackrabbit.oak.spi.query.QueryLimits;
+import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextAnd;
+import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextContains;
+import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextExpression;
+import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextOr;
+import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextTerm;
+import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextVisitor;
 import org.apache.jackrabbit.oak.spi.state.NodeState;
 import org.apache.jackrabbit.oak.spi.state.NodeStateUtils;
 import org.apache.lucene.analysis.Analyzer;
@@ -145,7 +148,9 @@ import static org.apache.jackrabbit.oak.
 import static org.apache.jackrabbit.oak.spi.query.QueryConstants.JCR_PATH;
 import static org.apache.jackrabbit.oak.spi.query.QueryIndex.AdvancedQueryIndex;
 import static org.apache.jackrabbit.oak.spi.query.QueryIndex.NativeQueryIndex;
-import static org.apache.lucene.search.BooleanClause.Occur.*;
+import static org.apache.lucene.search.BooleanClause.Occur.MUST;
+import static org.apache.lucene.search.BooleanClause.Occur.MUST_NOT;
+import static org.apache.lucene.search.BooleanClause.Occur.SHOULD;
 
 /**
  * Provides a QueryIndex that does lookups against a Lucene-based index
@@ -860,9 +865,20 @@ public class LucenePropertyIndex impleme
             if (query.startsWith("mlt?")) {
                 String mltQueryString = query.replace("mlt?", "");
                 if (reader != null) {
-                    Query moreLikeThis = MoreLikeThisHelper.getMoreLikeThis(reader, analyzer, mltQueryString);
-                    if (moreLikeThis != null) {
-                        qs.add(moreLikeThis);
+                    List<PropertyDefinition> sp = new LinkedList<>();
+                    for (IndexingRule r : defn.getDefinedRules()) {
+                        sp.addAll(r.getSimilarityProperties());
+                    }
+                    if (sp.isEmpty()) {
+                        Query moreLikeThis = MoreLikeThisHelper.getMoreLikeThis(reader, analyzer, mltQueryString);
+                        if (moreLikeThis != null) {
+                            qs.add(moreLikeThis);
+                        }
+                    } else {
+                        Query similarityQuery = SimSearchUtils.getSimilarityQuery(sp, reader, mltQueryString);
+                        if (similarityQuery != null) {
+                            qs.add(similarityQuery);
+                        }
                     }
                 }
             } else if (query.startsWith("spellcheck?")) {

Modified: jackrabbit/oak/branches/1.8/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/PropertyDefinition.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/branches/1.8/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/PropertyDefinition.java?rev=1841593&r1=1841592&r2=1841593&view=diff
==============================================================================
--- jackrabbit/oak/branches/1.8/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/PropertyDefinition.java (original)
+++ jackrabbit/oak/branches/1.8/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/PropertyDefinition.java Fri Sep 21 14:15:14 2018
@@ -58,7 +58,7 @@ public class PropertyDefinition {
      * property etc then it should be defined via 'name' property in NodeState.
      * In such case NodeState name can be set to anything
      */
-    final String name;
+    public final String name;
 
     private final int propertyType;
     /**
@@ -123,7 +123,9 @@ public class PropertyDefinition {
 
     public final boolean unique;
 
-    public PropertyDefinition(IndexingRule idxDefn, String nodeName, NodeState defn) {
+    public boolean useInSimilarity;
+
+  public PropertyDefinition(IndexingRule idxDefn, String nodeName, NodeState defn) {
         this.isRegexp = getOptionalValue(defn, PROP_IS_REGEX, false);
         this.name = getName(defn, nodeName);
         this.relative = isRelativeProperty(name);
@@ -151,6 +153,7 @@ public class PropertyDefinition {
         this.propertyType = getPropertyType(idxDefn, nodeName, defn);
         this.useInSuggest = getOptionalValueIfIndexed(defn, LuceneIndexConstants.PROP_USE_IN_SUGGEST, false);
         this.useInSpellcheck = getOptionalValueIfIndexed(defn, LuceneIndexConstants.PROP_USE_IN_SPELLCHECK, false);
+        this.useInSimilarity = getOptionalValueIfIndexed(defn, LuceneIndexConstants.PROP_USE_IN_SIMILARITY, false);
         this.nullCheckEnabled = getOptionalValueIfIndexed(defn, LuceneIndexConstants.PROP_NULL_CHECK_ENABLED, false);
         this.notNullCheckEnabled = getOptionalValueIfIndexed(defn, LuceneIndexConstants.PROP_NOT_NULL_CHECK_ENABLED, false);
         this.excludeFromAggregate = getOptionalValueIfIndexed(defn, LuceneIndexConstants.PROP_EXCLUDE_FROM_AGGREGATE, false);

Modified: jackrabbit/oak/branches/1.8/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/IndexDefinitionBuilder.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/branches/1.8/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/IndexDefinitionBuilder.java?rev=1841593&r1=1841592&r2=1841593&view=diff
==============================================================================
--- jackrabbit/oak/branches/1.8/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/IndexDefinitionBuilder.java (original)
+++ jackrabbit/oak/branches/1.8/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/IndexDefinitionBuilder.java Fri Sep 21 14:15:14 2018
@@ -319,6 +319,11 @@ public final class IndexDefinitionBuilde
             return this;
         }
 
+        public PropertyRule useInSimilarity() {
+            propTree.setProperty(LuceneIndexConstants.PROP_USE_IN_SIMILARITY, true);
+            return this;
+        }
+
         public PropertyRule type(String type){
             //This would throw an IAE if type is invalid
             PropertyType.valueFromName(type);

Modified: jackrabbit/oak/branches/1.8/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/writer/IndexWriterUtils.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/branches/1.8/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/writer/IndexWriterUtils.java?rev=1841593&r1=1841592&r2=1841593&view=diff
==============================================================================
--- jackrabbit/oak/branches/1.8/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/writer/IndexWriterUtils.java (original)
+++ jackrabbit/oak/branches/1.8/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/writer/IndexWriterUtils.java Fri Sep 21 14:15:14 2018
@@ -20,12 +20,15 @@
 package org.apache.jackrabbit.oak.plugins.index.lucene.writer;
 
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 
 
 import org.apache.jackrabbit.oak.plugins.index.lucene.FieldNames;
 import org.apache.jackrabbit.oak.plugins.index.lucene.IndexDefinition;
 import org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants;
+import org.apache.jackrabbit.oak.plugins.index.lucene.PropertyDefinition;
+import org.apache.jackrabbit.oak.plugins.index.lucene.util.fv.LSHAnalyzer;
 import org.apache.jackrabbit.oak.plugins.index.lucene.util.SuggestHelper;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
@@ -51,6 +54,15 @@ public class IndexWriterUtils {
             Analyzer definitionAnalyzer = definition.getAnalyzer();
             Map<String, Analyzer> analyzers = new HashMap<String, Analyzer>();
             analyzers.put(FieldNames.SPELLCHECK, new ShingleAnalyzerWrapper(LuceneIndexConstants.ANALYZER, 3));
+            for (IndexDefinition.IndexingRule r : definition.getDefinedRules()) {
+                List<PropertyDefinition> similarityProperties = r.getSimilarityProperties();
+                for (PropertyDefinition pd : similarityProperties) {
+                    if (pd.useInSimilarity) {
+                        analyzers.put(FieldNames.createSimilarityFieldName(pd.name), new LSHAnalyzer());
+                    }
+                }
+            }
+
             if (!definition.isSuggestAnalyzed()) {
                 analyzers.put(FieldNames.SUGGEST, SuggestHelper.getAnalyzer());
             }

Modified: jackrabbit/oak/branches/1.8/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/branches/1.8/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java?rev=1841593&r1=1841592&r2=1841593&view=diff
==============================================================================
--- jackrabbit/oak/branches/1.8/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java (original)
+++ jackrabbit/oak/branches/1.8/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java Fri Sep 21 14:15:14 2018
@@ -19,9 +19,6 @@
 
 package org.apache.jackrabbit.oak.plugins.index.lucene;
 
-import javax.annotation.Nonnull;
-import javax.jcr.PropertyType;
-
 import static com.google.common.collect.ImmutableSet.of;
 import static com.google.common.collect.Lists.newArrayList;
 import static java.util.Arrays.asList;
@@ -72,11 +69,16 @@ import static org.junit.Assert.assertNul
 import static org.junit.Assert.assertThat;
 import static org.junit.Assert.assertTrue;
 
+import java.io.ByteArrayInputStream;
 import java.io.File;
+import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.net.URI;
+import java.nio.charset.Charset;
 import java.text.ParseException;
 import java.util.Calendar;
+import java.util.Collection;
 import java.util.Collections;
 import java.util.Iterator;
 import java.util.LinkedList;
@@ -87,6 +89,9 @@ import java.util.Set;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 
+import javax.annotation.Nonnull;
+import javax.jcr.PropertyType;
+
 import com.google.common.base.Charsets;
 import com.google.common.collect.ComparisonChain;
 import com.google.common.collect.ImmutableList;
@@ -98,6 +103,7 @@ import com.google.common.io.CountingInpu
 import org.apache.commons.io.FileUtils;
 import org.apache.commons.io.IOUtils;
 import org.apache.jackrabbit.JcrConstants;
+import org.apache.jackrabbit.oak.InitialContent;
 import org.apache.jackrabbit.oak.Oak;
 import org.apache.jackrabbit.oak.api.Blob;
 import org.apache.jackrabbit.oak.api.CommitFailedException;
@@ -117,13 +123,13 @@ import org.apache.jackrabbit.oak.plugins
 import org.apache.jackrabbit.oak.plugins.index.fulltext.PreExtractedTextProvider;
 import org.apache.jackrabbit.oak.plugins.index.lucene.directory.CopyOnReadDirectory;
 import org.apache.jackrabbit.oak.plugins.index.lucene.util.IndexDefinitionBuilder;
+import org.apache.jackrabbit.oak.plugins.index.lucene.util.fv.SimSearchUtils;
 import org.apache.jackrabbit.oak.plugins.index.nodetype.NodeTypeIndexProvider;
 import org.apache.jackrabbit.oak.plugins.index.property.PropertyIndexEditorProvider;
 import org.apache.jackrabbit.oak.plugins.memory.ArrayBasedBlob;
 import org.apache.jackrabbit.oak.plugins.memory.MemoryNodeStore;
 import org.apache.jackrabbit.oak.plugins.memory.PropertyStates;
 import org.apache.jackrabbit.oak.plugins.nodetype.TypeEditorProvider;
-import org.apache.jackrabbit.oak.InitialContent;
 import org.apache.jackrabbit.oak.plugins.nodetype.write.NodeTypeRegistry;
 import org.apache.jackrabbit.oak.query.AbstractQueryTest;
 import org.apache.jackrabbit.oak.spi.commit.CommitInfo;
@@ -2747,9 +2753,114 @@ public class LucenePropertyIndexTest ext
                 "lucene:test1(/oak:index/test1)", asList("/d"));
     }
 
-    private void assertPlanAndQuery(String query, String planExpectation, List<String> paths){
+    /**
+     * Stores the feature vectors from {@code fvs.csv} as binary {@code fv} properties
+     * and checks that {@code similar(...)} queries issued for consecutive nodes do not
+     * return the same result list.
+     *
+     * Each CSV line is read as {@code name,v1,v2,...}: the first column becomes the node
+     * name, the remaining columns are parsed as doubles.
+     *
+     * @throws Exception if the CSV resource cannot be read or a commit/query fails
+     */
+    @Test
+    public void testRepSimilarWithBinaryFeatureVectors() throws Exception {
+
+        IndexDefinitionBuilder idxb = new IndexDefinitionBuilder().noAsync();
+        idxb.indexRule("nt:base").property("fv").useInSimilarity().nodeScopeIndex().propertyIndex();
+
+        Tree idx = root.getTree("/").getChild("oak:index").addChild("test1");
+        idxb.build(idx);
+        root.commit();
+
+        Tree test = root.getTree("/").addChild("test");
+
+        URI uri = getClass().getResource("/org/apache/jackrabbit/oak/query/fvs.csv").toURI();
+        File file = new File(uri);
+
+        // Read with an explicit charset and close the stream once the lines are loaded.
+        List<String> lines;
+        try (InputStream in = new FileInputStream(file)) {
+            lines = IOUtils.readLines(in, Charsets.UTF_8);
+        }
+
+        Collection<String> children = new LinkedList<>();
+        for (String line : lines) {
+            String[] split = line.split(",");
+            List<Double> values = new LinkedList<>();
+            // column 0 is the node name, the vector starts at column 1
+            for (int i = 1; i < split.length; i++) {
+                values.add(Double.parseDouble(split[i]));
+            }
+
+            // the byte[] encoding must round-trip before it is stored
+            byte[] bytes = SimSearchUtils.toByteArray(values);
+            List<Double> actual = SimSearchUtils.toDoubles(bytes);
+            assertEquals(values, actual);
+
+            Blob blob = root.createBlob(new ByteArrayInputStream(bytes));
+            String name = split[0];
+            Tree child = test.addChild(name);
+            child.setProperty("fv", blob, Type.BINARY);
+            // remember the path; without this the query loop below never executes
+            children.add(child.getPath());
+        }
+        root.commit();
+
+        // fail loudly instead of passing vacuously when the fixture yields no nodes
+        assertTrue("no feature vectors were loaded from fvs.csv", !children.isEmpty());
+
+        // check that similarity changes across different feature vectors
+        List<String> baseline = new LinkedList<>();
+        for (String similarPath : children) {
+            String query = "select [jcr:path] from [nt:base] where similar(., '" + similarPath + "')";
+
+            List<String> current = new LinkedList<>();
+            for (String path : executeQuery(query, "JCR-SQL2")) {
+                current.add(path);
+            }
+            assertNotEquals(baseline, current);
+            baseline.clear();
+            baseline.addAll(current);
+        }
+
+    }
+
+    /**
+     * Stores the feature vectors from {@code fvs.csv} as string {@code fv} properties
+     * and checks that {@code similar(...)} queries issued for consecutive nodes do not
+     * return the same result list.
+     *
+     * Each CSV line is split at its first comma: the text before it becomes the node
+     * name, the text after it is stored unchanged as the property value.
+     *
+     * @throws Exception if the CSV resource cannot be read or a commit/query fails
+     */
+    @Test
+    public void testRepSimilarWithStringFeatureVectors() throws Exception {
+
+        IndexDefinitionBuilder idxb = new IndexDefinitionBuilder().noAsync();
+        idxb.indexRule("nt:base").property("fv").useInSimilarity().nodeScopeIndex().propertyIndex();
+
+        Tree idx = root.getTree("/").getChild("oak:index").addChild("test1");
+        idxb.build(idx);
+        root.commit();
+
+
+        Tree test = root.getTree("/").addChild("test");
+
+        URI uri = getClass().getResource("/org/apache/jackrabbit/oak/query/fvs.csv").toURI();
+        File file = new File(uri);
+
+        // Read with an explicit charset and close the stream once the lines are loaded.
+        List<String> lines;
+        try (InputStream in = new FileInputStream(file)) {
+            lines = IOUtils.readLines(in, Charsets.UTF_8);
+        }
+
+        Collection<String> children = new LinkedList<>();
+
+        for (String line : lines) {
+            int i1 = line.indexOf(',');
+            String name = line.substring(0, i1);
+            String value = line.substring(i1 + 1);
+            Tree child = test.addChild(name);
+            child.setProperty("fv", value, Type.STRING);
+            children.add(child.getPath());
+        }
+        root.commit();
+
+        // fail loudly instead of passing vacuously when the fixture yields no nodes
+        assertTrue("no feature vectors were loaded from fvs.csv", !children.isEmpty());
+
+        // check that similarity changes across different feature vectors
+        List<String> baseline = new LinkedList<>();
+        for (String similarPath : children) {
+            String query = "select [jcr:path] from [nt:base] where similar(., '" + similarPath + "')";
+
+            List<String> current = new LinkedList<>();
+            for (String path : executeQuery(query, "JCR-SQL2")) {
+                current.add(path);
+            }
+            assertNotEquals(baseline, current);
+            baseline.clear();
+            baseline.addAll(current);
+        }
+    }
+
+    /**
+     * Convenience overload: delegates to the four-argument variant, passing
+     * {@code false} for {@code ordered}.
+     */
+    private void assertPlanAndQuery(String query, String planExpectation, List<String> paths) {
+        assertPlanAndQuery(query, planExpectation, paths, false);
+    }
+
+    /**
+     * Asserts that the explain output of {@code query} contains {@code planExpectation},
+     * then hands the query, the expected {@code paths} and the {@code ordered} flag to
+     * {@code assertQuery} with the SQL2 language constant.
+     *
+     * NOTE(review): {@code ordered} presumably controls whether result order is compared;
+     * confirm against {@code assertQuery}.
+     */
+    private void assertPlanAndQuery(String query, String planExpectation, List<String> paths, boolean ordered) {
         assertThat(explain(query), containsString(planExpectation));
-        assertQuery(query, paths);
+        assertQuery(query, SQL2, paths, ordered);
     }
 
     private static Tree createNodeWithMixinType(Tree t, String nodeName, String typeName){