You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oak-commits@jackrabbit.apache.org by to...@apache.org on 2018/09/19 06:46:43 UTC
svn commit: r1841291 [1/3] - in /jackrabbit/oak/trunk/oak-search/src:
main/java/org/apache/jackrabbit/oak/plugins/index/search/
main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/
main/java/org/apache/jackrabbit/oak/plugins/index/searc...
Author: tommaso
Date: Wed Sep 19 06:46:42 2018
New Revision: 1841291
URL: http://svn.apache.org/viewvc?rev=1841291&view=rev
Log:
OAK-3336 - adjusted oak-search SPIs
Added:
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/FieldNames.java (with props)
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/BlobByteSource.java (with props)
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/FulltextBinaryTextExtractor.java (with props)
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/TextExtractionStats.java (with props)
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/TikaParserConfig.java (with props)
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/update/
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/update/IndexUpdateListener.java (with props)
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/update/ReaderRefreshPolicy.java (with props)
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/update/RefreshOnReadPolicy.java (with props)
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/update/RefreshOnWritePolicy.java (with props)
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/update/TimedRefreshPolicy.java (with props)
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/util/DataConversionUtil.java (with props)
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/util/IndexDefinitionUtils.java (with props)
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/util/NodeStateCloner.java (with props)
jackrabbit/oak/trunk/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/search/FieldNamesTest.java (with props)
jackrabbit/oak/trunk/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/search/spi/
jackrabbit/oak/trunk/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/
jackrabbit/oak/trunk/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/FulltextBinaryTextExtractorTest.java (with props)
jackrabbit/oak/trunk/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/TikaParserConfigTest.java (with props)
jackrabbit/oak/trunk/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/search/spi/query/
jackrabbit/oak/trunk/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/search/spi/query/FulltextIndexTest.java (with props)
jackrabbit/oak/trunk/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/search/update/
jackrabbit/oak/trunk/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/search/update/RecordingRunnable.java (with props)
jackrabbit/oak/trunk/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/search/update/RefreshOnReadPolicyTest.java (with props)
jackrabbit/oak/trunk/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/search/update/RefreshOnWritePolicyTest.java (with props)
jackrabbit/oak/trunk/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/search/update/TimedRefreshPolicyTest.java (with props)
jackrabbit/oak/trunk/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/search/util/NodeStateClonerTest.java (with props)
Removed:
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/IndexUpdateListener.java
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/NodeStateCloner.java
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/ReaderRefreshPolicy.java
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/RefreshOnReadPolicy.java
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/RefreshOnWritePolicy.java
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/TimedRefreshPolicy.java
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/editor/BlobByteSource.java
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/editor/FulltextBinaryTextExtractor.java
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/editor/TextExtractionStats.java
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/editor/TikaParserConfig.java
Modified:
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/Aggregate.java
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/BadIndexTracker.java
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/ExtractedTextCache.java
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/FulltextIndexConstants.java
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/IndexFormatVersion.java
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/IndexLookup.java
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/IndexNode.java
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/IndexStatistics.java
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/PropertyDefinition.java
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/ReindexOperations.java
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/SizeEstimator.java
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/TextExtractionStatsMBean.java
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/editor/FulltextDocumentMaker.java
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/editor/FulltextIndexEditor.java
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/editor/FulltextIndexEditorContext.java
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/editor/FulltextIndexWriterFactory.java
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/query/FulltextIndex.java
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/query/IndexNodeManager.java
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/util/ConfigUtil.java
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/util/FunctionIndexProcessor.java
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/util/IndexDefinitionBuilder.java
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/util/IndexHelper.java
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/util/NodeStateCopyUtils.java
jackrabbit/oak/trunk/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/search/AggregateTest.java
jackrabbit/oak/trunk/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/search/ExtractedTextCacheTest.java
jackrabbit/oak/trunk/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/search/IndexDefinitionTest.java
jackrabbit/oak/trunk/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/search/util/NodeStateCopyUtilsTest.java
Modified: jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/Aggregate.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/Aggregate.java?rev=1841291&r1=1841290&r2=1841291&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/Aggregate.java (original)
+++ jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/Aggregate.java Wed Sep 19 06:46:42 2018
@@ -46,6 +46,15 @@ import static com.google.common.collect.
import static org.apache.jackrabbit.oak.commons.PathUtils.elements;
import static org.apache.jackrabbit.oak.commons.PathUtils.getParentPath;
+/**
+ * Aggregates text from child nodes for fulltext queries.
+ *
+ * Example: let's say node /x is of type 'web page', but the actual content is
+ * stored in child nodes; say /x/section1 contains "Hello" and /x/section2
+ * contains "World". If index aggregation is configured correctly, it will
+ * combine all the text of the child nodes, and index that as /x. When doing a
+ * fulltext search for "Hello World", the index will then return /x.
+ */
public class Aggregate {
public static final String MATCH_ALL = "*";
@@ -56,7 +65,7 @@ public class Aggregate {
public static final int RECURSIVE_AGGREGATION_LIMIT_DEFAULT = 5;
private final String nodeTypeName;
private final List<? extends Include> includes;
- final int reAggregationLimit;
+ public final int reAggregationLimit;
private final List<NodeInclude> relativeNodeIncludes;
private final boolean nodeAggregates;
@@ -64,7 +73,7 @@ public class Aggregate {
this(nodeTypeName, Collections.<Include>emptyList());
}
- Aggregate(String nodeTypeName, List<? extends Include> includes) {
+ public Aggregate(String nodeTypeName, List<? extends Include> includes) {
this(nodeTypeName, includes, RECURSIVE_AGGREGATION_LIMIT_DEFAULT);
}
@@ -137,7 +146,7 @@ public class Aggregate {
}
private static void collectAggregatesForDirectMatchers(NodeState nodeState, List<Matcher> matchers,
- ResultCollector collector) {
+ ResultCollector collector) {
Map<String, ChildNodeEntry> children = Maps.newHashMap();
//Collect potentially matching child nodestates based on matcher name
for (Matcher m : matchers){
@@ -151,7 +160,7 @@ public class Aggregate {
}
private static void collectAggregatesForPatternMatchers(NodeState nodeState, List<Matcher> matchers,
- ResultCollector collector) {
+ ResultCollector collector) {
matchChildren(matchers, collector, nodeState.getChildNodeEntries());
}
@@ -214,14 +223,14 @@ public class Aggregate {
});
}
- public static interface AggregateMapper {
+ public interface AggregateMapper {
@Nullable
Aggregate getAggregate(String nodeTypeName);
}
//~-----------------------------------------------------< Includes >
- public static abstract class Include<T> {
+ public static abstract class Include {
protected final String[] elements;
public Include(String pattern) {
@@ -232,18 +241,15 @@ public class Aggregate {
String element = elements[depth];
if (MATCH_ALL.equals(element)) {
return true;
- } else if (element.equals(name)) {
- return true;
- }
- return false;
+ } else return element.equals(name);
}
public int maxDepth() {
return elements.length;
}
- public void collectResults(T rootInclude, String rootIncludePath,
- String nodePath, NodeState nodeState, ResultCollector results) {
+ public void collectResults(Include rootInclude, String rootIncludePath,
+ String nodePath, NodeState nodeState, ResultCollector results) {
collectResults(nodePath, nodeState, results);
}
@@ -271,9 +277,9 @@ public class Aggregate {
}
}
- public static class NodeInclude extends Include<NodeInclude> {
- final String primaryType;
- final boolean relativeNode;
+ public static class NodeInclude extends Include {
+ public final String primaryType;
+ public final boolean relativeNode;
private final String pattern;
private final AggregateMapper aggMapper;
@@ -302,9 +308,13 @@ public class Aggregate {
}
@Override
- public void collectResults(NodeInclude rootInclude, String rootIncludePath, String nodePath,
+ public void collectResults(Include include, String rootIncludePath, String nodePath,
NodeState nodeState, ResultCollector results) {
//For supporting jcr:contains(jcr:content, 'foo')
+ if (!(include instanceof NodeInclude)) {
+ throw new IllegalArgumentException("" + include);
+ }
+ NodeInclude rootInclude = (NodeInclude) include;
if (rootInclude.relativeNode){
results.onResult(new NodeIncludeResult(nodePath, rootIncludePath, nodeState));
}
@@ -364,7 +374,7 @@ public class Aggregate {
}
}
- public static class PropertyInclude extends Include<PropertyInclude> {
+ public static class PropertyInclude extends Include {
private final PropertyDefinition propertyDefinition;
private final String propertyName;
private final Pattern pattern;
@@ -417,7 +427,7 @@ public class Aggregate {
}
}
- public static interface ResultCollector {
+ public interface ResultCollector {
void onResult(NodeIncludeResult result);
void onResult(PropertyIncludeResult result);
Modified: jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/BadIndexTracker.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/BadIndexTracker.java?rev=1841291&r1=1841290&r2=1841291&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/BadIndexTracker.java (original)
+++ jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/BadIndexTracker.java Wed Sep 19 06:46:42 2018
@@ -30,6 +30,16 @@ import com.google.common.collect.Maps;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+/**
+ * Keeps track of bad (corrupt) indexes.
+ *
+ * An index can be corrupt for reads (an exception was thrown when index was
+ * opened for query), and persistent (an exception was thrown when index is
+ * reopened after an update).
+ *
+ * Indexes marked bad for reads might become good again later, if another
+ * cluster node fixed the corruption (eg. by reindexing).
+ */
public class BadIndexTracker {
/**
* Time interval in millis after which a bad index would be accessed again
@@ -117,15 +127,15 @@ public class BadIndexTracker {
return badIndexesForRead.keySet();
}
- BadIndexInfo getInfo(String indexPath){
+ public BadIndexInfo getInfo(String indexPath){
return badIndexesForRead.get(indexPath);
}
- Set<String> getBadPersistedIndexPaths() {
+ public Set<String> getBadPersistedIndexPaths() {
return badPersistedIndexes.keySet();
}
- BadIndexInfo getPersistedIndexInfo(String indexPath){
+ public BadIndexInfo getPersistedIndexInfo(String indexPath){
return badPersistedIndexes.get(indexPath);
}
@@ -133,7 +143,7 @@ public class BadIndexTracker {
return recheckIntervalMillis;
}
- void setTicker(Ticker ticker) {
+ public void setTicker(Ticker ticker) {
this.ticker = ticker;
}
@@ -141,8 +151,8 @@ public class BadIndexTracker {
return !(badIndexesForRead.isEmpty() && badPersistedIndexes.isEmpty());
}
- class BadIndexInfo {
- final String path;
+ public class BadIndexInfo {
+ public final String path;
final int lastIndexerCycleCount = indexerCycleCount;
private final long createdTime = TimeUnit.NANOSECONDS.toMillis(ticker.read());
private final boolean persistedIndex;
Modified: jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/ExtractedTextCache.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/ExtractedTextCache.java?rev=1841291&r1=1841290&r2=1841291&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/ExtractedTextCache.java (original)
+++ jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/ExtractedTextCache.java Wed Sep 19 06:46:42 2018
@@ -54,6 +54,10 @@ import org.slf4j.LoggerFactory;
import static org.apache.jackrabbit.oak.commons.PathUtils.concat;
+/**
+ * A cache to avoid extracting text of binaries that were already processed (in
+ * a different node that references the same binary).
+ */
public class ExtractedTextCache {
private static final boolean CACHE_ONLY_SUCCESS =
Boolean.getBoolean("oak.extracted.cacheOnlySuccess");
@@ -75,7 +79,10 @@ public class ExtractedTextCache {
private long totalTextSize;
private long totalTime;
private int preFetchedCount;
+
+ // the actual cache. key: content id, value: extracted text
private final Cache<String, String> cache;
+
private final ConcurrentHashMap<String, String> timeoutMap;
private final File indexDir;
private final CacheStats cacheStats;
@@ -104,7 +111,7 @@ public class ExtractedTextCache {
cacheStats = null;
}
this.alwaysUsePreExtractedCache = alwaysUsePreExtractedCache;
- this.timeoutMap = new ConcurrentHashMap<String, String>();
+ this.timeoutMap = new ConcurrentHashMap<>();
this.indexDir = indexDir;
loadTimeoutMap();
}
@@ -241,13 +248,13 @@ public class ExtractedTextCache {
return extractedTextProvider;
}
- void resetCache(){
+ public void resetCache(){
if (cache != null){
cache.invalidateAll();
}
}
- boolean isAlwaysUsePreExtractedCache() {
+ public boolean isAlwaysUsePreExtractedCache() {
return alwaysUsePreExtractedCache;
}
@@ -283,7 +290,7 @@ public class ExtractedTextCache {
closeExecutorService();
}
- public void process(String name, Callable<Void> callable) throws InterruptedException, Throwable {
+ public void process(String name, Callable<Void> callable) throws Throwable {
Callable<Void> callable2 = new Callable<Void>() {
@Override
public Void call() throws Exception {
@@ -305,9 +312,7 @@ public class ExtractedTextCache {
future.get(extractionTimeoutMillis, TimeUnit.MILLISECONDS);
}
} catch (TimeoutException e) {
- timeoutCount++;
- throw e;
- } catch (InterruptedException e) {
+ timeoutCount++; // TODO : use AtomicInteger ? this is a non-atomic operation on a volatile field
throw e;
} catch (ExecutionException e) {
throw e.getCause();
@@ -332,7 +337,7 @@ public class ExtractedTextCache {
log.debug("ExtractedTextCache createExecutor " + this);
ThreadPoolExecutor executor = new ThreadPoolExecutor(1, EXTRACTION_MAX_THREADS,
60L, TimeUnit.SECONDS,
- new LinkedBlockingQueue<Runnable>(), new ThreadFactory() {
+ new LinkedBlockingQueue<>(), new ThreadFactory() {
private final AtomicInteger counter = new AtomicInteger();
private final Thread.UncaughtExceptionHandler handler = new Thread.UncaughtExceptionHandler() {
@Override
Added: jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/FieldNames.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/FieldNames.java?rev=1841291&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/FieldNames.java (added)
+++ jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/FieldNames.java Wed Sep 19 06:46:42 2018
@@ -0,0 +1,151 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.oak.plugins.index.search;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * Defines field names that are used internally to store data in the
+ * search index.
+ */
+public final class FieldNames {
+
+
+ /**
+ * Private constructor.
+ */
+ private FieldNames() {
+ }
+
+ /**
+ * Name of the field that contains the {@value} property of the node.
+ */
+ public static final String PATH = ":path";
+
+ /**
+ * Name of the field that contains all the path hierarchy e.g. for /a/b/c
+ * it would contain /a, /a/b, /a/b/c
+ */
+ public static final String ANCESTORS = ":ancestors";
+
+ /**
+ * Name of the field which refers to the depth of path
+ */
+ public static final String PATH_DEPTH = ":depth";
+
+ /**
+ * Name of the field that contains the fulltext index.
+ */
+ public static final String FULLTEXT = ":fulltext";
+
+ /**
+ * Prefix of the fields that contain the similarity search indexed tokens.
+ */
+ private static final String SIMILARITY_PREFIX = "sim:";
+
+ /**
+ * Name of the field that contains the suggest index.
+ */
+ public static final String SUGGEST = ":suggest";
+
+ /**
+ * Name of the field that contains the spellcheck index.
+ */
+ public static final String SPELLCHECK = ":spellcheck";
+
+ /**
+ * Prefix for all field names that are fulltext indexed by property name.
+ */
+ public static final String ANALYZED_FIELD_PREFIX = "full:";
+
+ /**
+ * Prefix used for storing fulltext of relative node
+ */
+ public static final String FULLTEXT_RELATIVE_NODE = "fullnode:";
+
+ /**
+ * Name of the field that contains those property names which are not found
+ * (or were null) for the given NodeState
+ */
+ public static final String NULL_PROPS = ":nullProps";
+
+ /**
+ * Name of the field that contains those property names which exist i.e. are not null
+ * for the given NodeState
+ */
+ public static final String NOT_NULL_PROPS = ":notNullProps";
+
+ /**
+ * Name of the field that contains the node name
+ */
+ public static final String NODE_NAME = ":nodeName";
+
+ /**
+ * Prefix of the fields that contain function values
+ */
+ public static final String FUNCTION_PREFIX = "function*";
+
+ /**
+ * Used to select only the PATH field from the lucene documents
+ */
+ public static final Set<String> PATH_SELECTOR = new HashSet<>(
+ Collections.singletonList(PATH));
+
+ /**
+ * Encodes the field name such that it can be used for storing DocValue.
+ * This is done so that a field, if used for both sorting and querying, uses
+ * a different name for the docvalue field.
+ *
+ * @param name name to encode
+ * @return encoded field name
+ */
+ public static String createDocValFieldName(String name){
+ return ":dv" + name;
+ }
+
+ public static String createAnalyzedFieldName(String pname) {
+ return ANALYZED_FIELD_PREFIX + pname;
+ }
+
+ public static String createFulltextFieldName(String nodeRelativePath) {
+ if (nodeRelativePath == null){
+ return FULLTEXT;
+ }
+ return FULLTEXT_RELATIVE_NODE + nodeRelativePath;
+ }
+
+ public static String createFacetFieldName(String pname) {
+ return pname + "_facet";
+ }
+
+ /**
+ * @return {@code true} if {@code field} represents indexed property data
+ */
+ public static boolean isPropertyField(String field) {
+ return !field.startsWith(ANALYZED_FIELD_PREFIX)
+ && !field.startsWith(FULLTEXT_RELATIVE_NODE)
+ && !field.startsWith(":")
+ && !field.endsWith("_facet");
+ }
+
+ public static String createSimilarityFieldName(String name) {
+ return SIMILARITY_PREFIX + name;
+ }
+}
Propchange: jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/FieldNames.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/FulltextIndexConstants.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/FulltextIndexConstants.java?rev=1841291&r1=1841290&r2=1841291&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/FulltextIndexConstants.java (original)
+++ jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/FulltextIndexConstants.java Wed Sep 19 06:46:42 2018
@@ -16,19 +16,19 @@
*/
package org.apache.jackrabbit.oak.plugins.index.search;
-
+/**
+ * Internal constants used in index definition, and index implementations.
+ */
public interface FulltextIndexConstants {
- enum IndexingMode {
- SYNC,
- NRT,
- ASYNC;
+ enum IndexingMode {
+ SYNC, NRT, ASYNC;
- public String asyncValueName(){
+ public String asyncValueName() {
return name().toLowerCase();
}
- public static IndexingMode from(String indexingMode){
+ public static IndexingMode from(String indexingMode) {
return valueOf(indexingMode.toUpperCase());
}
}
@@ -192,6 +192,7 @@ public interface FulltextIndexConstants
/**
* Config node which include Tika related configuration
+ * Its value should match {@link FieldNames#NODE_NAME}
*/
String TIKA = "tika";
@@ -248,6 +249,11 @@ public interface FulltextIndexConstants
*/
String PROP_USE_IN_SPELLCHECK = "useInSpellcheck";
+ /**
+ * Whether to use this property's values for similarity
+ */
+ String PROP_USE_IN_SIMILARITY = "useInSimilarity";
+
/**
* Property definition config indicating that null check support should be
* enabled for this property
@@ -290,6 +296,12 @@ public interface FulltextIndexConstants
String COMPAT_MODE = "compatVersion";
/**
+ * Optional (index definition) property indicating whether facets should be ACL checked.
+ * Default is true
+ */
+ String PROP_SECURE_FACETS = "secure";
+
+ /**
* Optional (index definition) property indicating max number of facets that will be retrieved
* in query
* Default is {@link IndexDefinition#DEFAULT_FACET_COUNT}
Modified: jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/IndexFormatVersion.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/IndexFormatVersion.java?rev=1841291&r1=1841290&r2=1841291&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/IndexFormatVersion.java (original)
+++ jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/IndexFormatVersion.java Wed Sep 19 06:46:42 2018
@@ -19,6 +19,11 @@
package org.apache.jackrabbit.oak.plugins.index.search;
+/**
+ * The version of an index (property "compatVersion").
+ *
+ * The default is version 2. Version 1 is supported for backward compatibility.
+ */
public enum IndexFormatVersion {
/**
* Index confirming to Oak version upto 1.0.8
Modified: jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/IndexLookup.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/IndexLookup.java?rev=1841291&r1=1841290&r2=1841291&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/IndexLookup.java (original)
+++ jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/IndexLookup.java Wed Sep 19 06:46:42 2018
@@ -21,6 +21,7 @@ package org.apache.jackrabbit.oak.plugin
import java.util.Collection;
import java.util.Set;
+import java.util.function.Predicate;
import com.google.common.collect.Sets;
import org.apache.jackrabbit.oak.commons.PathUtils;
@@ -29,30 +30,38 @@ import org.apache.jackrabbit.oak.spi.sta
import org.apache.jackrabbit.oak.spi.state.NodeState;
import static org.apache.jackrabbit.oak.plugins.index.IndexConstants.INDEX_DEFINITIONS_NAME;
-import static org.apache.jackrabbit.oak.plugins.index.IndexConstants.TYPE_PROPERTY_NAME;
+/**
+ * Allows to check which indexes can possibly be used for a certain query.
+ *
+ * For example, for a query of the form "/jcr:root/content//*", the indexes
+ * under "/" and the indexes under "/content" can be used.
+ */
public class IndexLookup {
+
private final NodeState root;
+ private final Predicate<NodeState> definitionPredicate;
- public IndexLookup(NodeState root) {
+ public IndexLookup(NodeState root, Predicate<NodeState> definitionPredicate) {
this.root = root;
+ this.definitionPredicate = definitionPredicate;
}
- public Collection<String> collectIndexNodePaths(Filter filter, String type){
- return collectIndexNodePaths(filter, type, true);
+ public Collection<String> collectIndexNodePaths(Filter filter) {
+ return collectIndexNodePaths(filter, true);
}
- private Collection<String> collectIndexNodePaths(Filter filter, String type, boolean recurse){
+ public Collection<String> collectIndexNodePaths(Filter filter, boolean recurse) {
Set<String> paths = Sets.newHashSet();
- collectIndexNodePaths(root, type, "/", paths);
+ collectIndexNodePaths(root, "/", paths);
if (recurse) {
StringBuilder sb = new StringBuilder();
NodeState nodeState = root;
for (String element : PathUtils.elements(filter.getPath())) {
nodeState = nodeState.getChildNode(element);
- collectIndexNodePaths(nodeState, type,
+ collectIndexNodePaths(nodeState,
sb.append("/").append(element).toString(),
paths);
}
@@ -61,20 +70,15 @@ public class IndexLookup {
return paths;
}
- public static void collectIndexNodePaths(NodeState nodeState, String type, String parentPath, Collection<String> paths) {
+ private void collectIndexNodePaths(NodeState nodeState, String parentPath, Collection<String> paths) {
NodeState state = nodeState.getChildNode(INDEX_DEFINITIONS_NAME);
for (ChildNodeEntry entry : state.getChildNodeEntries()) {
- if (isIndexOfType(entry.getNodeState(), type)) {
+ if (definitionPredicate.test(entry.getNodeState())) {
paths.add(createIndexNodePath(parentPath, entry.getName()));
}
}
}
- private static boolean isIndexOfType(NodeState nodeState, String type) {
- return type.equals(nodeState.getString(TYPE_PROPERTY_NAME));
- }
-
-
private static String createIndexNodePath(String parentPath, String name){
return PathUtils.concat(parentPath, INDEX_DEFINITIONS_NAME, name);
}
Modified: jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/IndexNode.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/IndexNode.java?rev=1841291&r1=1841290&r2=1841291&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/IndexNode.java (original)
+++ jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/IndexNode.java Wed Sep 19 06:46:42 2018
@@ -19,6 +19,13 @@
package org.apache.jackrabbit.oak.plugins.index.search;
+/**
+ * Represents an instance of an index.
+ *
+ * It is typically acquired during the planning and execution phases of a
+ * query, and released afterwards. This allows an implementation to re-use
+ * resources (e.g. keep files open).
+ */
public interface IndexNode {
void release();
Modified: jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/IndexStatistics.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/IndexStatistics.java?rev=1841291&r1=1841290&r2=1841291&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/IndexStatistics.java (original)
+++ jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/IndexStatistics.java Wed Sep 19 06:46:42 2018
@@ -19,7 +19,7 @@
package org.apache.jackrabbit.oak.plugins.index.search;
/**
- *
+ * Reports index statistics (for example, how many entries the index contains).
*/
public interface IndexStatistics {
int numDocs();
Modified: jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/PropertyDefinition.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/PropertyDefinition.java?rev=1841291&r1=1841290&r2=1841291&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/PropertyDefinition.java (original)
+++ jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/PropertyDefinition.java Wed Sep 19 06:46:42 2018
@@ -24,9 +24,9 @@ import javax.jcr.PropertyType;
import org.apache.jackrabbit.oak.api.PropertyState;
import org.apache.jackrabbit.oak.api.Type;
import org.apache.jackrabbit.oak.commons.PathUtils;
+import org.apache.jackrabbit.oak.plugins.index.property.ValuePattern;
import org.apache.jackrabbit.oak.plugins.index.search.IndexDefinition.IndexingRule;
import org.apache.jackrabbit.oak.plugins.index.search.util.FunctionIndexProcessor;
-import org.apache.jackrabbit.oak.plugins.index.property.ValuePattern;
import org.apache.jackrabbit.oak.plugins.index.search.util.IndexHelper;
import org.apache.jackrabbit.oak.spi.state.NodeState;
import org.jetbrains.annotations.Nullable;
@@ -40,6 +40,7 @@ import static org.apache.jackrabbit.oak.
import static org.apache.jackrabbit.oak.plugins.index.search.FulltextIndexConstants.FIELD_BOOST;
import static org.apache.jackrabbit.oak.plugins.index.search.FulltextIndexConstants.PROP_IS_REGEX;
import static org.apache.jackrabbit.oak.plugins.index.search.FulltextIndexConstants.PROP_WEIGHT;
+import static org.apache.jackrabbit.oak.plugins.index.search.spi.query.FulltextIndexPlanner.DEFAULT_PROPERTY_WEIGHT;
import static org.apache.jackrabbit.oak.plugins.index.search.util.ConfigUtil.getOptionalValue;
public class PropertyDefinition {
@@ -58,19 +59,19 @@ public class PropertyDefinition {
* property etc then it should be defined via 'name' property in NodeState.
* In such case NodeState name can be set to anything
*/
- final String name;
+ public final String name;
private final int propertyType;
/**
* The boost value for a property.
*/
- final float boost;
+ public final float boost;
- final boolean isRegexp;
+ public final boolean isRegexp;
public final boolean index;
- final boolean stored;
+ public final boolean stored;
public final boolean nodeScopeIndex;
@@ -82,7 +83,7 @@ public class PropertyDefinition {
public final boolean nullCheckEnabled;
- final boolean notNullCheckEnabled;
+ public final boolean notNullCheckEnabled;
final int includedPropertyTypes;
@@ -123,12 +124,14 @@ public class PropertyDefinition {
public final boolean unique;
+ public final boolean useInSimilarity;
+
public PropertyDefinition(IndexingRule idxDefn, String nodeName, NodeState defn) {
this.isRegexp = getOptionalValue(defn, PROP_IS_REGEX, false);
this.name = getName(defn, nodeName);
this.relative = isRelativeProperty(name);
this.boost = getOptionalValue(defn, FIELD_BOOST, DEFAULT_BOOST);
- this.weight = getOptionalValue(defn, PROP_WEIGHT, 5);
+ this.weight = getOptionalValue(defn, PROP_WEIGHT, DEFAULT_PROPERTY_WEIGHT);
//By default if a property is defined it is indexed
this.index = getOptionalValue(defn, FulltextIndexConstants.PROP_INDEX, true);
@@ -151,6 +154,7 @@ public class PropertyDefinition {
this.propertyType = getPropertyType(idxDefn, nodeName, defn);
this.useInSuggest = getOptionalValueIfIndexed(defn, FulltextIndexConstants.PROP_USE_IN_SUGGEST, false);
this.useInSpellcheck = getOptionalValueIfIndexed(defn, FulltextIndexConstants.PROP_USE_IN_SPELLCHECK, false);
+ this.useInSimilarity = getOptionalValueIfIndexed(defn, FulltextIndexConstants.PROP_USE_IN_SIMILARITY, false);
this.nullCheckEnabled = getOptionalValueIfIndexed(defn, FulltextIndexConstants.PROP_NULL_CHECK_ENABLED, false);
this.notNullCheckEnabled = getOptionalValueIfIndexed(defn, FulltextIndexConstants.PROP_NOT_NULL_CHECK_ENABLED, false);
this.excludeFromAggregate = getOptionalValueIfIndexed(defn, FulltextIndexConstants.PROP_EXCLUDE_FROM_AGGREGATE, false);
Modified: jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/ReindexOperations.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/ReindexOperations.java?rev=1841291&r1=1841290&r2=1841291&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/ReindexOperations.java (original)
+++ jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/ReindexOperations.java Wed Sep 19 06:46:42 2018
@@ -19,6 +19,7 @@
package org.apache.jackrabbit.oak.plugins.index.search;
+import org.apache.jackrabbit.oak.plugins.index.search.util.NodeStateCloner;
import org.apache.jackrabbit.oak.spi.state.NodeBuilder;
import org.apache.jackrabbit.oak.spi.state.NodeState;
@@ -29,11 +30,14 @@ public class ReindexOperations {
private final NodeState root;
private final NodeBuilder definitionBuilder;
private final String indexPath;
+ private final IndexDefinition.Builder indexDefBuilder;
- public ReindexOperations(NodeState root, NodeBuilder definitionBuilder, String indexPath) {
+ public ReindexOperations(NodeState root, NodeBuilder definitionBuilder, String indexPath,
+ IndexDefinition.Builder indexDefBuilder) {
this.root = root;
this.definitionBuilder = definitionBuilder;
this.indexPath = indexPath;
+ this.indexDefBuilder = indexDefBuilder;
}
public IndexDefinition apply(boolean useStateFromBuilder) {
@@ -50,8 +54,10 @@ public class ReindexOperations {
String uid = configureUniqueId(definitionBuilder);
//Refresh the index definition based on update builder state
- return IndexDefinition
- .newBuilder(root, defnState, indexPath)
+ return indexDefBuilder
+ .root(root)
+ .defn(defnState)
+ .indexPath(indexPath)
.version(version)
.uid(uid)
.reindex()
Modified: jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/SizeEstimator.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/SizeEstimator.java?rev=1841291&r1=1841290&r2=1841291&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/SizeEstimator.java (original)
+++ jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/SizeEstimator.java Wed Sep 19 06:46:42 2018
@@ -18,13 +18,17 @@
*/
package org.apache.jackrabbit.oak.plugins.index.search;
+/**
+ * A size estimator, for example to estimate how many entries a result will
+ * have.
+ */
public interface SizeEstimator {
/**
* Get the estimated size, or -1 if not known.
- *
+ *
* @return the size
*/
long getSize();
-
+
}
Modified: jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/TextExtractionStatsMBean.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/TextExtractionStatsMBean.java?rev=1841291&r1=1841290&r2=1841291&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/TextExtractionStatsMBean.java (original)
+++ jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/TextExtractionStatsMBean.java Wed Sep 19 06:46:42 2018
@@ -21,6 +21,9 @@ package org.apache.jackrabbit.oak.plugin
import org.osgi.annotation.versioning.ProviderType;
+/**
+ * An MBean for text extraction statistics.
+ */
@ProviderType
public interface TextExtractionStatsMBean {
String TYPE = "TextExtractionStats";
Added: jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/BlobByteSource.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/BlobByteSource.java?rev=1841291&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/BlobByteSource.java (added)
+++ jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/BlobByteSource.java Wed Sep 19 06:46:42 2018
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.plugins.index.search.spi.binary;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import com.google.common.io.ByteSource;
+import org.apache.jackrabbit.oak.api.Blob;
+
+/**
+ * {@link ByteSource} extension to work with Oak {@link Blob}s
+ */
+public final class BlobByteSource extends ByteSource {
+
+    /** The blob whose content is exposed through this byte source. */
+    private final Blob blob;
+
+    /**
+     * Creates a byte source backed by the given blob.
+     *
+     * @param blob the blob to read from
+     */
+    public BlobByteSource(Blob blob) {
+        this.blob = blob;
+    }
+
+    /**
+     * @return the length reported by the underlying blob
+     */
+    @Override
+    public long size() throws IOException {
+        return blob.length();
+    }
+
+    /**
+     * @return {@code true} if the underlying blob reports a length of zero
+     */
+    @Override
+    public boolean isEmpty() throws IOException {
+        // the class is final, so size() always resolves to blob.length()
+        return size() == 0;
+    }
+
+    /**
+     * @return a stream obtained from the underlying blob via {@code getNewStream()}
+     */
+    @Override
+    public InputStream openStream() throws IOException {
+        return blob.getNewStream();
+    }
+}
Propchange: jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/BlobByteSource.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/FulltextBinaryTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/FulltextBinaryTextExtractor.java?rev=1841291&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/FulltextBinaryTextExtractor.java (added)
+++ jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/FulltextBinaryTextExtractor.java Wed Sep 19 06:46:42 2018
@@ -0,0 +1,344 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.jackrabbit.oak.plugins.index.search.spi.binary;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.util.Collections;
+import java.util.List;
+import java.util.Set;
+import java.util.concurrent.Callable;
+import java.util.concurrent.TimeoutException;
+
+import com.google.common.collect.Lists;
+import com.google.common.io.CountingInputStream;
+import org.apache.commons.io.IOUtils;
+import org.apache.jackrabbit.JcrConstants;
+import org.apache.jackrabbit.oak.api.Blob;
+import org.apache.jackrabbit.oak.api.PropertyState;
+import org.apache.jackrabbit.oak.api.Type;
+import org.apache.jackrabbit.oak.commons.io.LazyInputStream;
+import org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText;
+import org.apache.jackrabbit.oak.plugins.index.search.ExtractedTextCache;
+import org.apache.jackrabbit.oak.plugins.index.search.IndexDefinition;
+import org.apache.jackrabbit.oak.plugins.index.search.spi.editor.FulltextIndexEditorContext;
+import org.apache.jackrabbit.oak.spi.state.NodeState;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.WriteOutContentHandler;
+import org.jetbrains.annotations.Nullable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.SAXException;
+
+import static org.apache.jackrabbit.JcrConstants.JCR_DATA;
+import static org.apache.jackrabbit.oak.plugins.index.search.spi.editor.FulltextIndexEditor.TEXT_EXTRACTION_ERROR;
+
+/**
+ * Extracts plain text from binary property values with Apache Tika so that
+ * the text can be added to a fulltext index. Previously extracted text is
+ * looked up in, and new results are stored to, the supplied
+ * {@link ExtractedTextCache}.
+ */
+public class FulltextBinaryTextExtractor {
+
+ private static final Logger log = LoggerFactory.getLogger(FulltextBinaryTextExtractor.class);
+ // Shared parser, created once; used for every index that has no custom Tika config
+ private static final Parser defaultParser = createDefaultParser();
+ // Binaries longer than this many bytes are parsed through extractedTextCache.process(...)
+ // rather than inline; defaults to 16 KB, overridable via the "oak.search.smallBinary" system property
+ private static final long SMALL_BINARY = Long.getLong("oak.search.smallBinary", 16 * 1024);
+ private final TextExtractionStats textExtractionStats = new TextExtractionStats();
+ private final ExtractedTextCache extractedTextCache;
+ private final IndexDefinition definition;
+ // Passed through to extractedTextCache.get(...) on every lookup
+ private final boolean reindex;
+ // Lazily initialized by getParser()
+ private Parser parser;
+ // Lazily initialized by getTikaConfig()
+ private TikaConfigHolder tikaConfig;
+ /**
+ * The media types supported by the parser used.
+ */
+ private Set<MediaType> supportedMediaTypes;
+ // Media types excluded from indexing by the Tika config; initialized together with supportedMediaTypes
+ private Set<MediaType> nonIndexedMediaType;
+
+ /**
+ * @param extractedTextCache cache consulted before, and updated after, each extraction
+ * @param definition index definition supplying the Tika config, mime type mapping and max extract length
+ * @param reindex flag forwarded to the cache on lookups
+ */
+ public FulltextBinaryTextExtractor(ExtractedTextCache extractedTextCache, IndexDefinition definition, boolean reindex) {
+ this.extractedTextCache = extractedTextCache;
+ this.definition = definition;
+ this.reindex = reindex;
+ }
+
+ /**
+ * Logs the accumulated extraction stats and hands them to the cache.
+ * Note that the parameter shadows the field of the same name; only the
+ * argument is used here.
+ */
+ public void done(boolean reindex){
+ textExtractionStats.log(reindex);
+ textExtractionStats.collectStats(extractedTextCache);
+ }
+
+ /**
+ * Extracts text from every binary value of the given property.
+ *
+ * @param property the binary property to extract text from
+ * @param state the node holding the property; its jcr:mimeType (and, for jcr:data, jcr:encoding) is read
+ * @param path path of the node, used for cache lookups and log messages
+ * @return the extracted texts, or an empty list when the mime type is missing or not supported
+ */
+ public List<String> newBinary(
+ PropertyState property, NodeState state, String path) {
+ List<String> values = Lists.newArrayList();
+ // a single Metadata instance is shared by all values of the property
+ Metadata metadata = new Metadata();
+
+ //jcr:mimeType is mandatory for a binary to be indexed
+ String type = state.getString(JcrConstants.JCR_MIMETYPE);
+ type = definition.getTikaMappedMimeType(type);
+
+ if (type == null || !isSupportedMediaType(type)) {
+ log.trace(
+ "[{}] Ignoring binary content for node {} due to unsupported (or null) jcr:mimeType [{}]",
+ getIndexName(), path, type);
+ return values;
+ }
+
+ metadata.set(Metadata.CONTENT_TYPE, type);
+ // the encoding hint is only applied when the property is jcr:data
+ if (JCR_DATA.equals(property.getName())) {
+ String encoding = state.getString(JcrConstants.JCR_ENCODING);
+ if (encoding != null) { // not mandatory
+ metadata.set(Metadata.CONTENT_ENCODING, encoding);
+ }
+ }
+
+ for (Blob v : property.getValue(Type.BINARIES)) {
+ String value = parseStringValue(v, metadata, path, property.getName());
+ if (value == null){
+ continue;
+ }
+
+ values.add(value);
+ }
+ return values;
+ }
+
+ /**
+ * Returns the cached text for the blob if the cache has one, otherwise
+ * runs the actual extraction.
+ */
+ private String parseStringValue(Blob v, Metadata metadata, String path, String propertyName) {
+ String text = extractedTextCache.get(path, propertyName, v, reindex);
+ if (text == null){
+ text = parseStringValue0(v, metadata, path);
+ }
+ return text;
+ }
+
+ /**
+ * Parses the blob with Tika and stores the outcome in the cache.
+ *
+ * @return the extracted text, or {@code TEXT_EXTRACTION_ERROR} when parsing failed
+ */
+ private String parseStringValue0(Blob v, Metadata metadata, String path) {
+ // the handler stops collecting once the configured max extract length is reached
+ WriteOutContentHandler handler = new WriteOutContentHandler(definition.getMaxExtractLength());
+ long start = System.currentTimeMillis();
+ long bytesRead = 0;
+ long length = v.length();
+ if (log.isDebugEnabled()) {
+ log.debug("Extracting {}, {} bytes, id {}", path, length, v.getContentIdentity());
+ }
+ try {
+ // counting wrapper so the number of bytes actually consumed can be reported
+ CountingInputStream stream = new CountingInputStream(new LazyInputStream(new BlobByteSource(v)));
+ try {
+ if (length > SMALL_BINARY) {
+ // larger binaries are delegated to the cache's process(...) method
+ // NOTE(review): presumably this is what enforces the timeout handled below - confirm in ExtractedTextCache
+ String name = "Extracting " + path + ", " + length + " bytes";
+ extractedTextCache.process(name, new Callable<Void>() {
+ @Override
+ public Void call() throws Exception {
+ getParser().parse(stream, handler, metadata, new ParseContext());
+ return null;
+ }
+ });
+ } else {
+ getParser().parse(stream, handler, metadata, new ParseContext());
+ }
+ } finally {
+ // capture the byte count even when parsing failed, then release the stream
+ bytesRead = stream.getCount();
+ stream.close();
+ }
+ } catch (LinkageError e) {
+ // Capture errors caused by extraction libraries
+ // not being present. This is equivalent to disabling
+ // selected media types in configuration, so we can simply
+ // ignore these errors.
+ log.debug(
+ "[{}] Failed to extract text from a binary property: {}."
+ + " This often happens when some media types are disabled by configuration."
+ + " The stack trace is included to flag some 'unintended' failures",
+ getIndexName(), path, e);
+ extractedTextCache.put(v, ExtractedText.ERROR);
+ return TEXT_EXTRACTION_ERROR;
+ } catch (TimeoutException t) {
+ // recorded both as a regular error and as a timeout in the cache
+ log.warn(
+ "[{}] Failed to extract text from a binary property due to timeout: {}.",
+ getIndexName(), path);
+ extractedTextCache.put(v, ExtractedText.ERROR);
+ extractedTextCache.putTimeout(v, ExtractedText.ERROR);
+ return TEXT_EXTRACTION_ERROR;
+ } catch (Throwable t) {
+ // Capture and report any other full text extraction problems.
+ // The special STOP exception is used for normal termination.
+ if (!handler.isWriteLimitReached(t)) {
+ log.debug(
+ "[{}] Failed to extract text from a binary property: {}."
+ + " This is a fairly common case, and nothing to"
+ + " worry about. The stack trace is included to"
+ + " help improve the text extraction feature.",
+ getIndexName(), path, t);
+ extractedTextCache.put(v, ExtractedText.ERROR);
+ return TEXT_EXTRACTION_ERROR;
+ } else {
+ // write limit reached: fall through and use the text collected so far
+ log.debug("Extracted text size exceeded configured limit({})", definition.getMaxExtractLength());
+ }
+ }
+ String result = handler.toString();
+ // stats are only recorded when something was actually read from the blob
+ if (bytesRead > 0) {
+ long time = System.currentTimeMillis() - start;
+ int len = result.length();
+ recordTextExtractionStats(time, bytesRead, len);
+ if (log.isDebugEnabled()) {
+ log.debug("Extracting {} took {} ms, {} bytes read, {} text size",
+ path, time, bytesRead, len);
+ }
+ }
+ extractedTextCache.put(v, new ExtractedText(ExtractedText.ExtractionResult.SUCCESS, result));
+ return result;
+ }
+
+ private void recordTextExtractionStats(long timeInMillis, long bytesRead, int textLength) {
+ textExtractionStats.addStats(timeInMillis, bytesRead, textLength);
+ }
+
+ private String getIndexName() {
+ return definition.getIndexName();
+ }
+
+ //~-------------------------------------------< Tika >
+
+ /**
+ * Returns the Tika config for this index, loading it on first use.
+ */
+ public TikaConfig getTikaConfig(){
+ if (tikaConfig == null) {
+ tikaConfig = initializeTikaConfig(definition);
+ }
+ return tikaConfig.config;
+ }
+
+ /**
+ * Returns the parser for this index, creating it on first use.
+ */
+ private Parser getParser() {
+ if (parser == null){
+ parser = initializeTikaParser(definition);
+ }
+ return parser;
+ }
+
+ /**
+ * Checks whether the given mime type is supported by the parser and not
+ * listed among the non-indexed media types. Both sets are computed on the
+ * first call.
+ */
+ private boolean isSupportedMediaType(String type) {
+ if (supportedMediaTypes == null) {
+ supportedMediaTypes = getParser().getSupportedTypes(new ParseContext());
+ nonIndexedMediaType = getNonIndexedMediaTypes();
+ }
+ MediaType mediaType = MediaType.parse(type);
+ return supportedMediaTypes.contains(mediaType) && !nonIndexedMediaType.contains(mediaType);
+ }
+
+ /**
+ * Reads the Tika config (the custom one from the index definition if
+ * present, otherwise the "tika-config.xml" resource) and returns the
+ * media types reported by {@link TikaParserConfig#getNonIndexedMediaTypes}.
+ * Returns an empty set when no config stream is available or reading fails.
+ */
+ private Set<MediaType> getNonIndexedMediaTypes() {
+ InputStream configStream = null;
+ String configSource = null;
+ try {
+ if (definition.hasCustomTikaConfig()) {
+ configSource = String.format("Custom config at %s", definition.getIndexPath());
+ configStream = definition.getTikaConfig();
+ } else {
+ URL configUrl = FulltextIndexEditorContext.class.getResource("tika-config.xml");
+ configSource = "Default : tika-config.xml";
+ if (configUrl != null) {
+ configStream = configUrl.openStream();
+ }
+ }
+
+ if (configStream != null) {
+ return TikaParserConfig.getNonIndexedMediaTypes(configStream);
+ }
+ } catch (TikaException | IOException | SAXException e) {
+ log.warn("Tika configuration not available : " + configSource, e);
+ } finally {
+ IOUtils.closeQuietly(configStream);
+ }
+ return Collections.emptySet();
+ }
+
+
+ /**
+ * Loads the Tika config: the custom one from the definition if there is
+ * one, otherwise the "tika-config.xml" resource, and as a last resort
+ * Tika's own default config.
+ *
+ * @param definition index definition, or {@code null} to load the default config
+ */
+ private static TikaConfigHolder initializeTikaConfig(@Nullable IndexDefinition definition) {
+ ClassLoader current = Thread.currentThread().getContextClassLoader();
+ InputStream configStream = null;
+ String configSource = null;
+
+ try {
+ // the config is loaded with this bundle's class loader as context class loader;
+ // the previous loader is restored in the finally block
+ Thread.currentThread().setContextClassLoader(FulltextIndexEditorContext.class.getClassLoader());
+ if (definition != null && definition.hasCustomTikaConfig()) {
+ log.debug("[{}] Using custom tika config", definition.getIndexName());
+ configSource = "Custom config at " + definition.getIndexPath();
+ configStream = definition.getTikaConfig();
+ } else {
+ URL configUrl = FulltextIndexEditorContext.class.getResource("tika-config.xml");
+ if (configUrl != null) {
+ configSource = configUrl.toString();
+ configStream = configUrl.openStream();
+ }
+ }
+
+ if (configStream != null) {
+ return new TikaConfigHolder(new TikaConfig(configStream), configSource);
+ }
+ } catch (TikaException | IOException | SAXException e) {
+ log.warn("Tika configuration not available : " + configSource, e);
+ } finally {
+ IOUtils.closeQuietly(configStream);
+ Thread.currentThread().setContextClassLoader(current);
+ }
+ return new TikaConfigHolder(TikaConfig.getDefaultConfig(), "Default Config");
+ }
+
+ /**
+ * Creates a dedicated parser when the index has a custom Tika config,
+ * otherwise returns the shared default parser.
+ */
+ private Parser initializeTikaParser(IndexDefinition definition) {
+ ClassLoader current = Thread.currentThread().getContextClassLoader();
+ try {
+ if (definition.hasCustomTikaConfig()) {
+ Thread.currentThread().setContextClassLoader(FulltextIndexEditorContext.class.getClassLoader());
+ return new AutoDetectParser(getTikaConfig());
+ }
+ } finally {
+ Thread.currentThread().setContextClassLoader(current);
+ }
+ return defaultParser;
+ }
+
+ /**
+ * Builds the shared default parser; falls back to a plain
+ * {@link AutoDetectParser} if the default config cannot be used.
+ */
+ private static AutoDetectParser createDefaultParser() {
+ ClassLoader current = Thread.currentThread().getContextClassLoader();
+ TikaConfigHolder configHolder = null;
+ try {
+ configHolder = initializeTikaConfig(null);
+ Thread.currentThread().setContextClassLoader(FulltextIndexEditorContext.class.getClassLoader());
+ log.info("Loaded default Tika Config from classpath {}", configHolder);
+ return new AutoDetectParser(configHolder.config);
+ } catch (Exception e) {
+ log.warn("Tika configuration not available : " + configHolder, e);
+ } finally {
+ Thread.currentThread().setContextClassLoader(current);
+ }
+ return new AutoDetectParser();
+ }
+
+ /**
+ * Pairs a Tika config with a description of where it was loaded from;
+ * the description is what {@link #toString()} returns, so the holder can
+ * be used directly in log messages.
+ */
+ private static final class TikaConfigHolder{
+ final TikaConfig config;
+ final String sourceInfo;
+
+ public TikaConfigHolder(TikaConfig config, String sourceInfo) {
+ this.config = config;
+ this.sourceInfo = sourceInfo;
+ }
+
+ @Override
+ public String toString() {
+ return sourceInfo;
+ }
+ }
+
+}
Propchange: jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/FulltextBinaryTextExtractor.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/TextExtractionStats.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/TextExtractionStats.java?rev=1841291&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/TextExtractionStats.java (added)
+++ jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/TextExtractionStats.java Wed Sep 19 06:46:42 2018
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.plugins.index.search.spi.binary;
+
+import java.util.concurrent.TimeUnit;
+
+import org.apache.jackrabbit.oak.plugins.index.search.ExtractedTextCache;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import static org.apache.jackrabbit.oak.commons.IOUtils.humanReadableByteCount;
+
+class TextExtractionStats {
+ private static final Logger log = LoggerFactory.getLogger(TextExtractionStats.class);
+ /**
+ * Log stats only if time spent is more than 1 min
+ */
+ private static final long LOGGING_THRESHOLD = TimeUnit.MINUTES.toMillis(1);
+ private int count;
+ private long totalBytesRead;
+ private long totalTime;
+ private long totalTextLength;
+
+ public void addStats(long timeInMillis, long bytesRead, int textLength) {
+ count++;
+ totalBytesRead += bytesRead;
+ totalTime += timeInMillis;
+ totalTextLength += textLength;
+ }
+
+ public void log(boolean reindex) {
+ if (log.isDebugEnabled()) {
+ log.debug("Text extraction stats {}", this);
+ } else if (anyParsingDone() && (reindex || isTakingLotsOfTime())) {
+ log.info("Text extraction stats {}", this);
+ }
+ }
+
+ public void collectStats(ExtractedTextCache cache){
+ cache.addStats(count, totalTime, totalBytesRead, totalTextLength);
+ }
+
+ private boolean isTakingLotsOfTime() {
+ return totalTime > LOGGING_THRESHOLD;
+ }
+
+ private boolean anyParsingDone() {
+ return count > 0;
+ }
+
+ @Override
+ public String toString() {
+ return String.format(" %d (Time Taken %s, Bytes Read %s, Extracted text size %s)",
+ count,
+ timeInWords(totalTime),
+ humanReadableByteCount(totalBytesRead),
+ humanReadableByteCount(totalTextLength));
+ }
+
+ private static String timeInWords(long millis) {
+ return String.format("%d min, %d sec",
+ TimeUnit.MILLISECONDS.toMinutes(millis),
+ TimeUnit.MILLISECONDS.toSeconds(millis) -
+ TimeUnit.MINUTES.toSeconds(TimeUnit.MILLISECONDS.toMinutes(millis))
+ );
+ }
+}
Propchange: jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/TextExtractionStats.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/TikaParserConfig.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/TikaParserConfig.java?rev=1841291&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/TikaParserConfig.java (added)
+++ jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/TikaParserConfig.java Wed Sep 19 06:46:42 2018
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.plugins.index.search.spi.binary;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashSet;
+import java.util.Set;
+
+import javax.xml.parsers.DocumentBuilder;
+
+import com.google.common.base.Strings;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.SAXException;
+
+/**
+ * Helper that inspects a Tika configuration document to find the media types
+ * that have been mapped to Tika's {@code EmptyParser}.
+ */
+public class TikaParserConfig {
+
+    private static final String EMPTY_PARSER = "org.apache.tika.parser.EmptyParser";
+
+    /**
+     * Determines the set of MediaType which have been configured with an EmptyParser.
+     * Only a document with exactly one {@code parsers} element is evaluated;
+     * otherwise the result is empty.
+     *
+     * @param configStream stream for tika config
+     * @return set of MediaTypes which are not indexed
+     */
+    public static Set<MediaType> getNonIndexedMediaTypes(InputStream configStream) throws
+            TikaException, IOException, SAXException {
+        Set<MediaType> nonIndexed = new HashSet<>();
+        Element root = getBuilder().parse(configStream).getDocumentElement();
+        NodeList parsersElements = root.getElementsByTagName("parsers");
+        if (parsersElements.getLength() != 1) {
+            return nonIndexed;
+        }
+
+        NodeList children = parsersElements.item(0).getChildNodes();
+        for (int idx = 0; idx < children.getLength(); idx++) {
+            Node child = children.item(idx);
+            if (!(child instanceof Element)) {
+                continue;
+            }
+            Element parserElement = (Element) child;
+            if (EMPTY_PARSER.equals(parserElement.getAttribute("class"))) {
+                parseMimeTypes(nonIndexed, parserElement.getElementsByTagName("mime"));
+            }
+        }
+        return nonIndexed;
+    }
+
+
+    /**
+     * Adds every parseable, non-empty {@code mime} entry to the result set.
+     */
+    private static void parseMimeTypes(Set<MediaType> result, NodeList mimes) {
+        /*
+            <parser class="org.apache.tika.parser.EmptyParser">
+                <mime>application/x-archive</mime>
+                <mime>application/x-bzip</mime>
+                <mime>application/x-bzip2</mime>
+            </parser>
+        */
+        for (int idx = 0; idx < mimes.getLength(); idx++) {
+            Node mime = mimes.item(idx);
+            if (!(mime instanceof Element)) {
+                continue;
+            }
+            String text = Strings.emptyToNull(mime.getTextContent());
+            if (text == null) {
+                continue;
+            }
+            MediaType mediaType = MediaType.parse(text.trim());
+            if (mediaType != null) {
+                result.add(mediaType);
+            }
+        }
+    }
+
+    /**
+     * Obtains the XML document builder from a fresh Tika {@link ParseContext}.
+     */
+    private static DocumentBuilder getBuilder() throws TikaException {
+        ParseContext context = new ParseContext();
+        return context.getDocumentBuilder();
+    }
+}
Propchange: jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/TikaParserConfig.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/editor/FulltextDocumentMaker.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/editor/FulltextDocumentMaker.java?rev=1841291&r1=1841290&r2=1841291&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/editor/FulltextDocumentMaker.java (original)
+++ jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/editor/FulltextDocumentMaker.java Wed Sep 19 06:46:42 2018
@@ -23,28 +23,30 @@ import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
-import java.util.Map;
import java.util.concurrent.atomic.AtomicBoolean;
import javax.jcr.PropertyType;
import com.google.common.collect.Iterables;
+import org.apache.jackrabbit.oak.api.Blob;
import org.apache.jackrabbit.oak.api.PropertyState;
import org.apache.jackrabbit.oak.api.Type;
import org.apache.jackrabbit.oak.commons.PathUtils;
-import org.jetbrains.annotations.Nullable;
-
import org.apache.jackrabbit.oak.plugins.index.search.Aggregate;
+import org.apache.jackrabbit.oak.plugins.index.search.FieldNames;
import org.apache.jackrabbit.oak.plugins.index.search.IndexDefinition;
import org.apache.jackrabbit.oak.plugins.index.search.PropertyDefinition;
+import org.apache.jackrabbit.oak.plugins.index.search.spi.binary.FulltextBinaryTextExtractor;
import org.apache.jackrabbit.oak.plugins.index.search.util.FunctionIndexProcessor;
import org.apache.jackrabbit.oak.plugins.memory.StringPropertyState;
import org.apache.jackrabbit.oak.spi.state.NodeState;
+import org.jetbrains.annotations.NotNull;
+import org.jetbrains.annotations.Nullable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import static com.google.common.base.Preconditions.checkNotNull;
import static org.apache.jackrabbit.oak.commons.PathUtils.getName;
-
import static org.apache.jackrabbit.oak.plugins.index.search.util.ConfigUtil.getPrimaryTypeName;
/**
@@ -56,44 +58,52 @@ public abstract class FulltextDocumentMa
private final Logger log = LoggerFactory.getLogger(getClass());
- private FulltextBinaryTextExtractor textExtractor;
- private IndexDefinition definition;
- private IndexDefinition.IndexingRule indexingRule;
- private String path;
+ protected final FulltextBinaryTextExtractor textExtractor;
+ protected final IndexDefinition definition;
+ protected final IndexDefinition.IndexingRule indexingRule;
+ protected final String path;
+
+ public FulltextDocumentMaker(@Nullable FulltextBinaryTextExtractor textExtractor,
+ @NotNull IndexDefinition definition,
+ @NotNull IndexDefinition.IndexingRule indexingRule,
+ @NotNull String path) {
+ this.textExtractor = textExtractor; // may be null: newBinary() then returns an empty list instead of extracting text
+ this.definition = checkNotNull(definition); // mandatory, validated with checkNotNull
+ this.indexingRule = checkNotNull(indexingRule); // mandatory, validated with checkNotNull
+ this.path = checkNotNull(path); // mandatory; getName(path) later supplies the node-name property value
+ }
protected abstract D initDoc();
- protected abstract D finalizeDoc(D fields, boolean dirty, boolean facet);
-
- protected abstract StringPropertyState createNodeNamePS();
+ protected abstract D finalizeDoc(D fields, boolean dirty, boolean facet) throws IOException;
protected abstract boolean isFacetingEnabled();
- protected abstract boolean isNodeName(String pname);
-
- protected abstract boolean indexTypeOrderedFields(String pname, int tag, PropertyState property, PropertyDefinition pd);
+ protected abstract boolean indexTypeOrderedFields(D doc, String pname, int tag, PropertyState property, PropertyDefinition pd);
- protected abstract boolean addBinary(D doc, Map<String, String> binaryMap);
+ protected abstract boolean addBinary(D doc, String path, List<String> binaryValues);
protected abstract boolean indexFacetProperty(D doc, int tag, PropertyState property, String pname);
- protected abstract boolean indexAnalyzedProperty(D doc, String pname, String value, PropertyDefinition pd);
+ protected abstract void indexAnalyzedProperty(D doc, String pname, String value, PropertyDefinition pd);
- protected abstract boolean indexSuggestValue(D doc, String value);
+ protected abstract void indexSuggestValue(D doc, String value);
- protected abstract boolean indexSpellcheckValue(D doc, String value);
+ protected abstract void indexSpellcheckValue(D doc, String value);
- protected abstract boolean indexFulltextValue(D doc, String value);
+ protected abstract void indexFulltextValue(D doc, String value);
protected abstract boolean indexTypedProperty(D doc, PropertyState property, String pname, PropertyDefinition pd);
- protected abstract boolean indexNotNullProperty(D doc, PropertyDefinition pd);
+ protected abstract void indexAncestors(D doc, String path);
- protected abstract boolean indexNullProperty(D doc, PropertyDefinition pd);
+ protected abstract void indexNotNullProperty(D doc, PropertyDefinition pd);
- protected abstract boolean indexAggregateValue(D doc, Aggregate.NodeIncludeResult result, String value, PropertyDefinition pd);
+ protected abstract void indexNullProperty(D doc, PropertyDefinition pd);
- protected abstract boolean indexNodeName(D doc, String value);
+ protected abstract void indexAggregateValue(D doc, Aggregate.NodeIncludeResult result, String value, PropertyDefinition pd);
+
+ protected abstract void indexNodeName(D doc, String value);
@Nullable
public D makeDocument(NodeState state) throws IOException {
@@ -109,11 +119,12 @@ public abstract class FulltextDocumentMa
//We 'intentionally' are indexing node names only on root state as we don't support indexing relative or
//regex for node name indexing
- PropertyState nodenamePS = createNodeNamePS();
+ PropertyState nodenamePS =
+ new StringPropertyState(FieldNames.NODE_NAME, getName(path));
for (PropertyState property : Iterables.concat(state.getProperties(), Collections.singleton(nodenamePS))) {
String pname = property.getName();
- if (!isVisible(pname) && !isNodeName(pname)) {
+ if (!isVisible(pname) && !FieldNames.NODE_NAME.equals(pname)) {
continue;
}
@@ -163,6 +174,14 @@ public abstract class FulltextDocumentMa
return null;
}
+ if (indexingRule.isFulltextEnabled()) {
+ indexFulltextValue(document, name);
+ }
+
+ if (definition.evaluatePathRestrictions()){
+ indexAncestors(document, path);
+ }
+
return finalizeDoc(document, dirty, facet);
}
@@ -191,10 +210,18 @@ public abstract class FulltextDocumentMa
boolean includeTypeForFullText = indexingRule.includePropertyType(property.getType().tag());
boolean dirty = false;
- if (Type.BINARY.tag() == property.getType().tag()
+ if (Type.BINARY.tag() == property.getType().tag() && pd.useInSimilarity) {
+ try {
+ log.trace("indexing similarity binaries for {}", pd.name);
+ indexSimilarityBinaries(doc, pd, property.getValue(Type.BINARY));
+ dirty = true;
+ } catch (Exception e) {
+ log.error("could not index similarity field for property {} and definition {}", property, pd);
+ }
+ } else if (Type.BINARY.tag() == property.getType().tag()
&& includeTypeForFullText) {
- Map<String, String> binaryMap = newBinary(property, state, null, path + "@" + pname);
- addBinary(doc, binaryMap);
+ List<String> binaryValues = newBinary(property, state, path + "@" + pname);
+ addBinary(doc, null, binaryValues);
dirty = true;
} else {
if (pd.propertyIndex && pd.includePropertyType(property.getType().tag())) {
@@ -222,6 +249,15 @@ public abstract class FulltextDocumentMa
if (pd.nodeScopeIndex) {
indexFulltextValue(doc, value);
+ if (pd.useInSimilarity) {
+ log.trace("indexing similarity strings for {}", pd.name);
+ try {
+ // fallback for when feature vectors are written in string typed properties
+ indexSimilarityStrings(doc, pd, value);
+ } catch (Exception e) {
+ log.error("could not index similarity field for property {} and definition {}", property, pd);
+ }
+ }
}
dirty = true;
}
@@ -235,6 +271,10 @@ public abstract class FulltextDocumentMa
return dirty;
}
+ protected abstract void indexSimilarityBinaries(D doc, PropertyDefinition pd, Blob blob) throws IOException;
+
+ protected abstract void indexSimilarityStrings(D doc, PropertyDefinition pd, String value) throws IOException;
+
private boolean addTypedFields(D doc, PropertyState property, String pname, PropertyDefinition pd) {
return indexTypedProperty(doc, property, pname, pd);
}
@@ -264,7 +304,7 @@ public abstract class FulltextDocumentMa
Type.fromTag(tag, false), path);
tag = idxDefinedTag;
}
- return indexTypeOrderedFields(pname, tag, property, pd);
+ return indexTypeOrderedFields(doc, pname, tag, property, pd);
}
protected boolean includePropertyValue(PropertyState property, int i, PropertyDefinition pd) {
@@ -287,22 +327,20 @@ public abstract class FulltextDocumentMa
return name.charAt(0) != ':';
}
- private Map<String,String> newBinary(
- PropertyState property, NodeState state, String nodePath, String path) {
+ private List<String> newBinary(
+ PropertyState property, NodeState state, String path) {
if (textExtractor == null){
//Skip text extraction for sync indexing
- return Collections.emptyMap();
+ return Collections.emptyList();
}
- return textExtractor.newBinary(property, state, nodePath, path);
+ return textExtractor.newBinary(property, state, path);
}
- private boolean augmentCustomFields(final String path, final D doc,
- final NodeState document) {
- boolean dirty = false;
-
- // TODO : extract more generic SPI for augmentor factory
-
+ // TODO : extract more generic SPI for augmentor factory
+ protected abstract boolean augmentCustomFields(final String path, final D doc, final NodeState document);// {
+// boolean dirty = false;
+//
// if (augmentorFactory != null) {
// Iterable<Field> augmentedFields = augmentorFactory
// .getIndexFieldProvider(indexingRule.getNodeTypeName())
@@ -313,9 +351,9 @@ public abstract class FulltextDocumentMa
// dirty = true;
// }
// }
-
- return dirty;
- }
+//
+// return dirty;
+// }
//~-------------------------------------------------------< NullCheck Support >
@@ -323,7 +361,8 @@ public abstract class FulltextDocumentMa
boolean fieldAdded = false;
for (PropertyDefinition pd : indexingRule.getNotNullCheckEnabledProperties()) {
if (isPropertyNotNull(state, pd)) {
- fieldAdded = indexNotNullProperty(doc, pd);
+ indexNotNullProperty(doc, pd);
+ fieldAdded = true;
}
}
return fieldAdded;
@@ -334,7 +373,8 @@ public abstract class FulltextDocumentMa
boolean fieldAdded = false;
for (PropertyDefinition pd : indexingRule.getNullCheckEnabledProperties()) {
if (isPropertyNull(state, pd)) {
- fieldAdded = indexNullProperty(doc, pd);
+ indexNullProperty(doc, pd);
+ fieldAdded = true;
}
}
return fieldAdded;
@@ -513,8 +553,8 @@ public abstract class FulltextDocumentMa
//Here the fulltext is being created for aggregate root hence nodePath passed
//should be null
String nodePath = result.isRelativeNode() ? result.rootIncludePath : null;
- Map<String, String> stringStringMap = newBinary(property, result.nodeState, nodePath, aggreagtedNodePath + "@" + pname);
- addBinary(doc, stringStringMap);
+ List<String> binaryValues = newBinary(property, result.nodeState, aggreagtedNodePath + "@" + pname);
+ addBinary(doc, nodePath, binaryValues);
dirty = true;
} else {
PropertyDefinition pd = null;
@@ -527,14 +567,15 @@ public abstract class FulltextDocumentMa
}
for (String value : property.getValue(Type.STRINGS)) {
- dirty = indexAggregateValue(doc, result, value, pd);
+ indexAggregateValue(doc, result, value, pd);
+ dirty = true;
}
}
}
return dirty;
}
- private String getIndexName() {
+ protected String getIndexName() {
return definition.getIndexName();
}
Modified: jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/editor/FulltextIndexEditor.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/editor/FulltextIndexEditor.java?rev=1841291&r1=1841290&r2=1841291&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/editor/FulltextIndexEditor.java (original)
+++ jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/editor/FulltextIndexEditor.java Wed Sep 19 06:46:42 2018
@@ -66,7 +66,7 @@ public class FulltextIndexEditor<D> impl
private boolean propertiesChanged = false;
- private List<PropertyState> propertiesModified = Lists.newArrayList();
+ private final List<PropertyState> propertiesModified = Lists.newArrayList();
/**
* Flag indicating if the current tree being traversed has a deleted parent.
@@ -81,7 +81,7 @@ public class FulltextIndexEditor<D> impl
private final PathFilter.Result pathFilterResult;
- FulltextIndexEditor(FulltextIndexEditorContext<D> context) throws CommitFailedException {
+ public FulltextIndexEditor(FulltextIndexEditorContext<D> context) throws CommitFailedException {
this.parent = null;
this.name = null;
this.path = "/";
@@ -91,10 +91,10 @@ public class FulltextIndexEditor<D> impl
this.pathFilterResult = context.getDefinition().getPathFilter().filter(PathUtils.ROOT_PATH);
}
- private FulltextIndexEditor(FulltextIndexEditor<D> parent, String name,
- MatcherState matcherState,
- PathFilter.Result pathFilterResult,
- boolean isDeleted) {
+ public FulltextIndexEditor(FulltextIndexEditor<D> parent, String name,
+ MatcherState matcherState,
+ PathFilter.Result pathFilterResult,
+ boolean isDeleted) {
this.parent = parent;
this.name = name;
this.path = null;