You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oak-commits@jackrabbit.apache.org by to...@apache.org on 2016/06/21 07:59:25 UTC
svn commit: r1749459 - in /jackrabbit/oak/branches/1.4: ./
oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/
oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/
Author: tommaso
Date: Tue Jun 21 07:59:25 2016
New Revision: 1749459
URL: http://svn.apache.org/viewvc?rev=1749459&view=rev
Log:
OAK-4368 - use postings highlighter whenever possible in Lucene property index to speed up excerpt generation
Modified:
jackrabbit/oak/branches/1.4/ (props changed)
jackrabbit/oak/branches/1.4/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/FieldFactory.java
jackrabbit/oak/branches/1.4/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndex.java
jackrabbit/oak/branches/1.4/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java
jackrabbit/oak/branches/1.4/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java
Propchange: jackrabbit/oak/branches/1.4/
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Tue Jun 21 07:59:25 2016
@@ -1,3 +1,3 @@
/jackrabbit/oak/branches/1.0:1665962
-/jackrabbit/oak/trunk:1733615,1733875,1733913,1733929,1734230,1734254,1734279,1734941,1735052,1735405,1735484,1735549,1735564,1735588,1735622,1735638,1735919,1735983,1736176,1737309-1737310,1737334,1737349,1737998,1738004,1738775,1738795,1738833,1738950,1738957,1738963,1739894,1740116,1740625-1740626,1740971,1741032,1741339,1741343,1742520,1742888,1742916,1743097,1743172,1743343,1744265,1744959,1745038,1745197,1745368,1746086,1746117,1746342,1746345,1746696,1746981,1747492,1748553,1748870
+/jackrabbit/oak/trunk:1733615,1733875,1733913,1733929,1734230,1734254,1734279,1734941,1735052,1735405,1735484,1735549,1735564,1735588,1735622,1735638,1735919,1735983,1736176,1737309-1737310,1737334,1737349,1737998,1738004,1738775,1738795,1738833,1738950,1738957,1738963,1739894,1740116,1740625-1740626,1740971,1741032,1741339,1741343,1742520,1742888,1742916,1743097,1743172,1743343,1744265,1744959,1745038,1745197,1745368,1746086,1746117,1746342,1746345,1746696,1746981,1747492,1748505,1748553,1748870
/jackrabbit/trunk:1345480
Modified: jackrabbit/oak/branches/1.4/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/FieldFactory.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/branches/1.4/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/FieldFactory.java?rev=1749459&r1=1749458&r2=1749459&view=diff
==============================================================================
--- jackrabbit/oak/branches/1.4/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/FieldFactory.java (original)
+++ jackrabbit/oak/branches/1.4/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/FieldFactory.java Tue Jun 21 07:59:25 2016
@@ -34,6 +34,7 @@ import static org.apache.jackrabbit.oak.
import static org.apache.jackrabbit.oak.plugins.index.lucene.FieldNames.FULLTEXT;
import static org.apache.lucene.document.Field.Store.NO;
import static org.apache.lucene.document.Field.Store.YES;
+import static org.apache.lucene.index.FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
/**
* {@code FieldFactory} is a factory for <code>Field</code> instances with
@@ -59,7 +60,7 @@ public final class FieldFactory {
OAK_TYPE.setIndexed(true);
OAK_TYPE.setOmitNorms(true);
OAK_TYPE.setStored(true);
- OAK_TYPE.setIndexOptions(DOCS_AND_FREQS_AND_POSITIONS);
+ OAK_TYPE.setIndexOptions(DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
OAK_TYPE.setTokenized(true);
OAK_TYPE.freeze();
Modified: jackrabbit/oak/branches/1.4/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndex.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/branches/1.4/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndex.java?rev=1749459&r1=1749458&r2=1749459&view=diff
==============================================================================
--- jackrabbit/oak/branches/1.4/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndex.java (original)
+++ jackrabbit/oak/branches/1.4/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndex.java Tue Jun 21 07:59:25 2016
@@ -58,7 +58,6 @@ import org.apache.jackrabbit.oak.spi.que
import org.apache.jackrabbit.oak.spi.query.QueryIndex.AdvanceFulltextQueryIndex;
import org.apache.jackrabbit.oak.spi.state.NodeState;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
@@ -369,10 +368,20 @@ public class LuceneIndex implements Adva
PropertyRestriction restriction = filter.getPropertyRestriction(QueryImpl.REP_EXCERPT);
boolean addExcerpt = restriction != null && restriction.isNotNullRestriction();
+
+ Analyzer analyzer = indexNode.getDefinition().getAnalyzer();
+
+ if (addExcerpt) {
+ // setup highlighter
+ QueryScorer scorer = new QueryScorer(query);
+ scorer.setExpandMultiTermQuery(true);
+ highlighter.setFragmentScorer(scorer);
+ }
+
for (ScoreDoc doc : docs.scoreDocs) {
String excerpt = null;
if (addExcerpt) {
- excerpt = getExcerpt(indexNode, searcher, query, doc);
+ excerpt = getExcerpt(analyzer, searcher, doc);
}
LuceneResultRow row = convertToRow(doc, searcher, excerpt);
@@ -487,20 +496,17 @@ public class LuceneIndex implements Adva
return new LucenePathCursor(itr, settings, sizeEstimator);
}
- private String getExcerpt(IndexNode indexNode, IndexSearcher searcher, Query query, ScoreDoc doc) throws IOException {
+ private String getExcerpt(Analyzer analyzer, IndexSearcher searcher, ScoreDoc doc) throws IOException {
StringBuilder excerpt = new StringBuilder();
- QueryScorer scorer = new QueryScorer(query);
- scorer.setExpandMultiTermQuery(true);
- highlighter.setFragmentScorer(scorer);
- Analyzer analyzer = indexNode.getDefinition().getAnalyzer();
- for (IndexableField field : searcher.getIndexReader().document(doc.doc).getFields())
- if (!FieldNames.SUGGEST.equals(field.name())) {
+ for (IndexableField field : searcher.getIndexReader().document(doc.doc).getFields()) {
+ String name = field.name();
+ // only full text or analyzed fields
+ if (name.startsWith(FieldNames.FULLTEXT) || name.startsWith(FieldNames.ANALYZED_FIELD_PREFIX)) {
+ String text = field.stringValue();
+ TokenStream tokenStream = analyzer.tokenStream(name, text);
try {
- TokenStream tokenStream = analyzer.tokenStream(field.name(), field.stringValue());
- tokenStream.reset();
- CachingTokenFilter cachingTokenFilter = new CachingTokenFilter(tokenStream);
- TextFragment[] textFragments = highlighter.getBestTextFragments(cachingTokenFilter, field.stringValue(), true, 2);
+ TextFragment[] textFragments = highlighter.getBestTextFragments(tokenStream, text, true, 2);
if (textFragments != null && textFragments.length > 0) {
for (TextFragment fragment : textFragments) {
if (excerpt.length() > 0) {
@@ -508,11 +514,13 @@ public class LuceneIndex implements Adva
}
excerpt.append(fragment.toString());
}
+ break;
}
} catch (InvalidTokenOffsetsException e) {
LOG.error("higlighting failed", e);
}
}
+ }
return excerpt.toString();
}
Modified: jackrabbit/oak/branches/1.4/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/branches/1.4/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java?rev=1749459&r1=1749458&r2=1749459&view=diff
==============================================================================
--- jackrabbit/oak/branches/1.4/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java (original)
+++ jackrabbit/oak/branches/1.4/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java Tue Jun 21 07:59:25 2016
@@ -27,7 +27,6 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Deque;
-import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
@@ -75,24 +74,17 @@ import org.apache.jackrabbit.oak.spi.que
import org.apache.jackrabbit.oak.spi.state.NodeState;
import org.apache.jackrabbit.oak.util.PerfLogger;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.facet.FacetResult;
import org.apache.lucene.facet.Facets;
-import org.apache.lucene.facet.FacetsCollector;
-import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.LabelAndValue;
-import org.apache.lucene.facet.MultiFacets;
-import org.apache.lucene.facet.sortedset.DefaultSortedSetDocValuesReaderState;
-import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetCounts;
-import org.apache.lucene.facet.sortedset.SortedSetDocValuesReaderState;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.MultiFields;
-import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.CustomScoreQuery;
@@ -123,16 +115,15 @@ import org.apache.lucene.search.highligh
import org.apache.lucene.search.highlight.SimpleHTMLEncoder;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.TextFragment;
+import org.apache.lucene.search.postingshighlight.PostingsHighlighter;
import org.apache.lucene.search.spell.SuggestWord;
import org.apache.lucene.search.suggest.Lookup;
-import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Preconditions.checkState;
-import static com.google.common.collect.Lists.newArrayList;
import static com.google.common.collect.Lists.newArrayListWithCapacity;
import static org.apache.jackrabbit.JcrConstants.JCR_MIXINTYPES;
import static org.apache.jackrabbit.JcrConstants.JCR_PRIMARYTYPE;
@@ -140,8 +131,8 @@ import static org.apache.jackrabbit.oak.
import static org.apache.jackrabbit.oak.api.Type.STRING;
import static org.apache.jackrabbit.oak.commons.PathUtils.denotesRoot;
import static org.apache.jackrabbit.oak.commons.PathUtils.getParentPath;
+import static org.apache.jackrabbit.oak.plugins.index.lucene.FieldNames.ANALYZED_FIELD_PREFIX;
import static org.apache.jackrabbit.oak.plugins.index.lucene.FieldNames.PATH;
-import static org.apache.jackrabbit.oak.plugins.index.lucene.FieldNames.SUGGEST;
import static org.apache.jackrabbit.oak.plugins.index.lucene.IndexDefinition.NATIVE_SORT_ORDER;
import static org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.VERSION;
import static org.apache.jackrabbit.oak.plugins.index.lucene.TermFactory.newAncestorTerm;
@@ -214,6 +205,8 @@ public class LucenePropertyIndex impleme
private final Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter("<strong>", "</strong>"),
new SimpleHTMLEncoder(), null);
+ private final PostingsHighlighter postingsHighlighter = new PostingsHighlighter();
+
private final IndexAugmentorFactory augmentorFactory;
public LucenePropertyIndex(IndexTracker tracker) {
@@ -421,10 +414,21 @@ public class LucenePropertyIndex impleme
restriction = filter.getPropertyRestriction(QueryImpl.OAK_SCORE_EXPLANATION);
boolean addExplain = restriction != null && restriction.isNotNullRestriction();
+ Analyzer analyzer = indexNode.getDefinition().getAnalyzer();
+
+ FieldInfos mergedFieldInfos = null;
+ if (addExcerpt) {
+ // setup highlighter
+ QueryScorer scorer = new QueryScorer(query);
+ scorer.setExpandMultiTermQuery(true);
+ highlighter.setFragmentScorer(scorer);
+ mergedFieldInfos = MultiFields.getMergedFieldInfos(searcher.getIndexReader());
+ }
+
for (ScoreDoc doc : docs.scoreDocs) {
String excerpt = null;
if (addExcerpt) {
- excerpt = getExcerpt(indexNode, searcher, query, doc);
+ excerpt = getExcerpt(query, analyzer, searcher, doc, mergedFieldInfos);
}
String explanation = null;
@@ -576,32 +580,69 @@ public class LucenePropertyIndex impleme
return query;
}
- private String getExcerpt(IndexNode indexNode, IndexSearcher searcher, Query query, ScoreDoc doc) throws IOException {
+ private String getExcerpt(Query query, Analyzer analyzer, IndexSearcher searcher, ScoreDoc doc,
+ FieldInfos fieldInfos) throws IOException {
StringBuilder excerpt = new StringBuilder();
- QueryScorer scorer = new QueryScorer(query);
- scorer.setExpandMultiTermQuery(true);
- highlighter.setFragmentScorer(scorer);
-
- Analyzer analyzer = indexNode.getDefinition().getAnalyzer();
- for (IndexableField field : searcher.getIndexReader().document(doc.doc).getFields())
- if (!SUGGEST.equals(field.name())) {
- try {
- TokenStream tokenStream = analyzer.tokenStream(field.name(), field.stringValue());
- tokenStream.reset();
- CachingTokenFilter cachingTokenFilter = new CachingTokenFilter(tokenStream);
- TextFragment[] textFragments = highlighter.getBestTextFragments(cachingTokenFilter, field.stringValue(), true, 2);
- if (textFragments != null && textFragments.length > 0) {
- for (TextFragment fragment : textFragments) {
- if (excerpt.length() > 0) {
- excerpt.append("...");
+ int docID = doc.doc;
+ List<String> names = new LinkedList<String>();
+
+ for (IndexableField field : searcher.getIndexReader().document(docID).getFields()) {
+ String name = field.name();
+ // postings highlighter can be used on analyzed fields with docs, freqs, positions and offsets stored.
+ if (name.startsWith(ANALYZED_FIELD_PREFIX) && fieldInfos.hasProx() && fieldInfos.hasOffsets()) {
+ names.add(name);
+ }
+ }
+
+ if (names.size() > 0) {
+ int[] maxPassages = new int[names.size()];
+ for (int i = 0; i < maxPassages.length; i++) {
+ maxPassages[i] = 1;
+ }
+ try {
+ Map<String, String[]> stringMap = postingsHighlighter.highlightFields(names.toArray(new String[names.size()]),
+ query, searcher, new int[]{docID}, maxPassages);
+ for (Map.Entry<String, String[]> entry : stringMap.entrySet()) {
+ String value = Arrays.toString(entry.getValue());
+ if (value.contains("<b>")) {
+ if (excerpt.length() > 0) {
+ excerpt.append("...");
+ }
+ excerpt.append(value);
+ }
+ }
+ } catch (Exception e) {
+ LOG.error("postings highlighting failed", e);
+ }
+ }
+
+ // fallback if no excerpt could be retrieved using postings highlighter
+ if (excerpt.length() == 0) {
+
+ for (IndexableField field : searcher.getIndexReader().document(doc.doc).getFields()) {
+ String name = field.name();
+ // only full text or analyzed fields
+ if (name.startsWith(FieldNames.FULLTEXT) || name.startsWith(FieldNames.ANALYZED_FIELD_PREFIX)) {
+ String text = field.stringValue();
+ TokenStream tokenStream = analyzer.tokenStream(name, text);
+
+ try {
+ TextFragment[] textFragments = highlighter.getBestTextFragments(tokenStream, text, true, 1);
+ if (textFragments != null && textFragments.length > 0) {
+ for (TextFragment fragment : textFragments) {
+ if (excerpt.length() > 0) {
+ excerpt.append("...");
+ }
+ excerpt.append(fragment.toString());
}
- excerpt.append(fragment.toString());
+ break;
}
+ } catch (InvalidTokenOffsetsException e) {
+ LOG.error("higlighting failed", e);
}
- } catch (InvalidTokenOffsetsException e) {
- LOG.error("higlighting failed", e);
}
}
+ }
return excerpt.toString();
}
Modified: jackrabbit/oak/branches/1.4/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/branches/1.4/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java?rev=1749459&r1=1749458&r2=1749459&view=diff
==============================================================================
--- jackrabbit/oak/branches/1.4/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java (original)
+++ jackrabbit/oak/branches/1.4/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java Tue Jun 21 07:59:25 2016
@@ -24,6 +24,7 @@ import java.io.IOException;
import java.text.ParseException;
import java.util.Calendar;
import java.util.Collections;
+import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Random;
@@ -653,7 +654,7 @@ public class LucenePropertyIndexTest ext
t.setProperty(JcrConstants.JCR_PRIMARYTYPE, typeName, Type.NAME);
return t;
}
-
+
@Test
public void orderByScore() throws Exception {
Tree idx = createIndex("test1", of("propa"));
@@ -1895,7 +1896,7 @@ public class LucenePropertyIndexTest ext
assertThat(explain(propabQuery), containsString("lucene:test1(/oak:index/test1)"));
assertQuery(propabQuery, asList("/test/a"));
}
-
+
@Test
public void indexingPropertyWithAnalyzeButQueryWithWildcard() throws Exception {
Tree index = root.getTree("/");
@@ -1913,22 +1914,22 @@ public class LucenePropertyIndexTest ext
prop.setProperty(LuceneIndexConstants.PROP_PROPERTY_INDEX, true);
prop.setProperty(LuceneIndexConstants.PROP_ANALYZED, true);
root.commit();
-
+
Tree test = root.getTree("/").addChild("test");
test.addChild("a").setProperty("jcr:mimeType", "1234");
test.addChild("b").setProperty("other", "1234");
test.addChild("c").setProperty("jcr:mimeType", "a");
- root.commit();
-
+ root.commit();
+
String query;
-
+
query = "/jcr:root/test//*[jcr:contains(@jcr:mimeType, '1234')]";
assertThat(explainXpath(query), containsString("lucene:test2(/oak:index/test2)"));
assertQuery(query, "xpath", asList("/test/a"));
query = "/jcr:root/test//*[jcr:contains(., '1234')]";
assertThat(explainXpath(query), containsString("no-index"));
-
+
query = "/jcr:root/test//*[@jcr:mimeType = '1234']";
assertThat(explainXpath(query), containsString("lucene:test2(/oak:index/test2)"));
assertQuery(query, "xpath", asList("/test/a"));
@@ -2082,6 +2083,36 @@ public class LucenePropertyIndexTest ext
}
+ @Test
+ public void longRepExcerpt() throws Exception {
+ Tree luceneIndex = createFullTextIndex(root.getTree("/"), "lucene");
+
+ root.commit();
+
+ StringBuilder s = new StringBuilder();
+ for (int k = 0; k < 1000; k++) {
+ s.append("foo bar ").append(k).append(" ");
+ }
+ String text = s.toString();
+ List<String> names = new LinkedList<String>();
+ for (int j = 0; j < 30; j++) {
+ Tree test = root.getTree("/").addChild("ex-test-" + j);
+ for (int i = 0; i < 200; i++) {
+ String name = "cont" + i;
+ test.addChild(name).setProperty("text", text);
+ names.add("/" + test.getName() + "/" + name);
+ }
+ }
+
+ root.commit();
+
+ String query;
+
+ query = "SELECT [jcr:path],[rep:excerpt] from [nt:base] WHERE CONTAINS([text], 'foo')";
+ assertQuery(query, SQL2, names);
+
+ }
+
@Test
public void emptySuggestDictionary() throws Exception{
Tree idx = createIndex("test1", of("propa", "propb"));