You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jackrabbit.apache.org by al...@apache.org on 2011/09/21 16:55:18 UTC
svn commit: r1173694 - in /jackrabbit/trunk/jackrabbit-core/src:
main/java/org/apache/jackrabbit/core/query/lucene/
test/java/org/apache/jackrabbit/core/query/
Author: alexparvulescu
Date: Wed Sep 21 14:55:18 2011
New Revision: 1173694
URL: http://svn.apache.org/viewvc?rev=1173694&view=rev
Log:
JCR-3075 incorrect HTML excerpt generation for queries on japanese text content
Modified:
jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/AbstractExcerpt.java
jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/DefaultHighlighter.java
jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/WeightedHighlighter.java
jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/ExcerptTest.java
Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/AbstractExcerpt.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/AbstractExcerpt.java?rev=1173694&r1=1173693&r2=1173694&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/AbstractExcerpt.java (original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/AbstractExcerpt.java Wed Sep 21 14:55:18 2011
@@ -16,30 +16,34 @@
*/
package org.apache.jackrabbit.core.query.lucene;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.apache.lucene.search.Query;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.LinkedHashSet;
+import java.util.Set;
+import java.util.SortedMap;
+import java.util.TreeMap;
+
+import org.apache.jackrabbit.core.id.NodeId;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.index.TermVectorOffsetInfo;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Fieldable;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
-import org.apache.jackrabbit.core.id.NodeId;
-
-import java.io.IOException;
-import java.io.StringReader;
-import java.io.Reader;
-import java.util.Set;
-import java.util.HashSet;
-import java.util.TreeMap;
-import java.util.SortedMap;
-import java.util.Arrays;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.Query;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
/**
* <code>AbstractExcerpt</code> implements base functionality for an excerpt
@@ -177,10 +181,34 @@ public abstract class AbstractExcerpt im
/**
* @return the extracted terms from the query.
*/
- protected final Set<Term> getQueryTerms() {
- Set<Term> extractedTerms = new HashSet<Term>();
- Set<Term> relevantTerms = new HashSet<Term>();
- query.extractTerms(extractedTerms);
+ protected final Set<Term[]> getQueryTerms() {
+ Set<Term[]> relevantTerms = new HashSet<Term[]>();
+ getQueryTerms(query, relevantTerms);
+ return relevantTerms;
+ }
+
+ private static void getQueryTerms(Query q, Set<Term[]> relevantTerms) {
+ if (q instanceof BooleanQuery) {
+ final BooleanQuery bq = (BooleanQuery) q;
+ Iterator<BooleanClause> clIterator = bq.iterator();
+ while (clIterator.hasNext()) {
+ BooleanClause clause = clIterator.next();
+ getQueryTerms(clause.getQuery(), relevantTerms);
+ }
+ return;
+ }
+ //need to preserve insertion order
+ Set<Term> extractedTerms = new LinkedHashSet<Term>();
+ q.extractTerms(extractedTerms);
+ Set<Term> filteredTerms = filterRelevantTerms(extractedTerms);
+ if (!filteredTerms.isEmpty()) {
+ relevantTerms.add(filteredTerms.toArray(new Term[] {}));
+ }
+ }
+
+ private static Set<Term> filterRelevantTerms(Set<Term> extractedTerms) {
+ //need to preserve insertion order
+ Set<Term> relevantTerms = new LinkedHashSet<Term>();
// only keep terms for fulltext fields
for (Term t : extractedTerms) {
if (t.field().equals(FieldNames.FULLTEXT)) {
Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/DefaultHighlighter.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/DefaultHighlighter.java?rev=1173694&r1=1173693&r2=1173694&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/DefaultHighlighter.java (original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/DefaultHighlighter.java Wed Sep 21 14:55:18 2011
@@ -19,17 +19,19 @@ package org.apache.jackrabbit.core.query
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.Set;
import java.util.Arrays;
import java.util.Collections;
+import java.util.HashMap;
+import java.util.Iterator;
import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import org.apache.jackrabbit.util.Text;
import org.apache.lucene.document.Field;
+import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.index.TermVectorOffsetInfo;
-import org.apache.lucene.index.Term;
-import org.apache.jackrabbit.util.Text;
/**
* This is an adapted version of the <code>FulltextHighlighter</code> posted in
@@ -93,7 +95,7 @@ public class DefaultHighlighter {
* highlighted
*/
public static String highlight(TermPositionVector tvec,
- Set<Term> queryTerms,
+ Set<Term[]> queryTerms,
String text,
String excerptStart,
String excerptEnd,
@@ -120,7 +122,7 @@ public class DefaultHighlighter {
* highlighted
*/
public static String highlight(TermPositionVector tvec,
- Set<Term> queryTerms,
+ Set<Term[]> queryTerms,
String text,
int maxFragments,
int surround)
@@ -134,7 +136,7 @@ public class DefaultHighlighter {
* @see #highlight(TermPositionVector, Set, String, String, String, String, String, String, String, int, int)
*/
protected String doHighlight(TermPositionVector tvec,
- Set<Term> queryTerms,
+ Set<Term[]> queryTerms,
String text,
String excerptStart,
String excerptEnd,
@@ -144,21 +146,102 @@ public class DefaultHighlighter {
String hlEnd,
int maxFragments,
int surround) throws IOException {
- String[] terms = new String[queryTerms.size()];
- Iterator<Term> it = queryTerms.iterator();
- for (int i = 0; it.hasNext(); i++) {
- terms[i] = it.next().text();
- }
- List<TermVectorOffsetInfo> list = new ArrayList<TermVectorOffsetInfo>();
- int[] tvecindexes = tvec.indexesOf(terms, 0, terms.length);
- for (int tvecindex : tvecindexes) {
- TermVectorOffsetInfo[] termoffsets = tvec.getOffsets(tvecindex);
- list.addAll(Arrays.asList(termoffsets));
+
+ List<TermVectorOffsetInfo> termOffsetInfo = new ArrayList<TermVectorOffsetInfo>();
+
+ Iterator<Term[]> it = queryTerms.iterator();
+ while (it.hasNext()) {
+ Term[] qt = it.next();
+ final int qtLen = qt.length;
+ if (qt == null || qtLen == 0) {
+ continue;
+ }
+ String[] qtText = new String[qtLen];
+ for (int i = 0; i < qtLen; i++) {
+ qtText[i] = qt[i].text();
+ }
+ int[] tvecindexes = tvec.indexesOf(qtText, 0, qtText.length);
+ Map<Integer, TermVectorOffsetInfo[]> localTermOffsetInfo = new HashMap<Integer, TermVectorOffsetInfo[]>();
+ for (int tvecindex : tvecindexes) {
+ TermVectorOffsetInfo[] termoffsets = tvec.getOffsets(tvecindex);
+ if (termoffsets == null || termoffsets.length == 0) {
+ continue;
+ }
+ localTermOffsetInfo.put(tvecindex, termoffsets);
+ }
+
+ // to keep the order of the keys, use tvecindexes;
+ // if a term is not found, its entry in tvecindexes[] is -1.
+ // when dealing with multiple terms that have to exist, just check
+ // if the first one is there
+ if (tvecindexes.length > 0 && tvecindexes[0] >= 0) {
+ // we have to build one interval TermVectorOffsetInfo for each
+ // hit;
+ List<TermVectorOffsetInfo> intervalTermOffsetInfo = new ArrayList<TermVectorOffsetInfo>();
+
+ // pick all the first key's hits as interval start
+ TermVectorOffsetInfo[] firstKeyTermOffsets = localTermOffsetInfo
+ .get(tvecindexes[0]);
+ Arrays.sort(firstKeyTermOffsets,
+ new TermVectorOffsetInfoSorter());
+ intervalTermOffsetInfo.addAll(Arrays
+ .asList(firstKeyTermOffsets));
+
+ // check if each key is part of an interval, if not, it is
+ // dropped from the list
+ for (int i = 1; i < tvecindexes.length; i++) {
+ final Integer key = tvecindexes[i];
+ TermVectorOffsetInfo[] termoffsets = localTermOffsetInfo
+ .get(key);
+ if (termoffsets == null) {
+ continue;
+ }
+ Arrays.sort(termoffsets, new TermVectorOffsetInfoSorter());
+
+ Iterator<TermVectorOffsetInfo> intervalIterator = intervalTermOffsetInfo
+ .iterator();
+
+ int index = 0;
+ while (intervalIterator.hasNext()) {
+ TermVectorOffsetInfo intervalOI = intervalIterator
+ .next();
+ if (index >= termoffsets.length) {
+ intervalIterator.remove();
+ continue;
+ }
+ boolean matchSearch = true;
+ boolean matchFound = false;
+ while (matchSearch) {
+ TermVectorOffsetInfo localOI = termoffsets[index];
+ // check interval match
+ // CJK languages will have the tokens from the PhraseQuery glued together (see LUCENE-2458)
+ int diff = localOI.getStartOffset()
+ - intervalOI.getEndOffset();
+ // TODO we'll probably have to remove 'diff == 0'
+ // after upgrading to lucene 3.1
+ if (diff == 1 || diff == 0) {
+ intervalOI.setEndOffset(localOI.getEndOffset());
+ matchSearch = false;
+ matchFound = true;
+ }
+ index++;
+ if (index >= termoffsets.length) {
+ matchSearch = false;
+ }
+ }
+ if (!matchFound) {
+ index--;
+ intervalIterator.remove();
+ }
+ }
+ }
+ termOffsetInfo.addAll(intervalTermOffsetInfo);
+ }
}
- TermVectorOffsetInfo[] offsets = list.toArray(new TermVectorOffsetInfo[list.size()]);
+ TermVectorOffsetInfo[] offsets = termOffsetInfo.toArray(new TermVectorOffsetInfo[termOffsetInfo.size()]);
// sort offsets
- if (terms.length > 1) {
+ if (offsets != null && offsets.length > 1) {
Arrays.sort(offsets, new TermVectorOffsetInfoSorter());
}
Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/WeightedHighlighter.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/WeightedHighlighter.java?rev=1173694&r1=1173693&r2=1173694&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/WeightedHighlighter.java (original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/WeightedHighlighter.java Wed Sep 21 14:55:18 2011
@@ -77,7 +77,7 @@ public class WeightedHighlighter extends
* highlighted
*/
public static String highlight(TermPositionVector tvec,
- Set<Term> queryTerms,
+ Set<Term[]> queryTerms,
String text,
String excerptStart,
String excerptEnd,
@@ -103,7 +103,7 @@ public class WeightedHighlighter extends
* highlighted
*/
public static String highlight(TermPositionVector tvec,
- Set<Term> queryTerms,
+ Set<Term[]> queryTerms,
String text,
int maxFragments,
int surround) throws IOException {
@@ -129,7 +129,7 @@ public class WeightedHighlighter extends
fragmentStart, fragmentEnd, surround * 2);
}
- PriorityQueue bestFragments = new FragmentInfoPriorityQueue(maxFragments);
+ PriorityQueue<FragmentInfo> bestFragments = new FragmentInfoPriorityQueue(maxFragments);
for (int i = 0; i < offsets.length; i++) {
if (offsets[i].getEndOffset() <= text.length()) {
FragmentInfo fi = new FragmentInfo(offsets[i], surround * 2);
@@ -336,7 +336,7 @@ public class WeightedHighlighter extends
}
- private static class FragmentInfoPriorityQueue extends PriorityQueue {
+ private static class FragmentInfoPriorityQueue extends PriorityQueue<FragmentInfo> {
public FragmentInfoPriorityQueue(int size) {
initialize(size);
@@ -350,9 +350,7 @@ public class WeightedHighlighter extends
* {@link FragmentInfo} with the best quality.
*/
@Override
- protected boolean lessThan(Object a, Object b) {
- FragmentInfo infoA = (FragmentInfo) a;
- FragmentInfo infoB = (FragmentInfo) b;
+ protected boolean lessThan(FragmentInfo infoA, FragmentInfo infoB) {
if (infoA.getQuality() == infoB.getQuality()) {
return infoA.getStartOffset() > infoB.getStartOffset();
}
Modified: jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/ExcerptTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/ExcerptTest.java?rev=1173694&r1=1173693&r2=1173694&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/ExcerptTest.java (original)
+++ jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/ExcerptTest.java Wed Sep 21 14:55:18 2011
@@ -16,9 +16,9 @@
*/
package org.apache.jackrabbit.core.query;
+import javax.jcr.Node;
import javax.jcr.RepositoryException;
import javax.jcr.Value;
-import javax.jcr.Node;
import javax.jcr.query.QueryResult;
import javax.jcr.query.Row;
import javax.jcr.query.RowIterator;
@@ -104,6 +104,10 @@ public class ExcerptTest extends Abstrac
"apache jackrabbit");
}
+ /**
+ * Verifies character encoding on a node property that does not contain any
+ * excerpt info
+ */
public void testEncodeIllegalCharsNoHighlights() throws RepositoryException {
String text = "bla <strong>bla</strong> bla";
String excerpt = createExcerpt("bla <strong>bla</strong> bla");
@@ -116,23 +120,126 @@ public class ExcerptTest extends Abstrac
QueryResult result = executeQuery(stmt);
RowIterator rows = result.getRows();
assertEquals(1, rows.getSize());
- assertEquals(excerpt, rows.nextRow().getValue("rep:excerpt(text)").getString());
+ String ex = rows.nextRow().getValue("rep:excerpt(text)").getString();
+ assertEquals("Expected " + excerpt + ", but got ", excerpt, ex);
}
+ /**
+ * Verifies character encoding on a node property that contains excerpt info
+ */
public void testEncodeIllegalCharsHighlights() throws RepositoryException {
- String text = "bla <strong>bla</strong> foo";
- String excerpt = createExcerpt("bla <strong>bla</strong> <strong>foo</strong>");
+ checkExcerpt("bla <strong>bla</strong> foo",
+ "bla <strong>bla</strong> <strong>foo</strong>",
+ "foo");
+ }
+
+ /**
+ * test for https://issues.apache.org/jira/browse/JCR-3077
+ *
+ * when given a quoted phrase, the excerpt should evaluate it whole as a
+ * token (not break it down)
+ *
+ */
+ public void testQuotedPhrase() throws RepositoryException {
+ checkExcerpt("one two three four",
+ "one <strong>two three</strong> four", "\"two three\"");
+ }
+
+ /**
+ * Verifies excerpt generation on a node property that does not contain any
+ * excerpt info for a quoted phrase
+ */
+ public void testQuotedPhraseNoMatch() throws RepositoryException {
+ String text = "one two three four";
+ String excerpt = createExcerpt("one two three four");
+ String terms = "\"five six\"";
+
Node n = testRootNode.addNode(nodeName1);
n.setProperty("text", text);
+ n.setProperty("other", terms);
superuser.save();
- String stmt = getStatement("foo");
+ String stmt = getStatement(terms);
QueryResult result = executeQuery(stmt);
RowIterator rows = result.getRows();
assertEquals(1, rows.getSize());
- assertEquals(excerpt, rows.nextRow().getValue("rep:excerpt(text)").getString());
+ String ex = rows.nextRow().getValue("rep:excerpt(text)").getString();
+ assertEquals("Expected " + excerpt + ", but got ", excerpt, ex);
}
+ /**
+ *
+ * Verifies excerpt generation on a node property that contains the exact
+ * quoted phrase but with scrambled words.
+ *
+ * More precisely, it checks that the order of tokens is respected for
+ * a quoted phrase.
+ */
+ public void testQuotedPhraseNoMatchScrambled() throws RepositoryException {
+ String text = "one two three four";
+ String excerpt = createExcerpt("one two three four");
+ String terms = "\"three two\"";
+
+ Node n = testRootNode.addNode(nodeName1);
+ n.setProperty("text", text);
+ n.setProperty("other", terms);
+ superuser.save();
+
+ String stmt = getStatement(terms);
+ QueryResult result = executeQuery(stmt);
+ RowIterator rows = result.getRows();
+ assertEquals(1, rows.getSize());
+ String ex = rows.nextRow().getValue("rep:excerpt(text)").getString();
+ assertEquals("Expected " + excerpt + ", but got ", excerpt, ex);
+ }
+
+ /**
+ * Verifies excerpt generation on a node property that does not contain the
+ * exact quoted phrase, but contains fragments of it.
+ *
+ */
+ public void testQuotedPhraseNoMatchGap() throws RepositoryException {
+ String text = "one two three four";
+ String excerpt = createExcerpt("one two three four");
+ String terms = "\"two four\"";
+
+ Node n = testRootNode.addNode(nodeName1);
+ n.setProperty("text", text);
+ n.setProperty("other", terms);
+ superuser.save();
+
+ String stmt = getStatement(terms);
+ QueryResult result = executeQuery(stmt);
+ RowIterator rows = result.getRows();
+ assertEquals(1, rows.getSize());
+ String ex = rows.nextRow().getValue("rep:excerpt(text)").getString();
+ assertEquals("Expected " + excerpt + ", but got ", excerpt, ex);
+ }
+
+ /**
+ * test for https://issues.apache.org/jira/browse/JCR-3077
+ *
+ * JA search acts as a PhraseQuery, thanks to LUCENE-2458, so it should be
+ * covered by the quoted phrase tests.
+ *
+ */
+ public void testHighlightJa() throws RepositoryException {
+
+ // http://translate.google.com/#auto|en|%E3%82%B3%E3%83%B3%E3%83%86%E3%83%B3%E3%83%88
+ String jContent = "\u30b3\u30fe\u30c6\u30f3\u30c8";
+ // http://translate.google.com/#auto|en|%E3%83%86%E3%82%B9%E3%83%88
+ String jTest = "\u30c6\u30b9\u30c8";
+
+ String content = "some text with japanese: " + jContent + " (content)"
+ + " and " + jTest + " (test).";
+
+ // expected excerpt; note this may change if excerpt providers change
+ String expectedExcerpt = "some text with japanese: " + jContent
+ + " (content) and <strong>" + jTest + "</strong> (test).";
+ checkExcerpt(content, expectedExcerpt, jTest);
+ }
+
+
private void checkExcerpt(String text, String fragmentText, String terms)
throws RepositoryException {
String excerpt = createExcerpt(fragmentText);
@@ -151,7 +258,7 @@ public class ExcerptTest extends Abstrac
private void createTestData(String text) throws RepositoryException {
Node n = testRootNode.addNode(nodeName1);
n.setProperty("text", text);
- testRootNode.save();
+ superuser.save();
}
private String getExcerpt(Row row) throws RepositoryException {