You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2013/07/02 16:17:26 UTC
svn commit: r1498949 - in /lucene/dev/branches/branch_4x: ./ lucene/
lucene/highlighter/
lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/
lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/
solr/ solr/core/ so...
Author: rmuir
Date: Tue Jul 2 14:17:26 2013
New Revision: 1498949
URL: http://svn.apache.org/r1498949
Log:
LUCENE-5087: Add getMultiValuedSeparator to PostingsHighlighter
Modified:
lucene/dev/branches/branch_4x/ (props changed)
lucene/dev/branches/branch_4x/lucene/ (props changed)
lucene/dev/branches/branch_4x/lucene/CHANGES.txt
lucene/dev/branches/branch_4x/lucene/highlighter/ (props changed)
lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java
lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java
lucene/dev/branches/branch_4x/solr/ (props changed)
lucene/dev/branches/branch_4x/solr/core/ (props changed)
lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java
Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1498949&r1=1498948&r2=1498949&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Tue Jul 2 14:17:26 2013
@@ -230,6 +230,11 @@ New Features
* SOLR-4565: Extend NorwegianLightStemFilter and NorwegianMinimalStemFilter
to handle "nynorsk" (Erlend Garåsen, janhoy via Robert Muir)
+* LUCENE-5087: Add getMultiValuedSeparator to PostingsHighlighter, for cases
+ where you want a different logical separator between field values. This can
+ be set to e.g. U+2029 PARAGRAPH SEPARATOR if you never want passages to span
+ values. (Mike McCandless, Robert Muir)
+
API Changes
* LUCENE-5077: Make it easier to use compressed norms. Lucene42NormsFormat takes
Modified: lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java?rev=1498949&r1=1498948&r2=1498949&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java Tue Jul 2 14:17:26 2013
@@ -369,7 +369,11 @@ public class PostingsHighlighter {
* identical to what was indexed. */
protected String[][] loadFieldValues(IndexSearcher searcher, String[] fields, int[] docids, int maxLength) throws IOException {
String contents[][] = new String[fields.length][docids.length];
- LimitedStoredFieldVisitor visitor = new LimitedStoredFieldVisitor(fields, maxLength);
+ char valueSeparators[] = new char[fields.length];
+ for (int i = 0; i < fields.length; i++) {
+ valueSeparators[i] = getMultiValuedSeparator(fields[i]);
+ }
+ LimitedStoredFieldVisitor visitor = new LimitedStoredFieldVisitor(fields, valueSeparators, maxLength);
for (int i = 0; i < docids.length; i++) {
searcher.doc(docids[i], visitor);
for (int j = 0; j < fields.length; j++) {
@@ -379,6 +383,16 @@ public class PostingsHighlighter {
}
return contents;
}
+
+ /**
+ * Returns the logical separator between values for multi-valued fields.
+ * The default value is a space character, which means passages can span across values,
+ * but a subclass can override, for example with {@code U+2029 PARAGRAPH SEPARATOR (PS)}
+ * if each value holds a discrete passage for highlighting.
+ */
+ protected char getMultiValuedSeparator(String field) {
+ return ' ';
+ }
private Map<Integer,String> highlightField(String field, String contents[], BreakIterator bi, BytesRef terms[], int[] docids, List<AtomicReaderContext> leaves, int maxPassages) throws IOException {
Map<Integer,String> highlights = new HashMap<Integer,String>();
@@ -652,12 +666,15 @@ public class PostingsHighlighter {
private static class LimitedStoredFieldVisitor extends StoredFieldVisitor {
private final String fields[];
+ private final char valueSeparators[];
private final int maxLength;
private final StringBuilder builders[];
private int currentField = -1;
- public LimitedStoredFieldVisitor(String fields[], int maxLength) {
+ public LimitedStoredFieldVisitor(String fields[], char valueSeparators[], int maxLength) {
+ assert fields.length == valueSeparators.length;
this.fields = fields;
+ this.valueSeparators = valueSeparators;
this.maxLength = maxLength;
builders = new StringBuilder[fields.length];
for (int i = 0; i < builders.length; i++) {
@@ -670,7 +687,7 @@ public class PostingsHighlighter {
assert currentField >= 0;
StringBuilder builder = builders[currentField];
if (builder.length() > 0 && builder.length() < maxLength) {
- builder.append(' '); // for the offset gap, TODO: make this configurable
+ builder.append(valueSeparators[currentField]);
}
if (builder.length() + value.length() > maxLength) {
builder.append(value, 0, maxLength - builder.length());
Modified: lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java?rev=1498949&r1=1498948&r2=1498949&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java Tue Jul 2 14:17:26 2013
@@ -921,4 +921,48 @@ public class TestPostingsHighlighter ext
ir.close();
dir.close();
}
+
+ /** customizing the gap separator to force a sentence break */
+ public void testGapSeparator() throws Exception {
+ Directory dir = newDirectory();
+ // use simpleanalyzer for more natural tokenization (else "test." is a token)
+ IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true));
+ iwc.setMergePolicy(newLogMergePolicy());
+ RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+
+ FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+ offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+ Document doc = new Document();
+
+ Field body1 = new Field("body", "", offsetsType);
+ body1.setStringValue("This is a multivalued field");
+ doc.add(body1);
+
+ Field body2 = new Field("body", "", offsetsType);
+ body2.setStringValue("This is something different");
+ doc.add(body2);
+
+ iw.addDocument(doc);
+
+ IndexReader ir = iw.getReader();
+ iw.close();
+
+ IndexSearcher searcher = newSearcher(ir);
+ PostingsHighlighter highlighter = new PostingsHighlighter() {
+ @Override
+ protected char getMultiValuedSeparator(String field) {
+ assert field.equals("body");
+ return '\u2029';
+ }
+ };
+ Query query = new TermQuery(new Term("body", "field"));
+ TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
+ assertEquals(1, topDocs.totalHits);
+ String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
+ assertEquals(1, snippets.length);
+ assertEquals("This is a multivalued <b>field</b>\u2029", snippets[0]);
+
+ ir.close();
+ dir.close();
+ }
}
Modified: lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java?rev=1498949&r1=1498948&r2=1498949&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java (original)
+++ lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java Tue Jul 2 14:17:26 2013
@@ -67,6 +67,7 @@ import org.apache.solr.util.plugin.Plugi
* <str name="hl.bs.variant"></str>
* <str name="hl.bs.type">SENTENCE</str>
* <int name="hl.maxAnalyzedChars">10000</int>
+ * <str name="hl.multiValuedSeparatorChar"> </str>
* </lst>
* </requestHandler>
* </pre>
@@ -96,6 +97,7 @@ import org.apache.solr.util.plugin.Plugi
* <li>hl.bs.country (string) specifies country code for BreakIterator. default is empty string (root locale)
* <li>hl.bs.variant (string) specifies country code for BreakIterator. default is empty string (root locale)
* <li>hl.maxAnalyzedChars specifies how many characters at most will be processed in a document.
+ * <li>hl.multiValuedSeparatorChar specifies the logical separator between values for multi-valued fields.
* NOTE: currently hl.maxAnalyzedChars cannot yet be specified per-field
* </ul>
*
@@ -167,6 +169,15 @@ public class PostingsSolrHighlighter ext
String type = params.getFieldParam(field, HighlightParams.BS_TYPE);
return parseBreakIterator(type, locale);
}
+
+ @Override
+ protected char getMultiValuedSeparator(String field) {
+ String sep = params.getFieldParam(field, HighlightParams.MULTI_VALUED_SEPARATOR, " ");
+ if (sep.length() != 1) {
+ throw new IllegalArgumentException(HighlightParams.MULTI_VALUED_SEPARATOR + " must be exactly one character.");
+ }
+ return sep.charAt(0);
+ }
};
Map<String,String[]> snippets = highlighter.highlightFields(fieldNames, query, searcher, docIDs, maxPassages);