You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2013/04/06 17:30:49 UTC
svn commit: r1465262 - in /lucene/dev/branches/branch_4x: ./ lucene/
lucene/highlighter/
lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/
lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/
solr/ solr/core/ so...
Author: rmuir
Date: Sat Apr 6 15:30:49 2013
New Revision: 1465262
URL: http://svn.apache.org/r1465262
Log:
SOLR-4684: add encoder config to PostingsSolrHighlighter
Modified:
lucene/dev/branches/branch_4x/ (props changed)
lucene/dev/branches/branch_4x/lucene/ (props changed)
lucene/dev/branches/branch_4x/lucene/highlighter/ (props changed)
lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageFormatter.java
lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java
lucene/dev/branches/branch_4x/solr/ (props changed)
lucene/dev/branches/branch_4x/solr/core/ (props changed)
lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java
lucene/dev/branches/branch_4x/solr/core/src/test/org/apache/solr/highlight/TestPostingsSolrHighlighter.java
Modified: lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageFormatter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageFormatter.java?rev=1465262&r1=1465261&r2=1465262&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageFormatter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageFormatter.java Sat Apr 6 15:30:49 2013
@@ -28,12 +28,13 @@ public class PassageFormatter {
private final String preTag;
private final String postTag;
private final String ellipsis;
+ private final boolean escape;
/**
* Creates a new PassageFormatter with the default tags.
*/
public PassageFormatter() {
- this("<b>", "</b>", "... ");
+ this("<b>", "</b>", "... ", false);
}
/**
@@ -41,14 +42,16 @@ public class PassageFormatter {
* @param preTag text which should appear before a highlighted term.
* @param postTag text which should appear after a highlighted term.
* @param ellipsis text which should be used to connect two unconnected passages.
+ * @param escape true if text should be html-escaped
*/
- public PassageFormatter(String preTag, String postTag, String ellipsis) {
+ public PassageFormatter(String preTag, String postTag, String ellipsis, boolean escape) {
if (preTag == null || postTag == null || ellipsis == null) {
throw new NullPointerException();
}
this.preTag = preTag;
this.postTag = postTag;
this.ellipsis = ellipsis;
+ this.escape = escape;
}
/**
@@ -74,19 +77,60 @@ public class PassageFormatter {
int end = passage.matchEnds[i];
// its possible to have overlapping terms
if (start > pos) {
- sb.append(content.substring(pos, start));
+ append(sb, content, pos, start);
}
if (end > pos) {
sb.append(preTag);
- sb.append(content.substring(Math.max(pos, start), end));
+ append(sb, content, Math.max(pos, start), end);
sb.append(postTag);
pos = end;
}
}
// its possible a "term" from the analyzer could span a sentence boundary.
- sb.append(content.substring(pos, Math.max(pos, passage.endOffset)));
+ append(sb, content, pos, Math.max(pos, passage.endOffset));
pos = passage.endOffset;
}
return sb.toString();
}
+
+ private void append(StringBuilder dest, String content, int start, int end) {
+ if (escape) {
+ // note: these are the rules from owasp.org
+ for (int i = start; i < end; i++) {
+ char ch = content.charAt(i);
+ switch(ch) {
+ case '&':
+ dest.append("&amp;");
+ break;
+ case '<':
+ dest.append("&lt;");
+ break;
+ case '>':
+ dest.append("&gt;");
+ break;
+ case '"':
+ dest.append("&quot;");
+ break;
+ case '\'':
+ dest.append("&#x27;");
+ break;
+ case '/':
+ dest.append("&#x2F;");
+ break;
+ default:
+ if (ch >= 0x30 && ch <= 0x39 || ch >= 0x41 && ch <= 0x5A || ch >= 0x61 && ch <= 0x7A) {
+ dest.append(ch);
+ } else if (ch < 0xff) {
+ dest.append("&#");
+ dest.append((int)ch);
+ dest.append(";");
+ } else {
+ dest.append(ch);
+ }
+ }
+ }
+ } else {
+ dest.append(content, start, end);
+ }
+ }
}
Modified: lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java?rev=1465262&r1=1465261&r2=1465262&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java Sat Apr 6 15:30:49 2013
@@ -848,4 +848,40 @@ public class TestPostingsHighlighter ext
ir.close();
dir.close();
}
+
+ public void testEncode() throws Exception {
+ Directory dir = newDirectory();
+ IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+ iwc.setMergePolicy(newLogMergePolicy());
+ RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+
+ FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+ offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+ Field body = new Field("body", "", offsetsType);
+ Document doc = new Document();
+ doc.add(body);
+
+ body.setStringValue("This is a test. Just a test highlighting from <i>postings</i>. Feel free to ignore.");
+ iw.addDocument(doc);
+
+ IndexReader ir = iw.getReader();
+ iw.close();
+
+ IndexSearcher searcher = newSearcher(ir);
+ PostingsHighlighter highlighter = new PostingsHighlighter() {
+ @Override
+ protected PassageFormatter getFormatter(String field) {
+ return new PassageFormatter("<b>", "</b>", "... ", true);
+ }
+ };
+ Query query = new TermQuery(new Term("body", "highlighting"));
+ TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
+ assertEquals(1, topDocs.totalHits);
+ String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
+ assertEquals(1, snippets.length);
+ assertEquals("Just&#32;a&#32;test&#32;<b>highlighting</b>&#32;from&#32;&lt;i&gt;postings&lt;&#x2F;i&gt;&#46;&#32;", snippets[0]);
+
+ ir.close();
+ dir.close();
+ }
}
Modified: lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java?rev=1465262&r1=1465261&r2=1465262&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java (original)
+++ lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java Sat Apr 6 15:30:49 2013
@@ -57,6 +57,7 @@ import org.apache.solr.util.plugin.Plugi
* <str name="hl.tag.post">&lt;/em&gt;</str>
* <str name="hl.tag.ellipsis">... </str>
* <bool name="hl.defaultSummary">true</bool>
+ * <str name="hl.encoder">simple</str>
* <float name="hl.score.k1">1.2</float>
* <float name="hl.score.b">0.75</float>
* <float name="hl.score.pivot">87</float>
@@ -85,6 +86,7 @@ import org.apache.solr.util.plugin.Plugi
* <li>hl.tag.post (string) specifies text which appears after a highlighted term.
* <li>hl.tag.ellipsis (string) specifies text which joins non-adjacent passages.
* <li>hl.defaultSummary (bool) specifies if a field should have a default summary.
+ * <li>hl.encoder (string) can be 'html' (html escapes content) or 'simple' (no escaping).
* <li>hl.score.k1 (float) specifies bm25 scoring parameter 'k1'
* <li>hl.score.b (float) specifies bm25 scoring parameter 'b'
* <li>hl.score.pivot (float) specifies bm25 scoring parameter 'avgdl'
@@ -143,7 +145,8 @@ public class PostingsSolrHighlighter ext
String preTag = params.getFieldParam(fieldName, HighlightParams.TAG_PRE, "<em>");
String postTag = params.getFieldParam(fieldName, HighlightParams.TAG_POST, "</em>");
String ellipsis = params.getFieldParam(fieldName, HighlightParams.TAG_ELLIPSIS, "... ");
- return new PassageFormatter(preTag, postTag, ellipsis);
+ String encoder = params.getFieldParam(fieldName, HighlightParams.ENCODER, "simple");
+ return new PassageFormatter(preTag, postTag, ellipsis, "html".equals(encoder));
}
@Override
Modified: lucene/dev/branches/branch_4x/solr/core/src/test/org/apache/solr/highlight/TestPostingsSolrHighlighter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/core/src/test/org/apache/solr/highlight/TestPostingsSolrHighlighter.java?rev=1465262&r1=1465261&r2=1465262&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/core/src/test/org/apache/solr/highlight/TestPostingsSolrHighlighter.java (original)
+++ lucene/dev/branches/branch_4x/solr/core/src/test/org/apache/solr/highlight/TestPostingsSolrHighlighter.java Sat Apr 6 15:30:49 2013
@@ -147,4 +147,12 @@ public class TestPostingsSolrHighlighter
req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "WHOLE"),
"//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='<em>Document</em> one has a first sentence. <em>Document</em> two has a second sentence.'");
}
+
+ public void testEncoder() {
+ assertU(adoc("text", "Document one has a first <i>sentence</i>.", "id", "103"));
+ assertU(commit());
+ assertQ("html escaped",
+ req("q", "text:document", "sort", "id asc", "hl", "true", "hl.encoder", "html"),
+ "//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='<em>Document</em>&#32;one&#32;has&#32;a&#32;first&#32;&lt;i&gt;sentence&lt;&#x2F;i&gt;&#46;'");
+ }
}