You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2013/04/06 17:30:49 UTC

svn commit: r1465262 - in /lucene/dev/branches/branch_4x: ./ lucene/ lucene/highlighter/ lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/ lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/ solr/ solr/core/ solr/core/src/java/org/apache/solr/highlight/ solr/core/src/test/org/apache/solr/highlight/

Author: rmuir
Date: Sat Apr  6 15:30:49 2013
New Revision: 1465262

URL: http://svn.apache.org/r1465262
Log:
SOLR-4684: add encoder config to PostingsSolrHighlighter

Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/lucene/   (props changed)
    lucene/dev/branches/branch_4x/lucene/highlighter/   (props changed)
    lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageFormatter.java
    lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java
    lucene/dev/branches/branch_4x/solr/   (props changed)
    lucene/dev/branches/branch_4x/solr/core/   (props changed)
    lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java
    lucene/dev/branches/branch_4x/solr/core/src/test/org/apache/solr/highlight/TestPostingsSolrHighlighter.java

Modified: lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageFormatter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageFormatter.java?rev=1465262&r1=1465261&r2=1465262&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageFormatter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageFormatter.java Sat Apr  6 15:30:49 2013
@@ -28,12 +28,13 @@ public class PassageFormatter {
   private final String preTag;
   private final String postTag;
   private final String ellipsis;
+  private final boolean escape;
   
   /**
    * Creates a new PassageFormatter with the default tags.
    */
   public PassageFormatter() {
-    this("<b>", "</b>", "... ");
+    this("<b>", "</b>", "... ", false);
   }
   
   /**
@@ -41,14 +42,16 @@ public class PassageFormatter {
    * @param preTag text which should appear before a highlighted term.
    * @param postTag text which should appear after a highlighted term.
    * @param ellipsis text which should be used to connect two unconnected passages.
+   * @param escape true if text should be html-escaped
    */
-  public PassageFormatter(String preTag, String postTag, String ellipsis) {
+  public PassageFormatter(String preTag, String postTag, String ellipsis, boolean escape) {
     if (preTag == null || postTag == null || ellipsis == null) {
       throw new NullPointerException();
     }
     this.preTag = preTag;
     this.postTag = postTag;
     this.ellipsis = ellipsis;
+    this.escape = escape;
   }
   
   /**
@@ -74,19 +77,60 @@ public class PassageFormatter {
         int end = passage.matchEnds[i];
         // its possible to have overlapping terms
         if (start > pos) {
-          sb.append(content.substring(pos, start));
+          append(sb, content, pos, start);
         }
         if (end > pos) {
           sb.append(preTag);
-          sb.append(content.substring(Math.max(pos, start), end));
+          append(sb, content, Math.max(pos, start), end);
           sb.append(postTag);
           pos = end;
         }
       }
       // its possible a "term" from the analyzer could span a sentence boundary.
-      sb.append(content.substring(pos, Math.max(pos, passage.endOffset)));
+      append(sb, content, pos, Math.max(pos, passage.endOffset));
       pos = passage.endOffset;
     }
     return sb.toString();
   }
+
+  private void append(StringBuilder dest, String content, int start, int end) {
+    if (escape) {
+      // note: these are the rules from owasp.org
+      for (int i = start; i < end; i++) {
+        char ch = content.charAt(i);
+        switch(ch) {
+          case '&': 
+            dest.append("&amp;");
+            break;
+          case '<':
+            dest.append("&lt;");
+            break;
+          case '>':
+            dest.append("&gt;");
+            break;
+          case '"':
+            dest.append("&quot;");
+            break;
+          case '\'':
+            dest.append("&#x27;");
+            break;
+          case '/':
+            dest.append("&#x2F;");
+            break;
+          default:
+            if (ch >= 0x30 && ch <= 0x39 || ch >= 0x41 && ch <= 0x5A || ch >= 0x61 && ch <= 0x7A) {
+              dest.append(ch);
+            } else if (ch < 0xff) {
+              dest.append("&#");
+              dest.append((int)ch);
+              dest.append(";");
+            } else {
+              dest.append(ch);
+            }
+        }
+      }
+    } else {
+      dest.append(content, start, end);
+    }
+  }
 }

Modified: lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java?rev=1465262&r1=1465261&r2=1465262&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java Sat Apr  6 15:30:49 2013
@@ -848,4 +848,40 @@ public class TestPostingsHighlighter ext
     ir.close();
     dir.close();
   }
+  
+  public void testEncode() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+    iwc.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+    
+    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+    Field body = new Field("body", "", offsetsType);
+    Document doc = new Document();
+    doc.add(body);
+    
+    body.setStringValue("This is a test. Just a test highlighting from <i>postings</i>. Feel free to ignore.");
+    iw.addDocument(doc);
+    
+    IndexReader ir = iw.getReader();
+    iw.close();
+    
+    IndexSearcher searcher = newSearcher(ir);
+    PostingsHighlighter highlighter = new PostingsHighlighter() {
+      @Override
+      protected PassageFormatter getFormatter(String field) {
+        return new PassageFormatter("<b>", "</b>", "... ", true);
+      }
+    };
+    Query query = new TermQuery(new Term("body", "highlighting"));
+    TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
+    assertEquals(1, topDocs.totalHits);
+    String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
+    assertEquals(1, snippets.length);
+    assertEquals("Just&#32;a&#32;test&#32;<b>highlighting</b>&#32;from&#32;&lt;i&gt;postings&lt;&#x2F;i&gt;&#46;&#32;", snippets[0]);
+    
+    ir.close();
+    dir.close();
+  }
 }

Modified: lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java?rev=1465262&r1=1465261&r2=1465262&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java (original)
+++ lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java Sat Apr  6 15:30:49 2013
@@ -57,6 +57,7 @@ import org.apache.solr.util.plugin.Plugi
  *       &lt;str name="hl.tag.post"&gt;&amp;lt;/em&amp;gt;&lt;/str&gt;
  *       &lt;str name="hl.tag.ellipsis"&gt;... &lt;/str&gt;
  *       &lt;bool name="hl.defaultSummary"&gt;true&lt;/bool&gt;
+ *       &lt;str name="hl.encoder"&gt;simple&lt;/str&gt;
  *       &lt;float name="hl.score.k1"&gt;1.2&lt;/float&gt;
  *       &lt;float name="hl.score.b"&gt;0.75&lt;/float&gt;
  *       &lt;float name="hl.score.pivot"&gt;87&lt;/float&gt;
@@ -85,6 +86,7 @@ import org.apache.solr.util.plugin.Plugi
  *    <li>hl.tag.post (string) specifies text which appears after a highlighted term.
  *    <li>hl.tag.ellipsis (string) specifies text which joins non-adjacent passages.
  *    <li>hl.defaultSummary (bool) specifies if a field should have a default summary.
+ *    <li>hl.encoder (string) can be 'html' (html escapes content) or 'simple' (no escaping).
  *    <li>hl.score.k1 (float) specifies bm25 scoring parameter 'k1'
  *    <li>hl.score.b (float) specifies bm25 scoring parameter 'b'
  *    <li>hl.score.pivot (float) specifies bm25 scoring parameter 'avgdl'
@@ -143,7 +145,8 @@ public class PostingsSolrHighlighter ext
           String preTag = params.getFieldParam(fieldName, HighlightParams.TAG_PRE, "<em>");
           String postTag = params.getFieldParam(fieldName, HighlightParams.TAG_POST, "</em>");
           String ellipsis = params.getFieldParam(fieldName, HighlightParams.TAG_ELLIPSIS, "... ");
-          return new PassageFormatter(preTag, postTag, ellipsis);
+          String encoder = params.getFieldParam(fieldName, HighlightParams.ENCODER, "simple");
+          return new PassageFormatter(preTag, postTag, ellipsis, "html".equals(encoder));
         }
 
         @Override

Modified: lucene/dev/branches/branch_4x/solr/core/src/test/org/apache/solr/highlight/TestPostingsSolrHighlighter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/core/src/test/org/apache/solr/highlight/TestPostingsSolrHighlighter.java?rev=1465262&r1=1465261&r2=1465262&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/core/src/test/org/apache/solr/highlight/TestPostingsSolrHighlighter.java (original)
+++ lucene/dev/branches/branch_4x/solr/core/src/test/org/apache/solr/highlight/TestPostingsSolrHighlighter.java Sat Apr  6 15:30:49 2013
@@ -147,4 +147,12 @@ public class TestPostingsSolrHighlighter
         req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "WHOLE"),
         "//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='<em>Document</em> one has a first sentence. <em>Document</em> two has a second sentence.'");
   }
+  
+  public void testEncoder() {
+    assertU(adoc("text", "Document one has a first <i>sentence</i>.", "id", "103"));
+    assertU(commit());
+    assertQ("html escaped", 
+        req("q", "text:document", "sort", "id asc", "hl", "true", "hl.encoder", "html"),
+        "//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='<em>Document</em>&#32;one&#32;has&#32;a&#32;first&#32;&lt;i&gt;sentence&lt;&#x2F;i&gt;&#46;'");
+  }
 }