You are viewing a plain text version of this content. The canonical version is in the original mailing-list archive (the hyperlink was not preserved in this plain-text copy).
Posted to commits@lucene.apache.org by sa...@apache.org on 2012/01/24 16:51:57 UTC

svn commit: r1235308 [4/5] - in /lucene/dev/branches/branch_3x: lucene/ lucene/contrib/analyzers/common/ lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/ lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis...

Copied: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java (from r1234452, lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/HTMLStripCharFilterTest.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java?p2=lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java&p1=lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/HTMLStripCharFilterTest.java&r1=1234452&r2=1235308&rev=1235308&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/HTMLStripCharFilterTest.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java Tue Jan 24 15:51:55 2012
@@ -1,4 +1,4 @@
-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.charfilter;
 
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -18,24 +18,23 @@ package org.apache.solr.analysis;
  */
 
 import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileReader;
 import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
 import java.io.Reader;
 import java.io.StringReader;
+import java.util.Arrays;
 import java.util.HashSet;
 import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.CharReader;
-import org.apache.lucene.analysis.ReusableAnalyzerBase;
-
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.ReusableAnalyzerBase;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents;
 import org.apache.lucene.analysis.Tokenizer;
-import org.junit.Ignore;
-
-import org.apache.solr.SolrTestCaseJ4;
+import org.apache.lucene.util._TestUtil;
 
 public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
 
@@ -45,9 +44,9 @@ public class HTMLStripCharFilterTest ext
     String html = "<div class=\"foo\">this is some text</div> here is a <a href=\"#bar\">link</a> and " +
             "another <a href=\"http://lucene.apache.org/\">link</a>. " +
             "This is an entity: &amp; plus a &lt;.  Here is an &. <!-- is a comment -->";
-    String gold = " this is some text  here is a  link  and " +
-            "another  link . " +
-            "This is an entity: & plus a <.  Here is an &.  ";
+    String gold = "\nthis is some text\n here is a link and " +
+            "another link. " +
+            "This is an entity: & plus a <.  Here is an &. ";
     HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new StringReader(html)));
     StringBuilder builder = new StringBuilder();
     int ch = -1;
@@ -60,13 +59,14 @@ public class HTMLStripCharFilterTest ext
               + " Buffer so far: " + builder + "<EOB>", theChar == goldArray[position]);
       position++;
     }
-    assertEquals(gold, builder.toString());
+    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+                 gold, builder.toString());
   }
 
   //Some sanity checks, but not a full-fledged check
   public void testHTML() throws Exception {
-
-    HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new FileReader(SolrTestCaseJ4.getFile("htmlStripReaderTest.html"))));
+    InputStream stream = getClass().getResourceAsStream("htmlStripReaderTest.html");
+    HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new InputStreamReader(stream, "UTF-8")));
     StringBuilder builder = new StringBuilder();
     int ch = -1;
     while ((ch = reader.read()) != -1){
@@ -81,6 +81,24 @@ public class HTMLStripCharFilterTest ext
     
   }
 
+  public void testMSWord14GeneratedHTML() throws Exception {
+    InputStream stream = getClass().getResourceAsStream("MS-Word 14 generated.htm");
+    HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new InputStreamReader(stream, "UTF-8")));
+    String gold = "This is a test";
+    StringBuilder builder = new StringBuilder();
+    int ch = 0;
+    try {
+      while ((ch = reader.read()) != -1){
+        builder.append((char)ch);
+      }
+    } finally {
+      // System.out.println("String: " + builder.toString());
+    }
+    assertEquals("'" + builder.toString().trim() + "' is not equal to '" + gold + "'",
+                 gold, builder.toString().trim());
+  }
+  
+  
   public void testGamma() throws Exception {
     String test = "&Gamma;";
     String gold = "\u0393";
@@ -93,9 +111,7 @@ public class HTMLStripCharFilterTest ext
       builder.append((char)ch);
     }
     String result = builder.toString();
-    // System.out.println("Resu: " + result + "<EOL>");
-    // System.out.println("Gold: " + gold + "<EOL>");
-    assertTrue(result + " is not equal to " + gold + "<EOS>", result.equals(gold) == true);
+    assertEquals("'" + result + "' is not equal to '" + gold + "<EOS>'", gold, result);
   }
 
   public void testEntities() throws Exception {
@@ -110,9 +126,7 @@ public class HTMLStripCharFilterTest ext
       builder.append((char)ch);
     }
     String result = builder.toString();
-    // System.out.println("Resu: " + result + "<EOL>");
-    // System.out.println("Gold: " + gold + "<EOL>");
-    assertTrue(result + " is not equal to " + gold + "<EOS>", result.equals(gold) == true);
+    assertEquals("'" + result + "' is not equal to '" + gold + "<EOS>'", gold, result);
   }
 
   public void testMoreEntities() throws Exception {
@@ -127,9 +141,7 @@ public class HTMLStripCharFilterTest ext
       builder.append((char)ch);
     }
     String result = builder.toString();
-    // System.out.println("Resu: " + result + "<EOL>");
-    // System.out.println("Gold: " + gold + "<EOL>");
-    assertTrue(result + " is not equal to " + gold, result.equals(gold) == true);
+    assertEquals("'" + result + "' is not equal to '" + gold + "<EOS>'", gold, result);
   }
 
   public void testReserved() throws Exception {
@@ -151,45 +163,248 @@ public class HTMLStripCharFilterTest ext
   }
 
   public void testMalformedHTML() throws Exception {
-    String test = "a <a hr<ef=aa<a>> </close</a>";
-    String gold = "a <a hr<ef=aa > </close ";
-    Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
-    StringBuilder builder = new StringBuilder();
-    int ch = 0;
-    while ((ch = reader.read()) != -1){
-      builder.append((char)ch);
+    String[] testGold = {
+        "a <a hr<ef=aa<a>> </close</a>",
+        "a <a hr<ef=aa> </close",
+
+        "<a href=http://dmoz.org/cgi-bin/add.cgi?where=/arts/\" class=lu style=\"font-size: 9px\" target=dmoz>Submit a Site</a>",
+        "Submit a Site",
+
+        "<a href=javascript:ioSwitch('p8','http://www.csmonitor.com/') title=expand id=e8 class=expanded rel=http://www.csmonitor.com/>Christian Science",
+        "Christian Science",
+
+        "<link rel=\"alternate\" type=\"application/rss+xml\" title=\"San Francisco \" 2008 RSS Feed\" href=\"http://2008.sf.wordcamp.org/feed/\" />",
+        "\n",
+
+        // "<" before ">" inhibits tag recognition
+        "<a href=\" http://www.surgery4was.happyhost.org/video-of-arthroscopic-knee-surgery symptoms.html, heat congestive heart failure <a href=\" http://www.symptoms1bad.happyhost.org/canine",
+        "<a href=\" http://www.surgery4was.happyhost.org/video-of-arthroscopic-knee-surgery symptoms.html, heat congestive heart failure <a href=\" http://www.symptoms1bad.happyhost.org/canine",
+
+        "<a href=\"http://ucblibraries.colorado.edu/how/index.htm\"class=\"pageNavAreaText\">",
+        "",
+
+        "<link title=\"^\\\" 21Sta's Blog\" rel=\"search\"  type=\"application/opensearchdescription+xml\"  href=\"http://21sta.com/blog/inc/opensearch.php\" />",
+        "\n",
+
+        "<a href=\"#postcomment\" title=\"\"Leave a comment\";\">?",
+        "?",
+
+        "<a href='/modern-furniture'   ' id='21txt' class='offtab'   onMouseout=\"this.className='offtab';  return true;\" onMouseover=\"this.className='ontab';  return true;\">",
+        "",
+
+        "<a href='http://alievi.wordpress.com/category/01-todos-posts/' style='font-size: 275%; padding: 1px; margin: 1px;' title='01 - Todos Post's (83)'>",
+        "",
+
+        "The <a href=<a href=\"http://www.advancedmd.com>medical\">http://www.advancedmd.com>medical</a> practice software</a>",
+        "The <a href=medical\">http://www.advancedmd.com>medical practice software",
+
+        "<a href=\"node/21426\" class=\"clipTitle2\" title=\"Levi.com/BMX 2008 Clip of the Week 29 \"Morgan Wade Leftover Clips\"\">Levi.com/BMX 2008 Clip of the Week 29...",
+        "Levi.com/BMX 2008 Clip of the Week 29...",
+
+        "<a href=\"printer_friendly.php?branch=&year=&submit=go&screen=\";\">Printer Friendly",
+        "Printer Friendly",
+
+        "<a href=#\" ondragstart=\"return false\" onclick=\"window.external.AddFavorite('http://www.amazingtextures.com', 'Amazing Textures');return false\" onmouseover=\"window.status='Add to Favorites';return true\">Add to Favorites",
+        "Add to Favorites",
+
+        "<a href=\"../at_home/at_home_search.html\"../_home/at_home_search.html\">At",
+        "At",
+
+        "E-mail: <a href=\"\"mailto:XXXXXX@example.com\" \">XXXXXX@example.com </a>",
+        "E-mail: XXXXXX@example.com ",
+
+        "<li class=\"farsi\"><a title=\"A'13?\" alt=\"A'13?\" href=\"http://www.america.gov/persian\" alt=\"\" name=\"A'13?\"A'13? title=\"A'13?\">A'13?</a></li>",
+        "\nA'13?\n",
+
+        "<li><a href=\"#28\" title=\"Hubert \"Geese\" Ausby\">Hubert \"Geese\" Ausby</a></li>",
+        "\nHubert \"Geese\" Ausby\n",
+
+        "<href=\"http://anbportal.com/mms/login.asp\">",
+        "\n",
+
+        "<a href=\"",
+        "<a href=\"",
+
+        "<a href=\">",
+        "",
+
+        "<a rel=\"nofollow\" href=\"http://anissanina31.skyrock.com/1895039493-Hi-tout-le-monde.html\" title=\" Hi, tout le monde !>#</a>",
+        "#",
+
+        "<a href=\"http://annunciharleydavidsonusate.myblog.it/\" title=\"Annunci Moto e Accessori Harley Davidson\" target=\"_blank\"><img src=\"http://annunciharleydavidsonusate.myblog.it/images/Antipixel.gif\" /></a>",
+        "",
+
+        "<a href=\"video/addvideo&v=120838887181\" onClick=\"return confirm('Are you sure you want  add this video to your profile? If it exists some video in your profile will be overlapped by this video!!')\" \" onmouseover=\"this.className='border2'\" onmouseout=\"this.className=''\">",
+        "",
+
+        "<a href=#Services & Support>",
+        "",
+
+        // "<" and ">" chars are accepted in on[Event] attribute values
+        "<input type=\"image\" src=\"http://apologyindex.com/ThemeFiles/83401-72905/images/btn_search.gif\"value=\"Search\" name=\"Search\" alt=\"Search\" class=\"searchimage\" onclick=\"incom ='&sc=' + document.getElementById('sel').value ; var dt ='&dt=' + document.getElementById('dt').value; var searchKeyword = document.getElementById('q').value ; searchKeyword = searchKeyword.replace(/\\s/g,''); if (searchKeyword.length < 3){alert('Nothing to search. Search keyword should contain atleast 3 chars.'); return false; } var al='&al=' +  document.getElementById('advancedlink').style.display ;  document.location.href='http://apologyindex.com/search.aspx?q=' + document.getElementById('q').value + incom + dt + al;\" />",
+        "",
+
+        "<input type=\"image\" src=\"images/afbe.gif\" width=\"22\" height=\"22\"  hspace=\"4\" title=\"Add to Favorite\" alt=\"Add to Favorite\"onClick=\" if(window.sidebar){ window.sidebar.addPanel(document.title,location.href,''); }else if(window.external){ window.external.AddFavorite(location.href,document.title); }else if(window.opera&&window.print) { return true; }\">",
+        "",
+
+        "<area shape=\"rect\" coords=\"12,153,115,305\" href=\"http://statenislandtalk.com/v-web/gallery/Osmundsen-family\"Art's Norwegian Roots in Rogaland\">",
+        "\n",
+
+        "<a rel=\"nofollow\" href=\"http://arth26.skyrock.com/660188240-bonzai.html\" title=\"bonza>#",
+        "#",
+
+        "<a href=  >",
+        "",
+
+        "<ahref=http:..",
+        "<ahref=http:..",
+
+        "<ahref=http:..>",
+        "\n",
+
+        "<ahref=\"http://aseigo.bddf.ca/cms/1025\">A",
+        "\nA",
+
+        "<a href=\"javascript:calendar_window=window.open('/calendar.aspx?formname=frmCalendar.txtDate','calendar_window','width=154,height=188');calendar_window.focus()\">",
+        "",
+
+        "<a href=\"/applications/defenseaerospace/19+rackmounts\" title=\"19\" Rackmounts\">",
+        "",
+
+        "<a href=http://www.azimprimerie.fr/flash/backup/lewes-zip-code/savage-model-110-manual.html title=savage model 110 manual rel=dofollow>",
+        "",
+
+        "<a class=\"at\" name=\"Lamborghini  href=\"http://lamborghini.coolbegin.com\">Lamborghini /a>",
+        "Lamborghini /a>",
+
+        "<A href='newslink.php?news_link=http%3A%2F%2Fwww.worldnetdaily.com%2Findex.php%3Ffa%3DPAGE.view%26pageId%3D85729&news_title=Florida QB makes 'John 3:16' hottest Google search Tebow inscribed Bible reference on eye black for championship game' TARGET=_blank>",
+        "",
+
+        "<a href=/myspace !style='color:#993333'>",
+        "",
+
+        "<meta name=3DProgId content=3DExcel.Sheet>",
+        "\n",
+
+        "<link id=3D\"shLink\" href=3D\"PSABrKelly-BADMINTONCupResults08FINAL2008_09_19=_files/sheet004.htm\">",
+        "\n",
+
+        "<td bgcolor=3D\"#FFFFFF\" nowrap>",
+        "\n",
+
+        "<a href=\"http://basnect.info/usersearch/\"predicciones-mundiales-2009\".html\">\"predicciones mundiales 2009\"</a>",
+        "\"predicciones mundiales 2009\"",
+
+        "<a class=\"comment-link\" href=\"https://www.blogger.com/comment.g?blogID=19402125&postID=114070605958684588\"location.href=https://www.blogger.com/comment.g?blogID=19402125&postID=114070605958684588;>",
+        "",
+
+        "<a href = \"/videos/Bishop\"/\" title = \"click to see more Bishop\" videos\">Bishop\"</a>",
+        "Bishop\"",
+
+        "<a href=\"http://bhaa.ie/calendar/event.php?eid=20081203150127531\"\">BHAA Eircom 2 &amp; 5 miles CC combined start</a>",
+        "BHAA Eircom 2 & 5 miles CC combined start",
+
+        "<a href=\"http://people.tribe.net/wolfmana\" onClick='setClick(\"Application[tribe].Person[bb7df210-9dc0-478c-917f-436b896bcb79]\")'\" title=\"Mana\">",
+        "",
+
+        "<a  href=\"http://blog.edu-cyberpg.com/ct.ashx?id=6143c528-080c-4bb2-b765-5ec56c8256d3&url=http%3a%2f%2fwww.gsa.ac.uk%2fmackintoshsketchbook%2f\"\" eudora=\"autourl\">",
+        "",
+
+        // "<" before ">" inhibits tag recognition
+        "<input type=\"text\" value=\"<search here>\">",
+        "<input type=\"text\" value=\"\n\">",
+
+        "<input type=\"text\" value=\"<search here\">",
+        "<input type=\"text\" value=\"\n",
+
+        "<input type=\"text\" value=\"search here>\">",
+        "\">",
+
+        // "<" and ">" chars are accepted in on[Event] attribute values
+        "<input type=\"text\" value=\"&lt;search here&gt;\" onFocus=\"this.value='<search here>'\">",
+        "",
+
+        "<![if ! IE]>\n<link href=\"http://i.deviantart.com/icons/favicon.png\" rel=\"shortcut icon\"/>\n<![endif]>",
+        "\n\n\n",
+
+        "<![if supportMisalignedColumns]>\n<tr height=0 style='display:none'>\n<td width=64 style='width:48pt'></td>\n</tr>\n<![endif]>",
+        "\n\n\n\n\n\n\n\n",
+    };
+    for (int i = 0 ; i < testGold.length ; i += 2) {
+      String test = testGold[i];
+      String gold = testGold[i + 1];
+      Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+      StringBuilder builder = new StringBuilder();
+      int ch = 0;
+      while ((ch = reader.read()) != -1){
+        builder.append((char)ch);
+      }
+      String result = builder.toString();
+      assertEquals("Test: '" + test + "'", gold, result);
     }
-    String result = builder.toString();
-    // System.out.println("Resu: " + result + "<EOL>");
-    // System.out.println("Gold: " + gold + "<EOL>");
-    assertTrue(result + " is not equal to " + gold + "<EOS>", result.equals(gold) == true);
   }
 
+
   public void testBufferOverflow() throws Exception {
-    StringBuilder testBuilder = new StringBuilder(HTMLStripCharFilter.DEFAULT_READ_AHEAD + 50);
+    StringBuilder testBuilder = new StringBuilder(HTMLStripCharFilter.getInitialBufferSize() + 50);
     testBuilder.append("ah<?> ??????");
-    appendChars(testBuilder, HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
+    appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
     processBuffer(testBuilder.toString(), "Failed on pseudo proc. instr.");//processing instructions
 
     testBuilder.setLength(0);
     testBuilder.append("<!--");//comments
-    appendChars(testBuilder, 3*HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);//comments have two lookaheads
+    appendChars(testBuilder, 3 * HTMLStripCharFilter.getInitialBufferSize() + 500);//comments have two lookaheads
 
     testBuilder.append("-->foo");
-    processBuffer(testBuilder.toString(), "Failed w/ comment");
+    String gold = "foo";
+    Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(testBuilder.toString())));
+    int ch = 0;
+    StringBuilder builder = new StringBuilder();
+    try {
+      while ((ch = reader.read()) != -1){
+        builder.append((char)ch);
+      }
+    } finally {
+      // System.out.println("String: " + builder.toString());
+    }
+    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+                 gold, builder.toString());
 
     testBuilder.setLength(0);
     testBuilder.append("<?");
-    appendChars(testBuilder, HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
+    appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
     testBuilder.append("?>");
-    processBuffer(testBuilder.toString(), "Failed with proc. instr.");
+    gold = "";
+    reader = new HTMLStripCharFilter(CharReader.get(new StringReader(testBuilder.toString())));
+    ch = 0;
+    builder = new StringBuilder();
+    try {
+      while ((ch = reader.read()) != -1){
+        builder.append((char)ch);
+      }
+    } finally {
+      // System.out.println("String: " + builder.toString());
+    }
+    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+                 gold, builder.toString());
     
     testBuilder.setLength(0);
     testBuilder.append("<b ");
-    appendChars(testBuilder, HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
+    appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
     testBuilder.append("/>");
-    processBuffer(testBuilder.toString(), "Failed on tag");
-
+    gold = "";
+    reader = new HTMLStripCharFilter(CharReader.get(new StringReader(testBuilder.toString())));
+    ch = 0;
+    builder = new StringBuilder();
+    try {
+      while ((ch = reader.read()) != -1){
+        builder.append((char)ch);
+      }
+    } finally {
+      // System.out.println("String: " + builder.toString());
+    }
+    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+                 gold, builder.toString());
   }
 
   private void appendChars(StringBuilder testBuilder, int numChars) {
@@ -212,13 +427,14 @@ public class HTMLStripCharFilterTest ext
     } finally {
       // System.out.println("String (trimmed): " + builder.toString().trim() + "<EOS>");
     }
-    assertTrue(assertMsg + "::: " + builder.toString() + " is not equal to " + test, builder.toString().equals(test) == true);
+    assertEquals(assertMsg + "::: " + builder.toString() + " is not equal to " + test,
+        test, builder.toString());
   }
 
   public void testComment() throws Exception {
 
     String test = "<!--- three dashes, still a valid comment ---> ";
-    String gold = "  ";
+    String gold = " ";
     Reader reader = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(test))));//force the use of BufferedReader
     int ch = 0;
     StringBuilder builder = new StringBuilder();
@@ -229,7 +445,8 @@ public class HTMLStripCharFilterTest ext
     } finally {
       // System.out.println("String: " + builder.toString());
     }
-    assertTrue(builder.toString() + " is not equal to " + gold + "<EOS>", builder.toString().equals(gold) == true);
+    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+                 gold, builder.toString());
   }
 
 
@@ -251,15 +468,32 @@ public class HTMLStripCharFilterTest ext
   }
 
   public void testOffsets() throws Exception {
-    doTestOffsets("hello X how X are you");
+//    doTestOffsets("hello X how X are you");
     doTestOffsets("hello <p> X<p> how <p>X are you");
     doTestOffsets("X &amp; X &#40; X &lt; &gt; X");
 
     // test backtracking
     doTestOffsets("X < &zz >X &# < X > < &l > &g < X");
   }
-  
-  @Ignore("broken offsets: see LUCENE-2208")
+
+  static void assertLegalOffsets(String in) throws Exception {
+    int length = in.length();
+    HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(in))));
+    int ch = 0;
+    int off = 0;
+    while ((ch = reader.read()) != -1) {
+      int correction = reader.correctOffset(off);
+      assertTrue("invalid offset correction: " + off + "->" + correction + " for doc of length: " + length,
+          correction <= length);
+      off++;
+    }
+  }
+
+  public void testLegalOffsets() throws Exception {
+    assertLegalOffsets("hello world");
+    assertLegalOffsets("hello &#x world");
+  }
+
   public void testRandom() throws Exception {
     Analyzer analyzer = new ReusableAnalyzerBase() {
 
@@ -271,11 +505,361 @@ public class HTMLStripCharFilterTest ext
 
       @Override
       protected Reader initReader(Reader reader) {
-        return new HTMLStripCharFilter(CharReader.get(new BufferedReader(reader)));
+        return new HTMLStripCharFilter(CharReader.get(reader));
       }
     };
     
     int numRounds = RANDOM_MULTIPLIER * 10000;
     checkRandomData(random, analyzer, numRounds);
   }
+  
+  public void testServerSideIncludes() throws Exception {
+    String test = "one<img src=\"image.png\"\n"
+        + " alt =  \"Alt: <!--#echo var='${IMAGE_CAPTION:<!--comment-->\\'Comment\\'}'  -->\"\n\n"
+        + " title=\"Title: <!--#echo var=\"IMAGE_CAPTION\"-->\">two";
+    String gold = "onetwo";
+    Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+    int ch = 0;
+    StringBuilder builder = new StringBuilder();
+    try {
+      while ((ch = reader.read()) != -1){
+        builder.append((char)ch);
+      }
+    } finally {
+      // System.out.println("String: " + builder.toString());
+    }
+    assertTrue(builder.toString() + " is not equal to " + gold, builder.toString().equals(gold));
+
+    test = "one<script><!-- <!--#config comment=\"<!-- \\\"comment\\\"-->\"--> --></script>two";
+    gold = "one\ntwo";
+    reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+    ch = 0;
+    builder = new StringBuilder();
+    try {
+      while ((ch = reader.read()) != -1){
+        builder.append((char)ch);
+      }
+    } finally {
+      // System.out.println("String: " + builder.toString());
+    }
+    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+                 gold, builder.toString());
+  }
+
+  public void testScriptQuotes() throws Exception {
+    String test = "one<script attr= bare><!-- action('<!-- comment -->', \"\\\"-->\\\"\"); --></script>two";
+    String gold = "one\ntwo";
+    Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+    int ch = 0;
+    StringBuilder builder = new StringBuilder();
+    try {
+      while ((ch = reader.read()) != -1){
+        builder.append((char)ch);
+      }
+    } finally {
+      // System.out.println("String: " + builder.toString());
+    }
+    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+        gold, builder.toString());
+
+    test = "hello<script><!-- f('<!--internal--></script>'); --></script>";
+    gold = "hello\n";
+    reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+    ch = 0;
+    builder = new StringBuilder();
+    try {
+      while ((ch = reader.read()) != -1){
+        builder.append((char)ch);
+      }
+    } finally {
+      // System.out.println("String: " + builder.toString());
+    }
+    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+                 gold, builder.toString());
+  }
+
+  public void testEscapeScript() throws Exception {
+    String test = "one<script no-value-attr>callSomeMethod();</script>two";
+    String gold = "one<script no-value-attr></script>two";
+    Set<String> escapedTags = new HashSet<String>(Arrays.asList("SCRIPT"));
+    Reader reader = new HTMLStripCharFilter
+        (CharReader.get(new StringReader(test)), escapedTags);
+    int ch = 0;
+    StringBuilder builder = new StringBuilder();
+    try {
+      while ((ch = reader.read()) != -1){
+        builder.append((char)ch);
+      }
+    } finally {
+      // System.out.println("String: " + builder.toString());
+    }
+    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+                 gold, builder.toString());
+  }
+
+  public void testStyle() throws Exception {
+    String test = "one<style type=\"text/css\">\n"
+                + "<!--\n"
+                + "@import url('http://www.lasletrasdecanciones.com/css.css');\n"
+                + "-->\n"
+                + "</style>two";
+    String gold = "one\ntwo";
+    Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+    int ch = 0;
+    StringBuilder builder = new StringBuilder();
+    try {
+      while ((ch = reader.read()) != -1){
+        builder.append((char)ch);
+      }
+    } finally {
+      // System.out.println("String: " + builder.toString());
+    }
+    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+        gold, builder.toString());
+  }
+
+  public void testEscapeStyle() throws Exception {
+    String test = "one<style type=\"text/css\"> body,font,a { font-family:arial; } </style>two";
+    String gold = "one<style type=\"text/css\"></style>two";
+    Set<String> escapedTags = new HashSet<String>(Arrays.asList("STYLE"));
+    Reader reader = new HTMLStripCharFilter
+        (CharReader.get(new StringReader(test)), escapedTags);
+    int ch = 0;
+    StringBuilder builder = new StringBuilder();
+    try {
+      while ((ch = reader.read()) != -1){
+        builder.append((char)ch);
+      }
+    } finally {
+      // System.out.println("String: " + builder.toString());
+    }
+    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+        gold, builder.toString());
+  }
+
+  public void testBR() throws Exception {
+    String[] testGold = {
+        "one<BR />two<br>three",
+        "one\ntwo\nthree",
+
+        "one<BR some stuff here too>two</BR>",
+        "one\ntwo\n",
+    };
+    for (int i = 0 ; i < testGold.length ; i += 2) {
+      String test = testGold[i];
+      String gold = testGold[i + 1];
+      Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+      StringBuilder builder = new StringBuilder();
+      int ch = 0;
+      while ((ch = reader.read()) != -1){
+        builder.append((char)ch);
+      }
+      String result = builder.toString();
+      assertEquals("Test: '" + test + "'", gold, result);
+    }
+  }
+  public void testEscapeBR() throws Exception {
+    String test = "one<BR class='whatever'>two</\nBR\n>";
+    String gold = "one<BR class='whatever'>two</\nBR\n>";
+    Set<String> escapedTags = new HashSet<String>(Arrays.asList("BR"));
+    Reader reader = new HTMLStripCharFilter
+        (CharReader.get(new StringReader(test)), escapedTags);
+    int ch = 0;
+    StringBuilder builder = new StringBuilder();
+    try {
+      while ((ch = reader.read()) != -1){
+        builder.append((char)ch);
+      }
+    } finally {
+      // System.out.println("String: " + builder.toString());
+    }
+    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+                 gold, builder.toString());
+  }
+  
+  public void testInlineTagsNoSpace() throws Exception {
+    String test = "one<sPAn class=\"invisible\">two<sup>2<sup>e</sup></sup>.</SpaN>three";
+    String gold = "onetwo2e.three";
+    Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+    int ch = 0;
+    StringBuilder builder = new StringBuilder();
+    try {
+      while ((ch = reader.read()) != -1){
+        builder.append((char)ch);
+      }
+    } finally {
+      // System.out.println("String: " + builder.toString());
+    }
+    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+                 gold, builder.toString());
+  }
+
+  public void testCDATA() throws Exception {
+    String test = "one<![CDATA[<one><two>three<four></four></two></one>]]>two";
+    String gold = "one<one><two>three<four></four></two></one>two";
+    Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+    int ch = 0;
+    StringBuilder builder = new StringBuilder();
+    try {
+      while ((ch = reader.read()) != -1){
+        builder.append((char)ch);
+      }
+    } finally {
+      // System.out.println("String: " + builder.toString());
+    }
+    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+                 gold, builder.toString());
+
+    test = "one<![CDATA[two<![CDATA[three]]]]><![CDATA[>four]]>five";
+    gold = "onetwo<![CDATA[three]]>fourfive";
+    reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+    ch = 0;
+    builder = new StringBuilder();
+    try {
+      while ((ch = reader.read()) != -1){
+        builder.append((char)ch);
+      }
+    } finally {
+      // System.out.println("String: " + builder.toString());
+    }
+    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+                 gold, builder.toString());
+  }
+
+  public void testUppercaseCharacterEntityVariants() throws Exception {
+    String test = " &QUOT;-&COPY;&GT;>&LT;<&REG;&AMP;";
+    String gold = " \"-\u00A9>><<\u00AE&";
+    Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+    int ch = 0;
+    StringBuilder builder = new StringBuilder();
+    try {
+      while ((ch = reader.read()) != -1){
+        builder.append((char)ch);
+      }
+    } finally {
+      // System.out.println("String: " + builder.toString());
+    }
+    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+                 gold, builder.toString());
+  }
+  
+  public void testMSWordMalformedProcessingInstruction() throws Exception {
+    String test = "one<?xml:namespace prefix = o ns = \"urn:schemas-microsoft-com:office:office\" />two";
+    String gold = "onetwo";
+    Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+    int ch = 0;
+    StringBuilder builder = new StringBuilder();
+    try {
+      while ((ch = reader.read()) != -1){
+        builder.append((char)ch);
+      }
+    } finally {
+      // System.out.println("String: " + builder.toString());
+    }
+    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+                 gold, builder.toString());
+  }
+
+  public void testSupplementaryCharsInTags() throws Exception {
+    String test = "one<𩬅艱鍟䇹愯瀛>two<瀛愯𩬅>three 瀛愯𩬅</瀛愯𩬅>four</𩬅艱鍟䇹愯瀛>five<𠀀𠀀>six<𠀀𠀀/>seven";
+    String gold = "one\ntwo\nthree 瀛愯𩬅\nfour\nfive\nsix\nseven";
+    Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+    int ch = 0;
+    StringBuilder builder = new StringBuilder();
+    try {
+      while ((ch = reader.read()) != -1){
+        builder.append((char)ch);
+      }
+    } finally {
+      // System.out.println("String: " + builder.toString());
+    }
+    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+        gold, builder.toString());
+  }
+
+  public void testRandomBrokenHTML() throws Exception {
+    int maxNumElements = 10000;
+    String text = _TestUtil.randomHtmlishString(random, maxNumElements);
+    Reader reader = new HTMLStripCharFilter
+        (CharReader.get(new StringReader(text)));
+    while (reader.read() != -1);
+  }
+
+  public void testRandomText() throws Exception {
+    StringBuilder text = new StringBuilder();
+    int minNumWords = 10;
+    int maxNumWords = 10000;
+    int minWordLength = 3;
+    int maxWordLength = 20;
+    int numWords = _TestUtil.nextInt(random, minNumWords, maxNumWords);
+    switch (_TestUtil.nextInt(random, 0, 4)) {
+      case 0: {
+        for (int wordNum = 0 ; wordNum < numWords ; ++wordNum) {
+          text.append(_TestUtil.randomUnicodeString(random, maxWordLength));
+          text.append(' ');
+        }
+        break;
+      }
+      case 1: {
+        for (int wordNum = 0 ; wordNum < numWords ; ++wordNum) {
+          text.append(_TestUtil.randomRealisticUnicodeString
+              (random, minWordLength, maxWordLength));
+          text.append(' ');
+        }
+        break;
+      }
+      default: { // plain ASCII for every remaining selector value
+        for (int wordNum = 0 ; wordNum < numWords ; ++wordNum) {
+          text.append(_TestUtil.randomSimpleString(random));
+          text.append(' ');
+        }
+      }
+    }
+    Reader reader = new HTMLStripCharFilter
+        (CharReader.get(new StringReader(text.toString())));
+    while (reader.read() != -1);
+  }
+
+  public void testUTF16Surrogates() throws Exception {
+    Analyzer analyzer = new ReusableAnalyzerBase() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, tokenizer);
+      }
+
+      @Override
+      protected Reader initReader(Reader reader) {
+        return new HTMLStripCharFilter(CharReader.get(new BufferedReader(reader)));
+      }
+    };
+    // Paired surrogates
+    assertAnalyzesTo(analyzer, " one two &#xD86C;&#XdC01;three",
+        new String[] { "one", "two", "\uD86C\uDC01three" } );
+    assertAnalyzesTo(analyzer, " &#55404;&#XdC01;", new String[] { "\uD86C\uDC01" } );
+    assertAnalyzesTo(analyzer, " &#xD86C;&#56321;", new String[] { "\uD86C\uDC01" } );
+    assertAnalyzesTo(analyzer, " &#55404;&#56321;", new String[] { "\uD86C\uDC01" } );
+
+    // Improperly paired surrogates
+    assertAnalyzesTo(analyzer, " &#55404;&#57999;", new String[] { "\uFFFD\uE28F" } );
+    assertAnalyzesTo(analyzer, " &#xD86C;&#57999;", new String[] { "\uFFFD\uE28F" } );
+    assertAnalyzesTo(analyzer, " &#55002;&#XdC01;", new String[] { "\uD6DA\uFFFD" } );
+    assertAnalyzesTo(analyzer, " &#55002;&#56321;", new String[] { "\uD6DA\uFFFD" } );
+
+    // Unpaired high surrogates
+    assertAnalyzesTo(analyzer, " &#Xd921;", new String[] { "\uFFFD" } );
+    assertAnalyzesTo(analyzer, " &#Xd921", new String[] { "\uFFFD" } );
+    assertAnalyzesTo(analyzer, " &#Xd921<br>", new String[] { "&#Xd921" } );
+    assertAnalyzesTo(analyzer, " &#55528;", new String[] { "\uFFFD" } );
+    assertAnalyzesTo(analyzer, " &#55528", new String[] { "\uFFFD" } );
+    assertAnalyzesTo(analyzer, " &#55528<br>", new String[] { "&#55528" } );
+
+    // Unpaired low surrogates
+    assertAnalyzesTo(analyzer, " &#xdfdb;", new String[] { "\uFFFD" } );
+    assertAnalyzesTo(analyzer, " &#xdfdb", new String[] { "\uFFFD" } );
+    assertAnalyzesTo(analyzer, " &#xdfdb<br>", new String[] { "&#xdfdb" } );
+    assertAnalyzesTo(analyzer, " &#57209;", new String[] { "\uFFFD" } );
+    assertAnalyzesTo(analyzer, " &#57209", new String[] { "\uFFFD" } );
+    assertAnalyzesTo(analyzer, " &#57209<br>", new String[] { "&#57209" } );
+  }
 }

Added: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/charfilter/MS-Word 14 generated.htm
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/charfilter/MS-Word%2014%20generated.htm?rev=1235308&view=auto
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/charfilter/MS-Word 14 generated.htm (added)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/charfilter/MS-Word 14 generated.htm Tue Jan 24 15:51:55 2012
@@ -0,0 +1,653 @@
+<html xmlns:v="urn:schemas-microsoft-com:vml"
+      xmlns:o="urn:schemas-microsoft-com:office:office"
+      xmlns:w="urn:schemas-microsoft-com:office:word"
+      xmlns:m="http://schemas.microsoft.com/office/2004/12/omml"
+      xmlns="http://www.w3.org/TR/REC-html40">
+
+<head>
+<meta http-equiv=Content-Type content="text/html; charset=windows-1252">
+<meta name=ProgId content=Word.Document>
+<meta name=Generator content="Microsoft Word 14">
+<meta name=Originator content="Microsoft Word 14">
+<link rel=File-List href="This%20is%20a%20test_files/filelist.xml">
+<!--[if gte mso 9]><xml>
+  <o:DocumentProperties>
+    <o:Author>s</o:Author>
+    <o:LastAuthor>s</o:LastAuthor>
+    <o:Revision>1</o:Revision>
+    <o:TotalTime>1</o:TotalTime>
+    <o:Created>2012-01-13T03:36:00Z</o:Created>
+    <o:LastSaved>2012-01-13T03:37:00Z</o:LastSaved>
+    <o:Pages>1</o:Pages>
+    <o:Words>8</o:Words>
+    <o:Characters>48</o:Characters>
+    <o:Lines>1</o:Lines>
+    <o:Paragraphs>1</o:Paragraphs>
+    <o:CharactersWithSpaces>55</o:CharactersWithSpaces>
+    <o:Version>14.00</o:Version>
+  </o:DocumentProperties>
+  <o:OfficeDocumentSettings>
+    <o:AllowPNG/>
+  </o:OfficeDocumentSettings>
+</xml><![endif]-->
+<link rel=themeData href="This%20is%20a%20test_files/themedata.thmx">
+<link rel=colorSchemeMapping
+      href="This%20is%20a%20test_files/colorschememapping.xml">
+<!--[if gte mso 9]><xml>
+  <w:WordDocument>
+    <w:SpellingState>Clean</w:SpellingState>
+    <w:GrammarState>Clean</w:GrammarState>
+    <w:TrackMoves>false</w:TrackMoves>
+    <w:TrackFormatting/>
+    <w:PunctuationKerning/>
+    <w:ValidateAgainstSchemas/>
+    <w:SaveIfXMLInvalid>false</w:SaveIfXMLInvalid>
+    <w:IgnoreMixedContent>false</w:IgnoreMixedContent>
+    <w:AlwaysShowPlaceholderText>false</w:AlwaysShowPlaceholderText>
+    <w:DoNotPromoteQF/>
+    <w:LidThemeOther>EN-US</w:LidThemeOther>
+    <w:LidThemeAsian>X-NONE</w:LidThemeAsian>
+    <w:LidThemeComplexScript>X-NONE</w:LidThemeComplexScript>
+    <w:Compatibility>
+      <w:BreakWrappedTables/>
+      <w:SnapToGridInCell/>
+      <w:WrapTextWithPunct/>
+      <w:UseAsianBreakRules/>
+      <w:DontGrowAutofit/>
+      <w:SplitPgBreakAndParaMark/>
+      <w:EnableOpenTypeKerning/>
+      <w:DontFlipMirrorIndents/>
+      <w:OverrideTableStyleHps/>
+    </w:Compatibility>
+    <m:mathPr>
+      <m:mathFont m:val="Cambria Math"/>
+      <m:brkBin m:val="before"/>
+      <m:brkBinSub m:val="&#45;-"/>
+      <m:smallFrac m:val="off"/>
+      <m:dispDef/>
+      <m:lMargin m:val="0"/>
+      <m:rMargin m:val="0"/>
+      <m:defJc m:val="centerGroup"/>
+      <m:wrapIndent m:val="1440"/>
+      <m:intLim m:val="subSup"/>
+      <m:naryLim m:val="undOvr"/>
+    </m:mathPr></w:WordDocument>
+</xml><![endif]--><!--[if gte mso 9]><xml>
+<w:LatentStyles DefLockedState="false" DefUnhideWhenUsed="true"
+                DefSemiHidden="true" DefQFormat="false" DefPriority="99"
+                LatentStyleCount="267">
+<w:LsdException Locked="false" Priority="0" SemiHidden="false"
+                UnhideWhenUsed="false" QFormat="true" Name="Normal"/>
+<w:LsdException Locked="false" Priority="9" SemiHidden="false"
+                UnhideWhenUsed="false" QFormat="true" Name="heading 1"/>
+<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 2"/>
+<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 3"/>
+<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 4"/>
+<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 5"/>
+<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 6"/>
+<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 7"/>
+<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 8"/>
+<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 9"/>
+<w:LsdException Locked="false" Priority="39" Name="toc 1"/>
+<w:LsdException Locked="false" Priority="39" Name="toc 2"/>
+<w:LsdException Locked="false" Priority="39" Name="toc 3"/>
+<w:LsdException Locked="false" Priority="39" Name="toc 4"/>
+<w:LsdException Locked="false" Priority="39" Name="toc 5"/>
+<w:LsdException Locked="false" Priority="39" Name="toc 6"/>
+<w:LsdException Locked="false" Priority="39" Name="toc 7"/>
+<w:LsdException Locked="false" Priority="39" Name="toc 8"/>
+<w:LsdException Locked="false" Priority="39" Name="toc 9"/>
+<w:LsdException Locked="false" Priority="35" QFormat="true" Name="caption"/>
+<w:LsdException Locked="false" Priority="10" SemiHidden="false"
+                UnhideWhenUsed="false" QFormat="true" Name="Title"/>
+<w:LsdException Locked="false" Priority="1" Name="Default Paragraph Font"/>
+<w:LsdException Locked="false" Priority="11" SemiHidden="false"
+                UnhideWhenUsed="false" QFormat="true" Name="Subtitle"/>
+<w:LsdException Locked="false" Priority="22" SemiHidden="false"
+                UnhideWhenUsed="false" QFormat="true" Name="Strong"/>
+<w:LsdException Locked="false" Priority="20" SemiHidden="false"
+                UnhideWhenUsed="false" QFormat="true" Name="Emphasis"/>
+<w:LsdException Locked="false" Priority="59" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Table Grid"/>
+<w:LsdException Locked="false" UnhideWhenUsed="false" Name="Placeholder Text"/>
+<w:LsdException Locked="false" Priority="1" SemiHidden="false"
+                UnhideWhenUsed="false" QFormat="true" Name="No Spacing"/>
+<w:LsdException Locked="false" Priority="60" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Light Shading"/>
+<w:LsdException Locked="false" Priority="61" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Light List"/>
+<w:LsdException Locked="false" Priority="62" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Light Grid"/>
+<w:LsdException Locked="false" Priority="63" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Shading 1"/>
+<w:LsdException Locked="false" Priority="64" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Shading 2"/>
+<w:LsdException Locked="false" Priority="65" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium List 1"/>
+<w:LsdException Locked="false" Priority="66" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium List 2"/>
+<w:LsdException Locked="false" Priority="67" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Grid 1"/>
+<w:LsdException Locked="false" Priority="68" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Grid 2"/>
+<w:LsdException Locked="false" Priority="69" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Grid 3"/>
+<w:LsdException Locked="false" Priority="70" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Dark List"/>
+<w:LsdException Locked="false" Priority="71" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Colorful Shading"/>
+<w:LsdException Locked="false" Priority="72" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Colorful List"/>
+<w:LsdException Locked="false" Priority="73" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Colorful Grid"/>
+<w:LsdException Locked="false" Priority="60" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Light Shading Accent 1"/>
+<w:LsdException Locked="false" Priority="61" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Light List Accent 1"/>
+<w:LsdException Locked="false" Priority="62" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Light Grid Accent 1"/>
+<w:LsdException Locked="false" Priority="63" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Shading 1 Accent 1"/>
+<w:LsdException Locked="false" Priority="64" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Shading 2 Accent 1"/>
+<w:LsdException Locked="false" Priority="65" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium List 1 Accent 1"/>
+<w:LsdException Locked="false" UnhideWhenUsed="false" Name="Revision"/>
+<w:LsdException Locked="false" Priority="34" SemiHidden="false"
+                UnhideWhenUsed="false" QFormat="true" Name="List Paragraph"/>
+<w:LsdException Locked="false" Priority="29" SemiHidden="false"
+                UnhideWhenUsed="false" QFormat="true" Name="Quote"/>
+<w:LsdException Locked="false" Priority="30" SemiHidden="false"
+                UnhideWhenUsed="false" QFormat="true" Name="Intense Quote"/>
+<w:LsdException Locked="false" Priority="66" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium List 2 Accent 1"/>
+<w:LsdException Locked="false" Priority="67" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Grid 1 Accent 1"/>
+<w:LsdException Locked="false" Priority="68" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Grid 2 Accent 1"/>
+<w:LsdException Locked="false" Priority="69" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Grid 3 Accent 1"/>
+<w:LsdException Locked="false" Priority="70" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Dark List Accent 1"/>
+<w:LsdException Locked="false" Priority="71" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Colorful Shading Accent 1"/>
+<w:LsdException Locked="false" Priority="72" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Colorful List Accent 1"/>
+<w:LsdException Locked="false" Priority="73" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Colorful Grid Accent 1"/>
+<w:LsdException Locked="false" Priority="60" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Light Shading Accent 2"/>
+<w:LsdException Locked="false" Priority="61" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Light List Accent 2"/>
+<w:LsdException Locked="false" Priority="62" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Light Grid Accent 2"/>
+<w:LsdException Locked="false" Priority="63" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Shading 1 Accent 2"/>
+<w:LsdException Locked="false" Priority="64" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Shading 2 Accent 2"/>
+<w:LsdException Locked="false" Priority="65" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium List 1 Accent 2"/>
+<w:LsdException Locked="false" Priority="66" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium List 2 Accent 2"/>
+<w:LsdException Locked="false" Priority="67" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Grid 1 Accent 2"/>
+<w:LsdException Locked="false" Priority="68" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Grid 2 Accent 2"/>
+<w:LsdException Locked="false" Priority="69" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Grid 3 Accent 2"/>
+<w:LsdException Locked="false" Priority="70" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Dark List Accent 2"/>
+<w:LsdException Locked="false" Priority="71" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Colorful Shading Accent 2"/>
+<w:LsdException Locked="false" Priority="72" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Colorful List Accent 2"/>
+<w:LsdException Locked="false" Priority="73" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Colorful Grid Accent 2"/>
+<w:LsdException Locked="false" Priority="60" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Light Shading Accent 3"/>
+<w:LsdException Locked="false" Priority="61" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Light List Accent 3"/>
+<w:LsdException Locked="false" Priority="62" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Light Grid Accent 3"/>
+<w:LsdException Locked="false" Priority="63" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Shading 1 Accent 3"/>
+<w:LsdException Locked="false" Priority="64" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Shading 2 Accent 3"/>
+<w:LsdException Locked="false" Priority="65" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium List 1 Accent 3"/>
+<w:LsdException Locked="false" Priority="66" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium List 2 Accent 3"/>
+<w:LsdException Locked="false" Priority="67" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Grid 1 Accent 3"/>
+<w:LsdException Locked="false" Priority="68" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Grid 2 Accent 3"/>
+<w:LsdException Locked="false" Priority="69" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Grid 3 Accent 3"/>
+<w:LsdException Locked="false" Priority="70" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Dark List Accent 3"/>
+<w:LsdException Locked="false" Priority="71" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Colorful Shading Accent 3"/>
+<w:LsdException Locked="false" Priority="72" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Colorful List Accent 3"/>
+<w:LsdException Locked="false" Priority="73" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Colorful Grid Accent 3"/>
+<w:LsdException Locked="false" Priority="60" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Light Shading Accent 4"/>
+<w:LsdException Locked="false" Priority="61" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Light List Accent 4"/>
+<w:LsdException Locked="false" Priority="62" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Light Grid Accent 4"/>
+<w:LsdException Locked="false" Priority="63" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Shading 1 Accent 4"/>
+<w:LsdException Locked="false" Priority="64" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Shading 2 Accent 4"/>
+<w:LsdException Locked="false" Priority="65" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium List 1 Accent 4"/>
+<w:LsdException Locked="false" Priority="66" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium List 2 Accent 4"/>
+<w:LsdException Locked="false" Priority="67" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Grid 1 Accent 4"/>
+<w:LsdException Locked="false" Priority="68" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Grid 2 Accent 4"/>
+<w:LsdException Locked="false" Priority="69" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Grid 3 Accent 4"/>
+<w:LsdException Locked="false" Priority="70" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Dark List Accent 4"/>
+<w:LsdException Locked="false" Priority="71" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Colorful Shading Accent 4"/>
+<w:LsdException Locked="false" Priority="72" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Colorful List Accent 4"/>
+<w:LsdException Locked="false" Priority="73" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Colorful Grid Accent 4"/>
+<w:LsdException Locked="false" Priority="60" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Light Shading Accent 5"/>
+<w:LsdException Locked="false" Priority="61" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Light List Accent 5"/>
+<w:LsdException Locked="false" Priority="62" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Light Grid Accent 5"/>
+<w:LsdException Locked="false" Priority="63" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Shading 1 Accent 5"/>
+<w:LsdException Locked="false" Priority="64" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Shading 2 Accent 5"/>
+<w:LsdException Locked="false" Priority="65" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium List 1 Accent 5"/>
+<w:LsdException Locked="false" Priority="66" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium List 2 Accent 5"/>
+<w:LsdException Locked="false" Priority="67" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Grid 1 Accent 5"/>
+<w:LsdException Locked="false" Priority="68" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Grid 2 Accent 5"/>
+<w:LsdException Locked="false" Priority="69" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Grid 3 Accent 5"/>
+<w:LsdException Locked="false" Priority="70" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Dark List Accent 5"/>
+<w:LsdException Locked="false" Priority="71" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Colorful Shading Accent 5"/>
+<w:LsdException Locked="false" Priority="72" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Colorful List Accent 5"/>
+<w:LsdException Locked="false" Priority="73" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Colorful Grid Accent 5"/>
+<w:LsdException Locked="false" Priority="60" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Light Shading Accent 6"/>
+<w:LsdException Locked="false" Priority="61" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Light List Accent 6"/>
+<w:LsdException Locked="false" Priority="62" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Light Grid Accent 6"/>
+<w:LsdException Locked="false" Priority="63" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Shading 1 Accent 6"/>
+<w:LsdException Locked="false" Priority="64" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Shading 2 Accent 6"/>
+<w:LsdException Locked="false" Priority="65" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium List 1 Accent 6"/>
+<w:LsdException Locked="false" Priority="66" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium List 2 Accent 6"/>
+<w:LsdException Locked="false" Priority="67" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Grid 1 Accent 6"/>
+<w:LsdException Locked="false" Priority="68" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Grid 2 Accent 6"/>
+<w:LsdException Locked="false" Priority="69" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Grid 3 Accent 6"/>
+<w:LsdException Locked="false" Priority="70" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Dark List Accent 6"/>
+<w:LsdException Locked="false" Priority="71" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Colorful Shading Accent 6"/>
+<w:LsdException Locked="false" Priority="72" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Colorful List Accent 6"/>
+<w:LsdException Locked="false" Priority="73" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Colorful Grid Accent 6"/>
+<w:LsdException Locked="false" Priority="19" SemiHidden="false"
+                UnhideWhenUsed="false" QFormat="true" Name="Subtle Emphasis"/>
+<w:LsdException Locked="false" Priority="21" SemiHidden="false"
+                UnhideWhenUsed="false" QFormat="true" Name="Intense Emphasis"/>
+<w:LsdException Locked="false" Priority="31" SemiHidden="false"
+                UnhideWhenUsed="false" QFormat="true" Name="Subtle Reference"/>
+<w:LsdException Locked="false" Priority="32" SemiHidden="false"
+                UnhideWhenUsed="false" QFormat="true" Name="Intense Reference"/>
+<w:LsdException Locked="false" Priority="33" SemiHidden="false"
+                UnhideWhenUsed="false" QFormat="true" Name="Book Title"/>
+<w:LsdException Locked="false" Priority="37" Name="Bibliography"/>
+<w:LsdException Locked="false" Priority="39" QFormat="true" Name="TOC Heading"/>
+</w:LatentStyles>
+</xml><![endif]-->
+<style>
+<!--
+  /* Font Definitions */
+@font-face
+{font-family:"Cambria Math";
+  panose-1:2 4 5 3 5 4 6 3 2 4;
+  mso-font-charset:1;
+  mso-generic-font-family:roman;
+  mso-font-format:other;
+  mso-font-pitch:variable;
+  mso-font-signature:0 0 0 0 0 0;}
+@font-face
+{font-family:Cambria;
+  panose-1:2 4 5 3 5 4 6 3 2 4;
+  mso-font-charset:0;
+  mso-generic-font-family:roman;
+  mso-font-pitch:variable;
+  mso-font-signature:-536870145 1073743103 0 0 415 0;}
+@font-face
+{font-family:Calibri;
+  panose-1:2 15 5 2 2 2 4 3 2 4;
+  mso-font-charset:0;
+  mso-generic-font-family:swiss;
+  mso-font-pitch:variable;
+  mso-font-signature:-520092929 1073786111 9 0 415 0;}
+  /* Style Definitions */
+p.MsoNormal, li.MsoNormal, div.MsoNormal
+{mso-style-unhide:no;
+  mso-style-qformat:yes;
+  mso-style-parent:"";
+  margin-top:0in;
+  margin-right:0in;
+  margin-bottom:10.0pt;
+  margin-left:0in;
+  line-height:115%;
+  mso-pagination:widow-orphan;
+  font-size:11.0pt;
+  font-family:"Calibri","sans-serif";
+  mso-ascii-font-family:Calibri;
+  mso-ascii-theme-font:minor-latin;
+  mso-fareast-font-family:Calibri;
+  mso-fareast-theme-font:minor-latin;
+  mso-hansi-font-family:Calibri;
+  mso-hansi-theme-font:minor-latin;
+  mso-bidi-font-family:"Times New Roman";
+  mso-bidi-theme-font:minor-bidi;}
+h1
+{mso-style-priority:9;
+  mso-style-unhide:no;
+  mso-style-qformat:yes;
+  mso-style-link:"Heading 1 Char";
+  mso-style-next:Normal;
+  margin-top:24.0pt;
+  margin-right:0in;
+  margin-bottom:0in;
+  margin-left:0in;
+  margin-bottom:.0001pt;
+  line-height:115%;
+  mso-pagination:widow-orphan lines-together;
+  page-break-after:avoid;
+  mso-outline-level:1;
+  font-size:14.0pt;
+  font-family:"Cambria","serif";
+  mso-ascii-font-family:Cambria;
+  mso-ascii-theme-font:major-latin;
+  mso-fareast-font-family:"Times New Roman";
+  mso-fareast-theme-font:major-fareast;
+  mso-hansi-font-family:Cambria;
+  mso-hansi-theme-font:major-latin;
+  mso-bidi-font-family:"Times New Roman";
+  mso-bidi-theme-font:major-bidi;
+  color:#365F91;
+  mso-themecolor:accent1;
+  mso-themeshade:191;
+  mso-font-kerning:0pt;}
+p.MsoTitle, li.MsoTitle, div.MsoTitle
+{mso-style-priority:10;
+  mso-style-unhide:no;
+  mso-style-qformat:yes;
+  mso-style-link:"Title Char";
+  mso-style-next:Normal;
+  margin-top:0in;
+  margin-right:0in;
+  margin-bottom:15.0pt;
+  margin-left:0in;
+  mso-add-space:auto;
+  mso-pagination:widow-orphan;
+  border:none;
+  mso-border-bottom-alt:solid #4F81BD 1.0pt;
+  mso-border-bottom-themecolor:accent1;
+  padding:0in;
+  mso-padding-alt:0in 0in 4.0pt 0in;
+  font-size:26.0pt;
+  font-family:"Cambria","serif";
+  mso-ascii-font-family:Cambria;
+  mso-ascii-theme-font:major-latin;
+  mso-fareast-font-family:"Times New Roman";
+  mso-fareast-theme-font:major-fareast;
+  mso-hansi-font-family:Cambria;
+  mso-hansi-theme-font:major-latin;
+  mso-bidi-font-family:"Times New Roman";
+  mso-bidi-theme-font:major-bidi;
+  color:#17365D;
+  mso-themecolor:text2;
+  mso-themeshade:191;
+  letter-spacing:.25pt;
+  mso-font-kerning:14.0pt;}
+p.MsoTitleCxSpFirst, li.MsoTitleCxSpFirst, div.MsoTitleCxSpFirst
+{mso-style-priority:10;
+  mso-style-unhide:no;
+  mso-style-qformat:yes;
+  mso-style-link:"Title Char";
+  mso-style-next:Normal;
+  mso-style-type:export-only;
+  margin:0in;
+  margin-bottom:.0001pt;
+  mso-add-space:auto;
+  mso-pagination:widow-orphan;
+  border:none;
+  mso-border-bottom-alt:solid #4F81BD 1.0pt;
+  mso-border-bottom-themecolor:accent1;
+  padding:0in;
+  mso-padding-alt:0in 0in 4.0pt 0in;
+  font-size:26.0pt;
+  font-family:"Cambria","serif";
+  mso-ascii-font-family:Cambria;
+  mso-ascii-theme-font:major-latin;
+  mso-fareast-font-family:"Times New Roman";
+  mso-fareast-theme-font:major-fareast;
+  mso-hansi-font-family:Cambria;
+  mso-hansi-theme-font:major-latin;
+  mso-bidi-font-family:"Times New Roman";
+  mso-bidi-theme-font:major-bidi;
+  color:#17365D;
+  mso-themecolor:text2;
+  mso-themeshade:191;
+  letter-spacing:.25pt;
+  mso-font-kerning:14.0pt;}
+p.MsoTitleCxSpMiddle, li.MsoTitleCxSpMiddle, div.MsoTitleCxSpMiddle
+{mso-style-priority:10;
+  mso-style-unhide:no;
+  mso-style-qformat:yes;
+  mso-style-link:"Title Char";
+  mso-style-next:Normal;
+  mso-style-type:export-only;
+  margin:0in;
+  margin-bottom:.0001pt;
+  mso-add-space:auto;
+  mso-pagination:widow-orphan;
+  border:none;
+  mso-border-bottom-alt:solid #4F81BD 1.0pt;
+  mso-border-bottom-themecolor:accent1;
+  padding:0in;
+  mso-padding-alt:0in 0in 4.0pt 0in;
+  font-size:26.0pt;
+  font-family:"Cambria","serif";
+  mso-ascii-font-family:Cambria;
+  mso-ascii-theme-font:major-latin;
+  mso-fareast-font-family:"Times New Roman";
+  mso-fareast-theme-font:major-fareast;
+  mso-hansi-font-family:Cambria;
+  mso-hansi-theme-font:major-latin;
+  mso-bidi-font-family:"Times New Roman";
+  mso-bidi-theme-font:major-bidi;
+  color:#17365D;
+  mso-themecolor:text2;
+  mso-themeshade:191;
+  letter-spacing:.25pt;
+  mso-font-kerning:14.0pt;}
+p.MsoTitleCxSpLast, li.MsoTitleCxSpLast, div.MsoTitleCxSpLast
+{mso-style-priority:10;
+  mso-style-unhide:no;
+  mso-style-qformat:yes;
+  mso-style-link:"Title Char";
+  mso-style-next:Normal;
+  mso-style-type:export-only;
+  margin-top:0in;
+  margin-right:0in;
+  margin-bottom:15.0pt;
+  margin-left:0in;
+  mso-add-space:auto;
+  mso-pagination:widow-orphan;
+  border:none;
+  mso-border-bottom-alt:solid #4F81BD 1.0pt;
+  mso-border-bottom-themecolor:accent1;
+  padding:0in;
+  mso-padding-alt:0in 0in 4.0pt 0in;
+  font-size:26.0pt;
+  font-family:"Cambria","serif";
+  mso-ascii-font-family:Cambria;
+  mso-ascii-theme-font:major-latin;
+  mso-fareast-font-family:"Times New Roman";
+  mso-fareast-theme-font:major-fareast;
+  mso-hansi-font-family:Cambria;
+  mso-hansi-theme-font:major-latin;
+  mso-bidi-font-family:"Times New Roman";
+  mso-bidi-theme-font:major-bidi;
+  color:#17365D;
+  mso-themecolor:text2;
+  mso-themeshade:191;
+  letter-spacing:.25pt;
+  mso-font-kerning:14.0pt;}
+span.TitleChar
+{mso-style-name:"Title Char";
+  mso-style-priority:10;
+  mso-style-unhide:no;
+  mso-style-locked:yes;
+  mso-style-link:Title;
+  mso-ansi-font-size:26.0pt;
+  mso-bidi-font-size:26.0pt;
+  font-family:"Cambria","serif";
+  mso-ascii-font-family:Cambria;
+  mso-ascii-theme-font:major-latin;
+  mso-fareast-font-family:"Times New Roman";
+  mso-fareast-theme-font:major-fareast;
+  mso-hansi-font-family:Cambria;
+  mso-hansi-theme-font:major-latin;
+  mso-bidi-font-family:"Times New Roman";
+  mso-bidi-theme-font:major-bidi;
+  color:#17365D;
+  mso-themecolor:text2;
+  mso-themeshade:191;
+  letter-spacing:.25pt;
+  mso-font-kerning:14.0pt;}
+span.Heading1Char
+{mso-style-name:"Heading 1 Char";
+  mso-style-priority:9;
+  mso-style-unhide:no;
+  mso-style-locked:yes;
+  mso-style-link:"Heading 1";
+  mso-ansi-font-size:14.0pt;
+  mso-bidi-font-size:14.0pt;
+  font-family:"Cambria","serif";
+  mso-ascii-font-family:Cambria;
+  mso-ascii-theme-font:major-latin;
+  mso-fareast-font-family:"Times New Roman";
+  mso-fareast-theme-font:major-fareast;
+  mso-hansi-font-family:Cambria;
+  mso-hansi-theme-font:major-latin;
+  mso-bidi-font-family:"Times New Roman";
+  mso-bidi-theme-font:major-bidi;
+  color:#365F91;
+  mso-themecolor:accent1;
+  mso-themeshade:191;
+  font-weight:bold;}
+.MsoChpDefault
+{mso-style-type:export-only;
+  mso-default-props:yes;
+  font-family:"Calibri","sans-serif";
+  mso-ascii-font-family:Calibri;
+  mso-ascii-theme-font:minor-latin;
+  mso-fareast-font-family:Calibri;
+  mso-fareast-theme-font:minor-latin;
+  mso-hansi-font-family:Calibri;
+  mso-hansi-theme-font:minor-latin;
+  mso-bidi-font-family:"Times New Roman";
+  mso-bidi-theme-font:minor-bidi;}
+.MsoPapDefault
+{mso-style-type:export-only;
+  margin-bottom:10.0pt;
+  line-height:115%;}
+@page WordSection1
+{size:8.5in 11.0in;
+  margin:1.0in 1.0in 1.0in 1.0in;
+  mso-header-margin:.5in;
+  mso-footer-margin:.5in;
+  mso-paper-source:0;}
+div.WordSection1
+{page:WordSection1;}
+-->
+</style>
+<!--[if gte mso 10]>
+<style>
+    /* Style Definitions */
+  table.MsoNormalTable
+  {mso-style-name:"Table Normal";
+    mso-tstyle-rowband-size:0;
+    mso-tstyle-colband-size:0;
+    mso-style-noshow:yes;
+    mso-style-priority:99;
+    mso-style-parent:"";
+    mso-padding-alt:0in 5.4pt 0in 5.4pt;
+    mso-para-margin-top:0in;
+    mso-para-margin-right:0in;
+    mso-para-margin-bottom:10.0pt;
+    mso-para-margin-left:0in;
+    line-height:115%;
+    mso-pagination:widow-orphan;
+    font-size:11.0pt;
+    font-family:"Calibri","sans-serif";
+    mso-ascii-font-family:Calibri;
+    mso-ascii-theme-font:minor-latin;
+    mso-hansi-font-family:Calibri;
+    mso-hansi-theme-font:minor-latin;
+    mso-bidi-font-family:"Times New Roman";
+    mso-bidi-theme-font:minor-bidi;}
+</style>
+<![endif]--><!--[if gte mso 9]><xml>
+  <o:shapedefaults v:ext="edit" spidmax="1026"/>
+</xml><![endif]--><!--[if gte mso 9]><xml>
+  <o:shapelayout v:ext="edit">
+    <o:idmap v:ext="edit" data="1"/>
+  </o:shapelayout></xml><![endif]-->
+</head>
+
+<body lang=EN-US style='tab-interval:.5in'>
+
+<div class=WordSection1>
+
+  <div style='mso-element:para-border-div;border:none;border-bottom:solid #4F81BD 1.0pt;
+mso-border-bottom-themecolor:accent1;padding:0in 0in 4.0pt 0in'>
+
+    <p class=MsoTitle>This is a test</p>
+
+  </div>
+
+</div>
+
+</body>
+
+</html>
+

Modified: lucene/dev/branches/branch_3x/lucene/contrib/icu/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/icu/build.xml?rev=1235308&r1=1235307&r2=1235308&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/icu/build.xml (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/icu/build.xml Tue Jan 24 15:51:55 2012
@@ -103,7 +103,24 @@ are part of the ICU4C package. See http:
       </assertions>
     </java>
   </target>
-  
+
+  <property name="html.strip.charfilter.supp.macros.output.file"
+            location="../analyzers/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro"/>
+
+  <target name="gen-html-strip-charfilter-supp-macros" depends="compile-tools">
+    <java
+        classname="org.apache.lucene.analysis.icu.GenerateHTMLStripCharFilterSupplementaryMacros"
+        dir="."
+        fork="true"
+        failonerror="true"
+        output="${html.strip.charfilter.supp.macros.output.file}">
+      <classpath>
+        <path refid="additional.dependencies"/>
+        <pathelement location="${build.dir}/classes/tools"/>
+      </classpath>
+    </java>
+  </target>
+
   <target name="compile-tools" depends="common.compile-tools">
     <compile
       srcdir="src/tools/java"

Added: lucene/dev/branches/branch_3x/lucene/contrib/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateHTMLStripCharFilterSupplementaryMacros.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateHTMLStripCharFilterSupplementaryMacros.java?rev=1235308&view=auto
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateHTMLStripCharFilterSupplementaryMacros.java (added)
+++ lucene/dev/branches/branch_3x/lucene/contrib/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateHTMLStripCharFilterSupplementaryMacros.java Tue Jan 24 15:51:55 2012
@@ -0,0 +1,110 @@
+package org.apache.lucene.analysis.icu;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.text.DateFormat;
+import java.util.*;
+
+import com.ibm.icu.text.UnicodeSet;
+import com.ibm.icu.text.UnicodeSetIterator;
+import com.ibm.icu.util.VersionInfo;
+
+/** creates a macro to augment jflex's unicode support for > BMP */
+public class GenerateHTMLStripCharFilterSupplementaryMacros {
+  private static final UnicodeSet BMP = new UnicodeSet("[\u0000-\uFFFF]");
+  private static final String NL = System.getProperty("line.separator");
+  private static final DateFormat DATE_FORMAT = DateFormat.getDateTimeInstance
+      (DateFormat.FULL, DateFormat.FULL, Locale.US);
+  static {
+    DATE_FORMAT.setTimeZone(TimeZone.getTimeZone("UTC"));
+  }
+
+  private static final String APACHE_LICENSE
+      = "/*" + NL
+      + " * Copyright 2010 The Apache Software Foundation." + NL
+      + " *" + NL
+      + " * Licensed under the Apache License, Version 2.0 (the \"License\");" + NL
+      + " * you may not use this file except in compliance with the License." + NL
+      + " * You may obtain a copy of the License at" + NL
+      + " *" + NL
+      + " *      http://www.apache.org/licenses/LICENSE-2.0" + NL
+      + " *" + NL
+      + " * Unless required by applicable law or agreed to in writing, software" + NL
+      + " * distributed under the License is distributed on an \"AS IS\" BASIS," + NL
+      + " * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." + NL
+      + " * See the License for the specific language governing permissions and" + NL
+      + " * limitations under the License." + NL
+      + " */" + NL + NL;
+
+
+  public static void main(String args[]) throws Exception {
+    outputHeader();
+    outputMacro("ID_Start_Supp", "[:ID_Start:]");
+    outputMacro("ID_Continue_Supp", "[:ID_Continue:]");
+  }
+
+  static void outputHeader() {
+    System.out.print(APACHE_LICENSE);
+    System.out.print("// Generated using ICU4J " + VersionInfo.ICU_VERSION.toString() + " on ");
+    System.out.println(DATE_FORMAT.format(new Date()));
+    System.out.println("// by " + GenerateHTMLStripCharFilterSupplementaryMacros.class.getName());
+    System.out.print(NL + NL);
+  }
+
+  // Emits "name = ( leads trails | ... )" for the supplementary code points matching pattern,
+  // grouped into compact UTF-16 surrogate range expressions (an expanded form makes jflex OOM).
+  static void outputMacro(String name, String pattern) {
+    UnicodeSet set = new UnicodeSet(pattern);
+    set.removeAll(BMP);
+    System.out.println(name + " = (");
+    // if the set is empty, we have to do this or jflex will barf
+    if (set.isEmpty()) {
+      System.out.println("\t  []");
+    }
+
+    Map<Character,UnicodeSet> utf16ByLead = new TreeMap<Character,UnicodeSet>(); // sorted => stable output
+    for (UnicodeSetIterator it = new UnicodeSetIterator(set); it.next();) {
+      char utf16[] = Character.toChars(it.codepoint);
+      UnicodeSet trails = utf16ByLead.get(utf16[0]);
+      if (trails == null) {
+        trails = new UnicodeSet();
+        utf16ByLead.put(utf16[0], trails);
+      }
+      trails.add(utf16[1]);
+    }
+
+    Map<String,UnicodeSet> utf16ByTrail = new TreeMap<String,UnicodeSet>(); // merge leads sharing a trail set
+    for (Map.Entry<Character,UnicodeSet> entry : utf16ByLead.entrySet()) {
+      String trail = entry.getValue().getRegexEquivalent();
+      UnicodeSet leads = utf16ByTrail.get(trail);
+      if (leads == null) {
+        leads = new UnicodeSet();
+        utf16ByTrail.put(trail, leads);
+      }
+      leads.add(entry.getKey());
+    }
+
+    boolean isFirst = true;
+    for (Map.Entry<String,UnicodeSet> entry : utf16ByTrail.entrySet()) {
+      System.out.print( isFirst ? "\t  " : "\t| ");
+      isFirst = false;
+      System.out.println(entry.getValue().getRegexEquivalent() + entry.getKey());
+    }
+    System.out.println(")");
+  }
+}

Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/BaseCharFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/BaseCharFilter.java?rev=1235308&r1=1235307&r2=1235308&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/BaseCharFilter.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/BaseCharFilter.java Tue Jan 24 15:51:55 2012
@@ -19,11 +19,46 @@ package org.apache.lucene.analysis;
 
 import org.apache.lucene.util.ArrayUtil;
 
+import java.util.Arrays;
+
 /**
- * Base utility class for implementing a {@link CharFilter}.
- * You subclass this, and then record mappings by calling
- * {@link #addOffCorrectMap}, and then invoke the correct
- * method to correct an offset.
+ * <p>
+ *   Base utility class for implementing a {@link CharFilter}.
+ *   You subclass this, and then record mappings by calling
+ *   {@link #addOffCorrectMap}, and then invoke the correct
+ *   method to correct an offset.
+ * </p>
+ * <p>
+ *   CharFilters modify an input stream via a series of substring
+ *   replacements (including deletions and insertions) to produce an output
+ *   stream. There are three possible replacement cases: the replacement
+ *   string has the same length as the original substring; the replacement
+ *   is shorter; and the replacement is longer. In the latter two cases
+ *   (when the replacement has a different length than the original),
+ *   one or more offset correction mappings are required.
+ * </p>
+ * <p>
+ *   When the replacement is shorter than the original (e.g. when the
+ *   replacement is the empty string), a single offset correction mapping
+ *   should be added at the replacement's end offset in the output stream.
+ *   The <code>cumulativeDiff</code> parameter to the
+ *   <code>addOffCorrectMap()</code> method will be the sum of all
+ *   previous replacement offset adjustments, with the addition of the
+ *   difference between the lengths of the original substring and the
+ *   replacement string (a positive value).
+ * </p>
+ * <p>
+ *   When the replacement is longer than the original (e.g. when the
+ *   original is the empty string), you should add as many offset
+ *   correction mappings as the difference between the lengths of the
+ *   replacement string and the original substring, starting at the
+ *   end offset the original substring would have had in the output stream.
+ *   The <code>cumulativeDiff</code> parameter to the
+ *   <code>addOffCorrectMap()</code> method will be the sum of all
+ *   previous replacement offset adjustments, with the addition of the
+ *   difference between the lengths of the original substring and the
+ *   replacement string so far (a negative value).
+ * </p>
  */
 public abstract class BaseCharFilter extends CharFilter {
 
@@ -70,6 +105,19 @@ public abstract class BaseCharFilter ext
       0 : diffs[size-1];
   }
 
+  /**
+   * <p>
+   *   Adds an offset correction mapping at the given output stream offset.
+   * </p>
+   * <p>
+   *   Assumption: the offset given with each successive call to this method
+   *   will not be smaller than the offset given at the previous invocation.
+   * </p>
+   *
+   * @param off The output stream offset at which to apply the correction
+   * @param cumulativeDiff The input offset is given by adding this
+   *                       to the output offset
+   */
   protected void addOffCorrectMap(int off, int cumulativeDiff) {
     if (offsets == null) {
       offsets = new int[64];
@@ -79,7 +127,15 @@ public abstract class BaseCharFilter ext
       diffs = ArrayUtil.grow(diffs);
     }
     
-    offsets[size] = off;
-    diffs[size++] = cumulativeDiff; 
+    assert (size == 0 || off >= offsets[size - 1])
+        : "Offset #" + size + "(" + off + ") is less than the last recorded offset "
+          + offsets[size - 1] + "\n" + Arrays.toString(offsets) + "\n" + Arrays.toString(diffs);
+    
+    if (size == 0 || off != offsets[size - 1]) {
+      offsets[size] = off;
+      diffs[size++] = cumulativeDiff;
+    } else { // Overwrite the diff at the last recorded offset
+      diffs[size - 1] = cumulativeDiff;
+    }
   }
 }

Modified: lucene/dev/branches/branch_3x/lucene/src/test-framework/java/org/apache/lucene/util/_TestUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/test-framework/java/org/apache/lucene/util/_TestUtil.java?rev=1235308&r1=1235307&r2=1235308&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/test-framework/java/org/apache/lucene/util/_TestUtil.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/test-framework/java/org/apache/lucene/util/_TestUtil.java Tue Jan 24 15:51:55 2012
@@ -266,7 +266,42 @@ public class _TestUtil {
     }
   }
   
-  // TODO: make this more evil
+  private static final String[] HTML_CHAR_ENTITIES = {
+      "AElig", "Aacute", "Acirc", "Agrave", "Alpha", "AMP", "Aring", "Atilde",
+      "Auml", "Beta", "COPY", "Ccedil", "Chi", "Dagger", "Delta", "ETH",
+      "Eacute", "Ecirc", "Egrave", "Epsilon", "Eta", "Euml", "Gamma", "GT",
+      "Iacute", "Icirc", "Igrave", "Iota", "Iuml", "Kappa", "Lambda", "LT",
+      "Mu", "Ntilde", "Nu", "OElig", "Oacute", "Ocirc", "Ograve", "Omega",
+      "Omicron", "Oslash", "Otilde", "Ouml", "Phi", "Pi", "Prime", "Psi",
+      "QUOT", "REG", "Rho", "Scaron", "Sigma", "THORN", "Tau", "Theta",
+      "Uacute", "Ucirc", "Ugrave", "Upsilon", "Uuml", "Xi", "Yacute", "Yuml",
+      "Zeta", "aacute", "acirc", "acute", "aelig", "agrave", "alefsym",
+      "alpha", "amp", "and", "ang", "apos", "aring", "asymp", "atilde",
+      "auml", "bdquo", "beta", "brvbar", "bull", "cap", "ccedil", "cedil",
+      "cent", "chi", "circ", "clubs", "cong", "copy", "crarr", "cup",
+      "curren", "dArr", "dagger", "darr", "deg", "delta", "diams", "divide",
+      "eacute", "ecirc", "egrave", "empty", "emsp", "ensp", "epsilon",
+      "equiv", "eta", "eth", "euml", "euro", "exist", "fnof", "forall",
+      "frac12", "frac14", "frac34", "frasl", "gamma", "ge", "gt", "hArr",
+      "harr", "hearts", "hellip", "iacute", "icirc", "iexcl", "igrave",
+      "image", "infin", "int", "iota", "iquest", "isin", "iuml", "kappa",
+      "lArr", "lambda", "lang", "laquo", "larr", "lceil", "ldquo", "le",
+      "lfloor", "lowast", "loz", "lrm", "lsaquo", "lsquo", "lt", "macr",
+      "mdash", "micro", "middot", "minus", "mu", "nabla", "nbsp", "ndash",
+      "ne", "ni", "not", "notin", "nsub", "ntilde", "nu", "oacute", "ocirc",
+      "oelig", "ograve", "oline", "omega", "omicron", "oplus", "or", "ordf",
+      "ordm", "oslash", "otilde", "otimes", "ouml", "para", "part", "permil",
+      "perp", "phi", "pi", "piv", "plusmn", "pound", "prime", "prod", "prop",
+      "psi", "quot", "rArr", "radic", "rang", "raquo", "rarr", "rceil",
+      "rdquo", "real", "reg", "rfloor", "rho", "rlm", "rsaquo", "rsquo",
+      "sbquo", "scaron", "sdot", "sect", "shy", "sigma", "sigmaf", "sim",
+      "spades", "sub", "sube", "sum", "sup", "sup1", "sup2", "sup3", "supe",
+      "szlig", "tau", "there4", "theta", "thetasym", "thinsp", "thorn",
+      "tilde", "times", "trade", "uArr", "uacute", "uarr", "ucirc", "ugrave",
+      "uml", "upsih", "upsilon", "uuml", "weierp", "xi", "yacute", "yen",
+      "yuml", "zeta", "zwj", "zwnj"
+  };
+  
   public static String randomHtmlishString(Random random, int numElements) {
     final int end = random.nextInt(numElements);
     if (end == 0) {
@@ -275,17 +310,80 @@ public class _TestUtil {
     }
     StringBuilder sb = new StringBuilder();
     for (int i = 0; i < end; i++) {
-      int val = random.nextInt(10);
+      int val = random.nextInt(25);
       switch(val) {
         case 0: sb.append("<p>"); break;
-        case 1: sb.append("</p>"); break;
-        case 2: sb.append("<!--"); break;
-        case 3: sb.append("-->"); break;
-        case 4: sb.append("&#"); break;
-        case 5: sb.append(";"); break;
-        case 6: sb.append((char)_TestUtil.nextInt(random, '0', '9')); break;
-        default:
-          sb.append((char)_TestUtil.nextInt(random, 'a', 'z'));
+        case 1: {
+          sb.append("<");
+          sb.append("    ".substring(nextInt(random, 0, 4)));
+          sb.append(randomSimpleString(random));
+          for (int j = 0 ; j < nextInt(random, 0, 10) ; ++j) {
+            sb.append(' ');
+            sb.append(randomSimpleString(random));
+            sb.append(" ".substring(nextInt(random, 0, 1)));
+            sb.append('=');
+            sb.append(" ".substring(nextInt(random, 0, 1)));
+            sb.append("\"".substring(nextInt(random, 0, 1)));
+            sb.append(randomSimpleString(random));
+            sb.append("\"".substring(nextInt(random, 0, 1)));
+          }
+          sb.append("    ".substring(nextInt(random, 0, 4)));
+          sb.append("/".substring(nextInt(random, 0, 1)));
+          sb.append(">".substring(nextInt(random, 0, 1)));
+          break;
+        }
+        case 2: {
+          sb.append("</");
+          sb.append("    ".substring(nextInt(random, 0, 4)));
+          sb.append(randomSimpleString(random));
+          sb.append("    ".substring(nextInt(random, 0, 4)));
+          sb.append(">".substring(nextInt(random, 0, 1)));
+          break;
+        }
+        case 3: sb.append(">"); break;
+        case 4: sb.append("</p>"); break;
+        case 5: sb.append("<!--"); break;
+        case 6: sb.append("<!--#"); break;
+        case 7: sb.append("<script><!-- f('"); break;
+        case 8: sb.append("</script>"); break;
+        case 9: sb.append("<?"); break;
+        case 10: sb.append("?>"); break;
+        case 11: sb.append("\""); break;
+        case 12: sb.append("\\\""); break;
+        case 13: sb.append("'"); break;
+        case 14: sb.append("\\'"); break;
+        case 15: sb.append("-->"); break;
+        case 16: {
+          sb.append("&");
+          switch(nextInt(random, 0, 2)) {
+            case 0: sb.append(randomSimpleString(random)); break;
+            case 1: sb.append(HTML_CHAR_ENTITIES[random.nextInt(HTML_CHAR_ENTITIES.length)]); break;
+          }
+          sb.append(";".substring(nextInt(random, 0, 1)));
+          break;
+        }
+        case 17: {
+          sb.append("&#");
+          if (0 == nextInt(random, 0, 1)) {
+            sb.append(nextInt(random, 0, Integer.MAX_VALUE - 1));
+            sb.append(";".substring(nextInt(random, 0, 1)));
+          }
+          break;
+        } 
+        case 18: {
+          sb.append("&#x");
+          if (0 == nextInt(random, 0, 1)) {
+            sb.append(Integer.toString(nextInt(random, 0, Integer.MAX_VALUE - 1), 16));
+            sb.append(";".substring(nextInt(random, 0, 1)));
+          }
+          break;
+        }
+          
+        case 19: sb.append(";"); break;
+        case 20: sb.append(nextInt(random, 0, Integer.MAX_VALUE - 1)); break;
+        case 21: sb.append("\n"); break;
+        case 22: sb.append("          ".substring(nextInt(random, 0, 10))); break;
+        default: sb.append(randomSimpleString(random));
       }
     }
     return sb.toString();

Modified: lucene/dev/branches/branch_3x/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/CHANGES.txt?rev=1235308&r1=1235307&r2=1235308&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/solr/CHANGES.txt Tue Jan 24 15:51:55 2012
@@ -28,6 +28,14 @@ Upgrading from Solr 3.5
 * As doGet() methods in SimplePostTool was changed to static, the client applications of this
   class need to be recompiled.
 
+* In Solr version 3.5 and earlier, HTMLStripCharFilter had known bugs in the
+  character offsets it provided, triggering e.g. exceptions in highlighting.
+  HTMLStripCharFilter has been re-implemented, addressing this and other
+  issues.  See the entry for LUCENE-3690 in the Bug Fixes section below for a
+  detailed list of changes.  For people who depend on the behavior of
+  HTMLStripCharFilter in Solr version 3.5 and earlier: the old implementation
+  (bugs and all) is preserved as LegacyHTMLStripCharFilter.
+
 New Features
 ----------------------
 * SOLR-2904: BinaryUpdateRequestHandler should be able to accept multiple update requests from
@@ -119,6 +127,47 @@ Bug Fixes
 
 * SOLR-2970: CSV ResponseWriter returns fields defined as stored=false in schema (janhoy)
 
+* LUCENE-3690, LUCENE-2208, SOLR-882, SOLR-42: Re-implemented
+  HTMLStripCharFilter as a JFlex-generated scanner and moved it to
+  lucene/contrib/analyzers/common/.  See below for a list of bug fixes and
+  other changes.  To get the same behavior as HTMLStripCharFilter in Solr
+  version 3.5 and earlier (including the bugs), use LegacyHTMLStripCharFilter,
+  which is the previous implementation.
+
+  Behavior changes from the previous version:
+
+  - Known offset bugs are fixed.
+  - The "Mark invalid" exceptions reported in SOLR-1283 are no longer
+    triggered (the bug is still present in LegacyHTMLStripCharFilter).
+  - The character entity "&apos;" is now always properly decoded.
+  - More cases of <script> tags are now properly stripped.
+  - CDATA sections are now handled properly.
+  - Valid tag name characters now include the supplementary Unicode characters
+    from Unicode character classes [:ID_Start:] and [:ID_Continue:].
+  - Uppercase character entities "&QUOT;", "&COPY;", "&GT;", "&LT;", "&REG;",
+    and "&AMP;" are now recognized and handled as if they were in lowercase.
+  - The REPLACEMENT CHARACTER U+FFFD is now used to replace numeric character
+    entities for unpaired UTF-16 low and high surrogates (in the range
+    [U+D800-U+DFFF]).
+  - Properly paired numeric character entities for UTF-16 surrogates are now
+    converted to the corresponding code units.
+  - Opening tags with unbalanced quotation marks are now properly stripped.
+  - Literal "<" and ">" characters in opening tags, regardless of whether they
+    appear inside quotation marks, now inhibit recognition (and stripping) of
+    the tags.  The only exception to this is for values of event-handler
+    attributes, e.g. "onClick", "onLoad", "onSelect".
+  - A newline '\n' is substituted instead of a space for stripped HTML markup.
+  - Nothing is substituted for opening and closing inline tags - they are
+    simply removed.  The list of inline tags is (case insensitively): <a>,
+    <abbr>, <acronym>, <b>, <basefont>, <bdo>, <big>, <cite>, <code>, <dfn>,
+    <em>, <font>, <i>, <img>, <input>, <kbd>, <label>, <q>, <s>, <samp>,
+    <select>, <small>, <span>, <strike>, <strong>, <sub>, <sup>, <textarea>,
+    <tt>, <u>, and <var>.
+  - HTMLStripCharFilterFactory now handles HTMLStripCharFilter's "escapedTags"
+    feature: opening and closing tags with the given names, including any
+    attributes and their values, are left intact in the output.
+  (Steve Rowe)
+
 * LUCENE-3717: Fixed offset bugs in TrimFilter, WordDelimiterFilter, and
   HyphenatedWordsFilter where they would create invalid offsets in
   some situations, leading to problems in highlighting.  (Robert Muir)