You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2012/01/24 16:51:57 UTC
svn commit: r1235308 [4/5] - in /lucene/dev/branches/branch_3x: lucene/
lucene/contrib/analyzers/common/
lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/
lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis...
Copied: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java (from r1234452, lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/HTMLStripCharFilterTest.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java?p2=lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java&p1=lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/HTMLStripCharFilterTest.java&r1=1234452&r2=1235308&rev=1235308&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/HTMLStripCharFilterTest.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java Tue Jan 24 15:51:55 2012
@@ -1,4 +1,4 @@
-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.charfilter;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -18,24 +18,23 @@ package org.apache.solr.analysis;
*/
import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileReader;
import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
+import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharReader;
-import org.apache.lucene.analysis.ReusableAnalyzerBase;
-
import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.ReusableAnalyzerBase;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents;
import org.apache.lucene.analysis.Tokenizer;
-import org.junit.Ignore;
-
-import org.apache.solr.SolrTestCaseJ4;
+import org.apache.lucene.util._TestUtil;
public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
@@ -45,9 +44,9 @@ public class HTMLStripCharFilterTest ext
String html = "<div class=\"foo\">this is some text</div> here is a <a href=\"#bar\">link</a> and " +
"another <a href=\"http://lucene.apache.org/\">link</a>. " +
"This is an entity: &amp; plus a &lt;. Here is an &. <!-- is a comment -->";
- String gold = " this is some text here is a link and " +
- "another link . " +
- "This is an entity: & plus a <. Here is an &. ";
+ String gold = "\nthis is some text\n here is a link and " +
+ "another link. " +
+ "This is an entity: & plus a <. Here is an &. ";
HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new StringReader(html)));
StringBuilder builder = new StringBuilder();
int ch = -1;
@@ -60,13 +59,14 @@ public class HTMLStripCharFilterTest ext
+ " Buffer so far: " + builder + "<EOB>", theChar == goldArray[position]);
position++;
}
- assertEquals(gold, builder.toString());
+ assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+ gold, builder.toString());
}
//Some sanity checks, but not a full-fledged check
public void testHTML() throws Exception {
-
- HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new FileReader(SolrTestCaseJ4.getFile("htmlStripReaderTest.html"))));
+ InputStream stream = getClass().getResourceAsStream("htmlStripReaderTest.html");
+ HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new InputStreamReader(stream, "UTF-8")));
StringBuilder builder = new StringBuilder();
int ch = -1;
while ((ch = reader.read()) != -1){
@@ -81,6 +81,24 @@ public class HTMLStripCharFilterTest ext
}
+ public void testMSWord14GeneratedHTML() throws Exception {
+ InputStream stream = getClass().getResourceAsStream("MS-Word 14 generated.htm");
+ HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new InputStreamReader(stream, "UTF-8")));
+ String gold = "This is a test";
+ StringBuilder builder = new StringBuilder();
+ int ch = 0;
+ try {
+ while ((ch = reader.read()) != -1){
+ builder.append((char)ch);
+ }
+ } finally {
+ // System.out.println("String: " + builder.toString());
+ }
+ assertEquals("'" + builder.toString().trim() + "' is not equal to '" + gold + "'",
+ gold, builder.toString().trim());
+ }
+
+
public void testGamma() throws Exception {
String test = "&Gamma;";
String gold = "\u0393";
@@ -93,9 +111,7 @@ public class HTMLStripCharFilterTest ext
builder.append((char)ch);
}
String result = builder.toString();
- // System.out.println("Resu: " + result + "<EOL>");
- // System.out.println("Gold: " + gold + "<EOL>");
- assertTrue(result + " is not equal to " + gold + "<EOS>", result.equals(gold) == true);
+ assertEquals("'" + result + "' is not equal to '" + gold + "<EOS>'", gold, result);
}
public void testEntities() throws Exception {
@@ -110,9 +126,7 @@ public class HTMLStripCharFilterTest ext
builder.append((char)ch);
}
String result = builder.toString();
- // System.out.println("Resu: " + result + "<EOL>");
- // System.out.println("Gold: " + gold + "<EOL>");
- assertTrue(result + " is not equal to " + gold + "<EOS>", result.equals(gold) == true);
+ assertEquals("'" + result + "' is not equal to '" + gold + "<EOS>'", gold, result);
}
public void testMoreEntities() throws Exception {
@@ -127,9 +141,7 @@ public class HTMLStripCharFilterTest ext
builder.append((char)ch);
}
String result = builder.toString();
- // System.out.println("Resu: " + result + "<EOL>");
- // System.out.println("Gold: " + gold + "<EOL>");
- assertTrue(result + " is not equal to " + gold, result.equals(gold) == true);
+ assertEquals("'" + result + "' is not equal to '" + gold + "<EOS>'", gold, result);
}
public void testReserved() throws Exception {
@@ -151,45 +163,248 @@ public class HTMLStripCharFilterTest ext
}
public void testMalformedHTML() throws Exception {
- String test = "a <a hr<ef=aa<a>> </close</a>";
- String gold = "a <a hr<ef=aa > </close ";
- Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
- StringBuilder builder = new StringBuilder();
- int ch = 0;
- while ((ch = reader.read()) != -1){
- builder.append((char)ch);
+ String[] testGold = {
+ "a <a hr<ef=aa<a>> </close</a>",
+ "a <a hr<ef=aa> </close",
+
+ "<a href=http://dmoz.org/cgi-bin/add.cgi?where=/arts/\" class=lu style=\"font-size: 9px\" target=dmoz>Submit a Site</a>",
+ "Submit a Site",
+
+ "<a href=javascript:ioSwitch('p8','http://www.csmonitor.com/') title=expand id=e8 class=expanded rel=http://www.csmonitor.com/>Christian Science",
+ "Christian Science",
+
+ "<link rel=\"alternate\" type=\"application/rss+xml\" title=\"San Francisco \" 2008 RSS Feed\" href=\"http://2008.sf.wordcamp.org/feed/\" />",
+ "\n",
+
+ // "<" before ">" inhibits tag recognition
+ "<a href=\" http://www.surgery4was.happyhost.org/video-of-arthroscopic-knee-surgery symptoms.html, heat congestive heart failure <a href=\" http://www.symptoms1bad.happyhost.org/canine",
+ "<a href=\" http://www.surgery4was.happyhost.org/video-of-arthroscopic-knee-surgery symptoms.html, heat congestive heart failure <a href=\" http://www.symptoms1bad.happyhost.org/canine",
+
+ "<a href=\"http://ucblibraries.colorado.edu/how/index.htm\"class=\"pageNavAreaText\">",
+ "",
+
+ "<link title=\"^\\\" 21Sta's Blog\" rel=\"search\" type=\"application/opensearchdescription+xml\" href=\"http://21sta.com/blog/inc/opensearch.php\" />",
+ "\n",
+
+ "<a href=\"#postcomment\" title=\"\"Leave a comment\";\">?",
+ "?",
+
+ "<a href='/modern-furniture' ' id='21txt' class='offtab' onMouseout=\"this.className='offtab'; return true;\" onMouseover=\"this.className='ontab'; return true;\">",
+ "",
+
+ "<a href='http://alievi.wordpress.com/category/01-todos-posts/' style='font-size: 275%; padding: 1px; margin: 1px;' title='01 - Todos Post's (83)'>",
+ "",
+
+ "The <a href=<a href=\"http://www.advancedmd.com>medical\">http://www.advancedmd.com>medical</a> practice software</a>",
+ "The <a href=medical\">http://www.advancedmd.com>medical practice software",
+
+ "<a href=\"node/21426\" class=\"clipTitle2\" title=\"Levi.com/BMX 2008 Clip of the Week 29 \"Morgan Wade Leftover Clips\"\">Levi.com/BMX 2008 Clip of the Week 29...",
+ "Levi.com/BMX 2008 Clip of the Week 29...",
+
+ "<a href=\"printer_friendly.php?branch=&year=&submit=go&screen=\";\">Printer Friendly",
+ "Printer Friendly",
+
+ "<a href=#\" ondragstart=\"return false\" onclick=\"window.external.AddFavorite('http://www.amazingtextures.com', 'Amazing Textures');return false\" onmouseover=\"window.status='Add to Favorites';return true\">Add to Favorites",
+ "Add to Favorites",
+
+ "<a href=\"../at_home/at_home_search.html\"../_home/at_home_search.html\">At",
+ "At",
+
+ "E-mail: <a href=\"\"mailto:XXXXXX@example.com\" \">XXXXXX@example.com </a>",
+ "E-mail: XXXXXX@example.com ",
+
+ "<li class=\"farsi\"><a title=\"A'13?\" alt=\"A'13?\" href=\"http://www.america.gov/persian\" alt=\"\" name=\"A'13?\"A'13? title=\"A'13?\">A'13?</a></li>",
+ "\nA'13?\n",
+
+ "<li><a href=\"#28\" title=\"Hubert \"Geese\" Ausby\">Hubert \"Geese\" Ausby</a></li>",
+ "\nHubert \"Geese\" Ausby\n",
+
+ "<href=\"http://anbportal.com/mms/login.asp\">",
+ "\n",
+
+ "<a href=\"",
+ "<a href=\"",
+
+ "<a href=\">",
+ "",
+
+ "<a rel=\"nofollow\" href=\"http://anissanina31.skyrock.com/1895039493-Hi-tout-le-monde.html\" title=\" Hi, tout le monde !>#</a>",
+ "#",
+
+ "<a href=\"http://annunciharleydavidsonusate.myblog.it/\" title=\"Annunci Moto e Accessori Harley Davidson\" target=\"_blank\"><img src=\"http://annunciharleydavidsonusate.myblog.it/images/Antipixel.gif\" /></a>",
+ "",
+
+ "<a href=\"video/addvideo&v=120838887181\" onClick=\"return confirm('Are you sure you want add this video to your profile? If it exists some video in your profile will be overlapped by this video!!')\" \" onmouseover=\"this.className='border2'\" onmouseout=\"this.className=''\">",
+ "",
+
+ "<a href=#Services & Support>",
+ "",
+
+ // "<" and ">" chars are accepted in on[Event] attribute values
+ "<input type=\"image\" src=\"http://apologyindex.com/ThemeFiles/83401-72905/images/btn_search.gif\"value=\"Search\" name=\"Search\" alt=\"Search\" class=\"searchimage\" onclick=\"incom ='&sc=' + document.getElementById('sel').value ; var dt ='&dt=' + document.getElementById('dt').value; var searchKeyword = document.getElementById('q').value ; searchKeyword = searchKeyword.replace(/\\s/g,''); if (searchKeyword.length < 3){alert('Nothing to search. Search keyword should contain atleast 3 chars.'); return false; } var al='&al=' + document.getElementById('advancedlink').style.display ; document.location.href='http://apologyindex.com/search.aspx?q=' + document.getElementById('q').value + incom + dt + al;\" />",
+ "",
+
+ "<input type=\"image\" src=\"images/afbe.gif\" width=\"22\" height=\"22\" hspace=\"4\" title=\"Add to Favorite\" alt=\"Add to Favorite\"onClick=\" if(window.sidebar){ window.sidebar.addPanel(document.title,location.href,''); }else if(window.external){ window.external.AddFavorite(location.href,document.title); }else if(window.opera&&window.print) { return true; }\">",
+ "",
+
+ "<area shape=\"rect\" coords=\"12,153,115,305\" href=\"http://statenislandtalk.com/v-web/gallery/Osmundsen-family\"Art's Norwegian Roots in Rogaland\">",
+ "\n",
+
+ "<a rel=\"nofollow\" href=\"http://arth26.skyrock.com/660188240-bonzai.html\" title=\"bonza>#",
+ "#",
+
+ "<a href= >",
+ "",
+
+ "<ahref=http:..",
+ "<ahref=http:..",
+
+ "<ahref=http:..>",
+ "\n",
+
+ "<ahref=\"http://aseigo.bddf.ca/cms/1025\">A",
+ "\nA",
+
+ "<a href=\"javascript:calendar_window=window.open('/calendar.aspx?formname=frmCalendar.txtDate','calendar_window','width=154,height=188');calendar_window.focus()\">",
+ "",
+
+ "<a href=\"/applications/defenseaerospace/19+rackmounts\" title=\"19\" Rackmounts\">",
+ "",
+
+ "<a href=http://www.azimprimerie.fr/flash/backup/lewes-zip-code/savage-model-110-manual.html title=savage model 110 manual rel=dofollow>",
+ "",
+
+ "<a class=\"at\" name=\"Lamborghini href=\"http://lamborghini.coolbegin.com\">Lamborghini /a>",
+ "Lamborghini /a>",
+
+ "<A href='newslink.php?news_link=http%3A%2F%2Fwww.worldnetdaily.com%2Findex.php%3Ffa%3DPAGE.view%26pageId%3D85729&news_title=Florida QB makes 'John 3:16' hottest Google search Tebow inscribed Bible reference on eye black for championship game' TARGET=_blank>",
+ "",
+
+ "<a href=/myspace !style='color:#993333'>",
+ "",
+
+ "<meta name=3DProgId content=3DExcel.Sheet>",
+ "\n",
+
+ "<link id=3D\"shLink\" href=3D\"PSABrKelly-BADMINTONCupResults08FINAL2008_09_19=_files/sheet004.htm\">",
+ "\n",
+
+ "<td bgcolor=3D\"#FFFFFF\" nowrap>",
+ "\n",
+
+ "<a href=\"http://basnect.info/usersearch/\"predicciones-mundiales-2009\".html\">\"predicciones mundiales 2009\"</a>",
+ "\"predicciones mundiales 2009\"",
+
+ "<a class=\"comment-link\" href=\"https://www.blogger.com/comment.g?blogID=19402125&postID=114070605958684588\"location.href=https://www.blogger.com/comment.g?blogID=19402125&postID=114070605958684588;>",
+ "",
+
+ "<a href = \"/videos/Bishop\"/\" title = \"click to see more Bishop\" videos\">Bishop\"</a>",
+ "Bishop\"",
+
+ "<a href=\"http://bhaa.ie/calendar/event.php?eid=20081203150127531\"\">BHAA Eircom 2 & 5 miles CC combined start</a>",
+ "BHAA Eircom 2 & 5 miles CC combined start",
+
+ "<a href=\"http://people.tribe.net/wolfmana\" onClick='setClick(\"Application[tribe].Person[bb7df210-9dc0-478c-917f-436b896bcb79]\")'\" title=\"Mana\">",
+ "",
+
+ "<a href=\"http://blog.edu-cyberpg.com/ct.ashx?id=6143c528-080c-4bb2-b765-5ec56c8256d3&url=http%3a%2f%2fwww.gsa.ac.uk%2fmackintoshsketchbook%2f\"\" eudora=\"autourl\">",
+ "",
+
+ // "<" before ">" inhibits tag recognition
+ "<input type=\"text\" value=\"<search here>\">",
+ "<input type=\"text\" value=\"\n\">",
+
+ "<input type=\"text\" value=\"<search here\">",
+ "<input type=\"text\" value=\"\n",
+
+ "<input type=\"text\" value=\"search here>\">",
+ "\">",
+
+ // "<" and ">" chars are accepted in on[Event] attribute values
+ "<input type=\"text\" value=\"<search here>\" onFocus=\"this.value='<search here>'\">",
+ "",
+
+ "<![if ! IE]>\n<link href=\"http://i.deviantart.com/icons/favicon.png\" rel=\"shortcut icon\"/>\n<![endif]>",
+ "\n\n\n",
+
+ "<![if supportMisalignedColumns]>\n<tr height=0 style='display:none'>\n<td width=64 style='width:48pt'></td>\n</tr>\n<![endif]>",
+ "\n\n\n\n\n\n\n\n",
+ };
+ for (int i = 0 ; i < testGold.length ; i += 2) {
+ String test = testGold[i];
+ String gold = testGold[i + 1];
+ Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+ StringBuilder builder = new StringBuilder();
+ int ch = 0;
+ while ((ch = reader.read()) != -1){
+ builder.append((char)ch);
+ }
+ String result = builder.toString();
+ assertEquals("Test: '" + test + "'", gold, result);
}
- String result = builder.toString();
- // System.out.println("Resu: " + result + "<EOL>");
- // System.out.println("Gold: " + gold + "<EOL>");
- assertTrue(result + " is not equal to " + gold + "<EOS>", result.equals(gold) == true);
}
+
public void testBufferOverflow() throws Exception {
- StringBuilder testBuilder = new StringBuilder(HTMLStripCharFilter.DEFAULT_READ_AHEAD + 50);
+ StringBuilder testBuilder = new StringBuilder(HTMLStripCharFilter.getInitialBufferSize() + 50);
testBuilder.append("ah<?> ??????");
- appendChars(testBuilder, HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
+ appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
processBuffer(testBuilder.toString(), "Failed on pseudo proc. instr.");//processing instructions
testBuilder.setLength(0);
testBuilder.append("<!--");//comments
- appendChars(testBuilder, 3*HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);//comments have two lookaheads
+ appendChars(testBuilder, 3 * HTMLStripCharFilter.getInitialBufferSize() + 500);//comments have two lookaheads
testBuilder.append("-->foo");
- processBuffer(testBuilder.toString(), "Failed w/ comment");
+ String gold = "foo";
+ Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(testBuilder.toString())));
+ int ch = 0;
+ StringBuilder builder = new StringBuilder();
+ try {
+ while ((ch = reader.read()) != -1){
+ builder.append((char)ch);
+ }
+ } finally {
+ // System.out.println("String: " + builder.toString());
+ }
+ assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+ gold, builder.toString());
testBuilder.setLength(0);
testBuilder.append("<?");
- appendChars(testBuilder, HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
+ appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
testBuilder.append("?>");
- processBuffer(testBuilder.toString(), "Failed with proc. instr.");
+ gold = "";
+ reader = new HTMLStripCharFilter(CharReader.get(new StringReader(testBuilder.toString())));
+ ch = 0;
+ builder = new StringBuilder();
+ try {
+ while ((ch = reader.read()) != -1){
+ builder.append((char)ch);
+ }
+ } finally {
+ // System.out.println("String: " + builder.toString());
+ }
+ assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+ gold, builder.toString());
testBuilder.setLength(0);
testBuilder.append("<b ");
- appendChars(testBuilder, HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
+ appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
testBuilder.append("/>");
- processBuffer(testBuilder.toString(), "Failed on tag");
-
+ gold = "";
+ reader = new HTMLStripCharFilter(CharReader.get(new StringReader(testBuilder.toString())));
+ ch = 0;
+ builder = new StringBuilder();
+ try {
+ while ((ch = reader.read()) != -1){
+ builder.append((char)ch);
+ }
+ } finally {
+ // System.out.println("String: " + builder.toString());
+ }
+ assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+ gold, builder.toString());
}
private void appendChars(StringBuilder testBuilder, int numChars) {
@@ -212,13 +427,14 @@ public class HTMLStripCharFilterTest ext
} finally {
// System.out.println("String (trimmed): " + builder.toString().trim() + "<EOS>");
}
- assertTrue(assertMsg + "::: " + builder.toString() + " is not equal to " + test, builder.toString().equals(test) == true);
+ assertEquals(assertMsg + "::: " + builder.toString() + " is not equal to " + test,
+ test, builder.toString());
}
public void testComment() throws Exception {
String test = "<!--- three dashes, still a valid comment ---> ";
- String gold = " ";
+ String gold = " ";
Reader reader = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(test))));//force the use of BufferedReader
int ch = 0;
StringBuilder builder = new StringBuilder();
@@ -229,7 +445,8 @@ public class HTMLStripCharFilterTest ext
} finally {
// System.out.println("String: " + builder.toString());
}
- assertTrue(builder.toString() + " is not equal to " + gold + "<EOS>", builder.toString().equals(gold) == true);
+ assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+ gold, builder.toString());
}
@@ -251,15 +468,32 @@ public class HTMLStripCharFilterTest ext
}
public void testOffsets() throws Exception {
- doTestOffsets("hello X how X are you");
+// doTestOffsets("hello X how X are you");
doTestOffsets("hello <p> X<p> how <p>X are you");
doTestOffsets("X & X ( X < > X");
// test backtracking
doTestOffsets("X < &zz >X &# < X > < &l > &g < X");
}
-
- @Ignore("broken offsets: see LUCENE-2208")
+
+ static void assertLegalOffsets(String in) throws Exception {
+ int length = in.length();
+ HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(in))));
+ int ch = 0;
+ int off = 0;
+ while ((ch = reader.read()) != -1) {
+ int correction = reader.correctOffset(off);
+ assertTrue("invalid offset correction: " + off + "->" + correction + " for doc of length: " + length,
+ correction <= length);
+ off++;
+ }
+ }
+
+ public void testLegalOffsets() throws Exception {
+ assertLegalOffsets("hello world");
+ assertLegalOffsets("hello &#x world");
+ }
+
public void testRandom() throws Exception {
Analyzer analyzer = new ReusableAnalyzerBase() {
@@ -271,11 +505,361 @@ public class HTMLStripCharFilterTest ext
@Override
protected Reader initReader(Reader reader) {
- return new HTMLStripCharFilter(CharReader.get(new BufferedReader(reader)));
+ return new HTMLStripCharFilter(CharReader.get(reader));
}
};
int numRounds = RANDOM_MULTIPLIER * 10000;
checkRandomData(random, analyzer, numRounds);
}
+
+ public void testServerSideIncludes() throws Exception {
+ String test = "one<img src=\"image.png\"\n"
+ + " alt = \"Alt: <!--#echo var='${IMAGE_CAPTION:<!--comment-->\\'Comment\\'}' -->\"\n\n"
+ + " title=\"Title: <!--#echo var=\"IMAGE_CAPTION\"-->\">two";
+ String gold = "onetwo";
+ Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+ int ch = 0;
+ StringBuilder builder = new StringBuilder();
+ try {
+ while ((ch = reader.read()) != -1){
+ builder.append((char)ch);
+ }
+ } finally {
+ // System.out.println("String: " + builder.toString());
+ }
+ assertTrue(builder.toString() + " is not equal to " + gold, builder.toString().equals(gold));
+
+ test = "one<script><!-- <!--#config comment=\"<!-- \\\"comment\\\"-->\"--> --></script>two";
+ gold = "one\ntwo";
+ reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+ ch = 0;
+ builder = new StringBuilder();
+ try {
+ while ((ch = reader.read()) != -1){
+ builder.append((char)ch);
+ }
+ } finally {
+ // System.out.println("String: " + builder.toString());
+ }
+ assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+ gold, builder.toString());
+ }
+
+ public void testScriptQuotes() throws Exception {
+ String test = "one<script attr= bare><!-- action('<!-- comment -->', \"\\\"-->\\\"\"); --></script>two";
+ String gold = "one\ntwo";
+ Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+ int ch = 0;
+ StringBuilder builder = new StringBuilder();
+ try {
+ while ((ch = reader.read()) != -1){
+ builder.append((char)ch);
+ }
+ } finally {
+ // System.out.println("String: " + builder.toString());
+ }
+ assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+ gold, builder.toString());
+
+ test = "hello<script><!-- f('<!--internal--></script>'); --></script>";
+ gold = "hello\n";
+ reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+ ch = 0;
+ builder = new StringBuilder();
+ try {
+ while ((ch = reader.read()) != -1){
+ builder.append((char)ch);
+ }
+ } finally {
+ // System.out.println("String: " + builder.toString());
+ }
+ assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+ gold, builder.toString());
+ }
+
+ public void testEscapeScript() throws Exception {
+ String test = "one<script no-value-attr>callSomeMethod();</script>two";
+ String gold = "one<script no-value-attr></script>two";
+ Set<String> escapedTags = new HashSet<String>(Arrays.asList("SCRIPT"));
+ Reader reader = new HTMLStripCharFilter
+ (CharReader.get(new StringReader(test)), escapedTags);
+ int ch = 0;
+ StringBuilder builder = new StringBuilder();
+ try {
+ while ((ch = reader.read()) != -1){
+ builder.append((char)ch);
+ }
+ } finally {
+ // System.out.println("String: " + builder.toString());
+ }
+ assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+ gold, builder.toString());
+ }
+
+ public void testStyle() throws Exception {
+ String test = "one<style type=\"text/css\">\n"
+ + "<!--\n"
+ + "@import url('http://www.lasletrasdecanciones.com/css.css');\n"
+ + "-->\n"
+ + "</style>two";
+ String gold = "one\ntwo";
+ Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+ int ch = 0;
+ StringBuilder builder = new StringBuilder();
+ try {
+ while ((ch = reader.read()) != -1){
+ builder.append((char)ch);
+ }
+ } finally {
+ // System.out.println("String: " + builder.toString());
+ }
+ assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+ gold, builder.toString());
+ }
+
+ public void testEscapeStyle() throws Exception {
+ String test = "one<style type=\"text/css\"> body,font,a { font-family:arial; } </style>two";
+ String gold = "one<style type=\"text/css\"></style>two";
+ Set<String> escapedTags = new HashSet<String>(Arrays.asList("STYLE"));
+ Reader reader = new HTMLStripCharFilter
+ (CharReader.get(new StringReader(test)), escapedTags);
+ int ch = 0;
+ StringBuilder builder = new StringBuilder();
+ try {
+ while ((ch = reader.read()) != -1){
+ builder.append((char)ch);
+ }
+ } finally {
+ // System.out.println("String: " + builder.toString());
+ }
+ assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+ gold, builder.toString());
+ }
+
+ public void testBR() throws Exception {
+ String[] testGold = {
+ "one<BR />two<br>three",
+ "one\ntwo\nthree",
+
+ "one<BR some stuff here too>two</BR>",
+ "one\ntwo\n",
+ };
+ for (int i = 0 ; i < testGold.length ; i += 2) {
+ String test = testGold[i];
+ String gold = testGold[i + 1];
+ Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+ StringBuilder builder = new StringBuilder();
+ int ch = 0;
+ while ((ch = reader.read()) != -1){
+ builder.append((char)ch);
+ }
+ String result = builder.toString();
+ assertEquals("Test: '" + test + "'", gold, result);
+ }
+ }
+ public void testEscapeBR() throws Exception {
+ String test = "one<BR class='whatever'>two</\nBR\n>";
+ String gold = "one<BR class='whatever'>two</\nBR\n>";
+ Set<String> escapedTags = new HashSet<String>(Arrays.asList("BR"));
+ Reader reader = new HTMLStripCharFilter
+ (CharReader.get(new StringReader(test)), escapedTags);
+ int ch = 0;
+ StringBuilder builder = new StringBuilder();
+ try {
+ while ((ch = reader.read()) != -1){
+ builder.append((char)ch);
+ }
+ } finally {
+ // System.out.println("String: " + builder.toString());
+ }
+ assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+ gold, builder.toString());
+ }
+
+ public void testInlineTagsNoSpace() throws Exception {
+ String test = "one<sPAn class=\"invisible\">two<sup>2<sup>e</sup></sup>.</SpaN>three";
+ String gold = "onetwo2e.three";
+ Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+ int ch = 0;
+ StringBuilder builder = new StringBuilder();
+ try {
+ while ((ch = reader.read()) != -1){
+ builder.append((char)ch);
+ }
+ } finally {
+ // System.out.println("String: " + builder.toString());
+ }
+ assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+ gold, builder.toString());
+ }
+
+ public void testCDATA() throws Exception {
+ String test = "one<![CDATA[<one><two>three<four></four></two></one>]]>two";
+ String gold = "one<one><two>three<four></four></two></one>two";
+ Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+ int ch = 0;
+ StringBuilder builder = new StringBuilder();
+ try {
+ while ((ch = reader.read()) != -1){
+ builder.append((char)ch);
+ }
+ } finally {
+ // System.out.println("String: " + builder.toString());
+ }
+ assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+ gold, builder.toString());
+
+ test = "one<![CDATA[two<![CDATA[three]]]]><![CDATA[>four]]>five";
+ gold = "onetwo<![CDATA[three]]>fourfive";
+ reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+ ch = 0;
+ builder = new StringBuilder();
+ try {
+ while ((ch = reader.read()) != -1){
+ builder.append((char)ch);
+ }
+ } finally {
+ // System.out.println("String: " + builder.toString());
+ }
+ assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+ gold, builder.toString());
+ }
+
+ public void testUppercaseCharacterEntityVariants() throws Exception {
+ String test = " &QUOT;-&COPY;&GT;>&LT;<&REG;&AMP;";
+ String gold = " \"-\u00A9>><<\u00AE&";
+ Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+ int ch = 0;
+ StringBuilder builder = new StringBuilder();
+ try {
+ while ((ch = reader.read()) != -1){
+ builder.append((char)ch);
+ }
+ } finally {
+ // System.out.println("String: " + builder.toString());
+ }
+ assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+ gold, builder.toString());
+ }
+
+ public void testMSWordMalformedProcessingInstruction() throws Exception {
+ String test = "one<?xml:namespace prefix = o ns = \"urn:schemas-microsoft-com:office:office\" />two";
+ String gold = "onetwo";
+ Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+ int ch = 0;
+ StringBuilder builder = new StringBuilder();
+ try {
+ while ((ch = reader.read()) != -1){
+ builder.append((char)ch);
+ }
+ } finally {
+ // System.out.println("String: " + builder.toString());
+ }
+ assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+ gold, builder.toString());
+ }
+
+ public void testSupplementaryCharsInTags() throws Exception {
+ String test = "one<𩬅艱鍟䇹愯瀛>two<瀛愯𩬅>three 瀛愯𩬅</瀛愯𩬅>four</𩬅艱鍟䇹愯瀛>five<𠀀𠀀>six<𠀀𠀀/>seven";
+ String gold = "one\ntwo\nthree 瀛愯𩬅\nfour\nfive\nsix\nseven";
+ Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+ int ch = 0;
+ StringBuilder builder = new StringBuilder();
+ try {
+ while ((ch = reader.read()) != -1){
+ builder.append((char)ch);
+ }
+ } finally {
+ // System.out.println("String: " + builder.toString());
+ }
+ assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+ gold, builder.toString());
+ }
+
+ public void testRandomBrokenHTML() throws Exception {
+ int maxNumElements = 10000;
+ String text = _TestUtil.randomHtmlishString(random, maxNumElements);
+ Reader reader = new HTMLStripCharFilter
+ (CharReader.get(new StringReader(text)));
+ while (reader.read() != -1);
+ }
+
+ public void testRandomText() throws Exception {
+ StringBuilder text = new StringBuilder();
+ int minNumWords = 10;
+ int maxNumWords = 10000;
+ int minWordLength = 3;
+ int maxWordLength = 20;
+ int numWords = _TestUtil.nextInt(random, minNumWords, maxNumWords);
+ switch (_TestUtil.nextInt(random, 0, 4)) {
+ case 0: {
+ for (int wordNum = 0 ; wordNum < numWords ; ++wordNum) {
+ text.append(_TestUtil.randomUnicodeString(random, maxWordLength));
+ text.append(' ');
+ }
+ break;
+ }
+ case 1: {
+ for (int wordNum = 0 ; wordNum < numWords ; ++wordNum) {
+ text.append(_TestUtil.randomRealisticUnicodeString
+ (random, minWordLength, maxWordLength));
+ text.append(' ');
+ }
+ break;
+ }
+ default: { // ASCII 50% of the time
+ for (int wordNum = 0 ; wordNum < numWords ; ++wordNum) {
+ text.append(_TestUtil.randomSimpleString(random));
+ text.append(' ');
+ }
+ }
+ }
+ Reader reader = new HTMLStripCharFilter
+ (CharReader.get(new StringReader(text.toString())));
+ while (reader.read() != -1);
+ }
+
+ public void testUTF16Surrogates() throws Exception {
+ Analyzer analyzer = new ReusableAnalyzerBase() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ return new TokenStreamComponents(tokenizer, tokenizer);
+ }
+
+ @Override
+ protected Reader initReader(Reader reader) {
+ return new HTMLStripCharFilter(CharReader.get(new BufferedReader(reader)));
+ }
+ };
+ // Paired surrogates
+ assertAnalyzesTo(analyzer, " one two ��three",
+ new String[] { "one", "two", "\uD86C\uDC01three" } );
+ assertAnalyzesTo(analyzer, " ��", new String[] { "\uD86C\uDC01" } );
+ assertAnalyzesTo(analyzer, " ��", new String[] { "\uD86C\uDC01" } );
+ assertAnalyzesTo(analyzer, " ��", new String[] { "\uD86C\uDC01" } );
+
+ // Improperly paired surrogates
+ assertAnalyzesTo(analyzer, " �", new String[] { "\uFFFD\uE28F" } );
+ assertAnalyzesTo(analyzer, " �", new String[] { "\uFFFD\uE28F" } );
+ assertAnalyzesTo(analyzer, " 훚�", new String[] { "\uD6DA\uFFFD" } );
+ assertAnalyzesTo(analyzer, " 훚�", new String[] { "\uD6DA\uFFFD" } );
+
+ // Unpaired high surrogates
+ assertAnalyzesTo(analyzer, " �", new String[] { "\uFFFD" } );
+ assertAnalyzesTo(analyzer, " �", new String[] { "\uFFFD" } );
+ assertAnalyzesTo(analyzer, " �<br>", new String[] { "�" } );
+ assertAnalyzesTo(analyzer, " �", new String[] { "\uFFFD" } );
+ assertAnalyzesTo(analyzer, " �", new String[] { "\uFFFD" } );
+ assertAnalyzesTo(analyzer, " �<br>", new String[] { "�" } );
+
+ // Unpaired low surrogates
+ assertAnalyzesTo(analyzer, " �", new String[] { "\uFFFD" } );
+ assertAnalyzesTo(analyzer, " �", new String[] { "\uFFFD" } );
+ assertAnalyzesTo(analyzer, " �<br>", new String[] { "�" } );
+ assertAnalyzesTo(analyzer, " �", new String[] { "\uFFFD" } );
+ assertAnalyzesTo(analyzer, " �", new String[] { "\uFFFD" } );
+ assertAnalyzesTo(analyzer, " �<br>", new String[] { "�" } );
+ }
}
Added: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/charfilter/MS-Word 14 generated.htm
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/charfilter/MS-Word%2014%20generated.htm?rev=1235308&view=auto
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/charfilter/MS-Word 14 generated.htm (added)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/charfilter/MS-Word 14 generated.htm Tue Jan 24 15:51:55 2012
@@ -0,0 +1,653 @@
+<html xmlns:v="urn:schemas-microsoft-com:vml"
+ xmlns:o="urn:schemas-microsoft-com:office:office"
+ xmlns:w="urn:schemas-microsoft-com:office:word"
+ xmlns:m="http://schemas.microsoft.com/office/2004/12/omml"
+ xmlns="http://www.w3.org/TR/REC-html40">
+
+<head>
+<meta http-equiv=Content-Type content="text/html; charset=windows-1252">
+<meta name=ProgId content=Word.Document>
+<meta name=Generator content="Microsoft Word 14">
+<meta name=Originator content="Microsoft Word 14">
+<link rel=File-List href="This%20is%20a%20test_files/filelist.xml">
+<!--[if gte mso 9]><xml>
+ <o:DocumentProperties>
+ <o:Author>s</o:Author>
+ <o:LastAuthor>s</o:LastAuthor>
+ <o:Revision>1</o:Revision>
+ <o:TotalTime>1</o:TotalTime>
+ <o:Created>2012-01-13T03:36:00Z</o:Created>
+ <o:LastSaved>2012-01-13T03:37:00Z</o:LastSaved>
+ <o:Pages>1</o:Pages>
+ <o:Words>8</o:Words>
+ <o:Characters>48</o:Characters>
+ <o:Lines>1</o:Lines>
+ <o:Paragraphs>1</o:Paragraphs>
+ <o:CharactersWithSpaces>55</o:CharactersWithSpaces>
+ <o:Version>14.00</o:Version>
+ </o:DocumentProperties>
+ <o:OfficeDocumentSettings>
+ <o:AllowPNG/>
+ </o:OfficeDocumentSettings>
+</xml><![endif]-->
+<link rel=themeData href="This%20is%20a%20test_files/themedata.thmx">
+<link rel=colorSchemeMapping
+ href="This%20is%20a%20test_files/colorschememapping.xml">
+<!--[if gte mso 9]><xml>
+ <w:WordDocument>
+ <w:SpellingState>Clean</w:SpellingState>
+ <w:GrammarState>Clean</w:GrammarState>
+ <w:TrackMoves>false</w:TrackMoves>
+ <w:TrackFormatting/>
+ <w:PunctuationKerning/>
+ <w:ValidateAgainstSchemas/>
+ <w:SaveIfXMLInvalid>false</w:SaveIfXMLInvalid>
+ <w:IgnoreMixedContent>false</w:IgnoreMixedContent>
+ <w:AlwaysShowPlaceholderText>false</w:AlwaysShowPlaceholderText>
+ <w:DoNotPromoteQF/>
+ <w:LidThemeOther>EN-US</w:LidThemeOther>
+ <w:LidThemeAsian>X-NONE</w:LidThemeAsian>
+ <w:LidThemeComplexScript>X-NONE</w:LidThemeComplexScript>
+ <w:Compatibility>
+ <w:BreakWrappedTables/>
+ <w:SnapToGridInCell/>
+ <w:WrapTextWithPunct/>
+ <w:UseAsianBreakRules/>
+ <w:DontGrowAutofit/>
+ <w:SplitPgBreakAndParaMark/>
+ <w:EnableOpenTypeKerning/>
+ <w:DontFlipMirrorIndents/>
+ <w:OverrideTableStyleHps/>
+ </w:Compatibility>
+ <m:mathPr>
+ <m:mathFont m:val="Cambria Math"/>
+ <m:brkBin m:val="before"/>
+ <m:brkBinSub m:val="--"/>
+ <m:smallFrac m:val="off"/>
+ <m:dispDef/>
+ <m:lMargin m:val="0"/>
+ <m:rMargin m:val="0"/>
+ <m:defJc m:val="centerGroup"/>
+ <m:wrapIndent m:val="1440"/>
+ <m:intLim m:val="subSup"/>
+ <m:naryLim m:val="undOvr"/>
+ </m:mathPr></w:WordDocument>
+</xml><![endif]--><!--[if gte mso 9]><xml>
+<w:LatentStyles DefLockedState="false" DefUnhideWhenUsed="true"
+ DefSemiHidden="true" DefQFormat="false" DefPriority="99"
+ LatentStyleCount="267">
+<w:LsdException Locked="false" Priority="0" SemiHidden="false"
+ UnhideWhenUsed="false" QFormat="true" Name="Normal"/>
+<w:LsdException Locked="false" Priority="9" SemiHidden="false"
+ UnhideWhenUsed="false" QFormat="true" Name="heading 1"/>
+<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 2"/>
+<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 3"/>
+<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 4"/>
+<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 5"/>
+<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 6"/>
+<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 7"/>
+<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 8"/>
+<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 9"/>
+<w:LsdException Locked="false" Priority="39" Name="toc 1"/>
+<w:LsdException Locked="false" Priority="39" Name="toc 2"/>
+<w:LsdException Locked="false" Priority="39" Name="toc 3"/>
+<w:LsdException Locked="false" Priority="39" Name="toc 4"/>
+<w:LsdException Locked="false" Priority="39" Name="toc 5"/>
+<w:LsdException Locked="false" Priority="39" Name="toc 6"/>
+<w:LsdException Locked="false" Priority="39" Name="toc 7"/>
+<w:LsdException Locked="false" Priority="39" Name="toc 8"/>
+<w:LsdException Locked="false" Priority="39" Name="toc 9"/>
+<w:LsdException Locked="false" Priority="35" QFormat="true" Name="caption"/>
+<w:LsdException Locked="false" Priority="10" SemiHidden="false"
+ UnhideWhenUsed="false" QFormat="true" Name="Title"/>
+<w:LsdException Locked="false" Priority="1" Name="Default Paragraph Font"/>
+<w:LsdException Locked="false" Priority="11" SemiHidden="false"
+ UnhideWhenUsed="false" QFormat="true" Name="Subtitle"/>
+<w:LsdException Locked="false" Priority="22" SemiHidden="false"
+ UnhideWhenUsed="false" QFormat="true" Name="Strong"/>
+<w:LsdException Locked="false" Priority="20" SemiHidden="false"
+ UnhideWhenUsed="false" QFormat="true" Name="Emphasis"/>
+<w:LsdException Locked="false" Priority="59" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Table Grid"/>
+<w:LsdException Locked="false" UnhideWhenUsed="false" Name="Placeholder Text"/>
+<w:LsdException Locked="false" Priority="1" SemiHidden="false"
+ UnhideWhenUsed="false" QFormat="true" Name="No Spacing"/>
+<w:LsdException Locked="false" Priority="60" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Light Shading"/>
+<w:LsdException Locked="false" Priority="61" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Light List"/>
+<w:LsdException Locked="false" Priority="62" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Light Grid"/>
+<w:LsdException Locked="false" Priority="63" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium Shading 1"/>
+<w:LsdException Locked="false" Priority="64" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium Shading 2"/>
+<w:LsdException Locked="false" Priority="65" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium List 1"/>
+<w:LsdException Locked="false" Priority="66" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium List 2"/>
+<w:LsdException Locked="false" Priority="67" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium Grid 1"/>
+<w:LsdException Locked="false" Priority="68" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium Grid 2"/>
+<w:LsdException Locked="false" Priority="69" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium Grid 3"/>
+<w:LsdException Locked="false" Priority="70" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Dark List"/>
+<w:LsdException Locked="false" Priority="71" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Colorful Shading"/>
+<w:LsdException Locked="false" Priority="72" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Colorful List"/>
+<w:LsdException Locked="false" Priority="73" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Colorful Grid"/>
+<w:LsdException Locked="false" Priority="60" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Light Shading Accent 1"/>
+<w:LsdException Locked="false" Priority="61" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Light List Accent 1"/>
+<w:LsdException Locked="false" Priority="62" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Light Grid Accent 1"/>
+<w:LsdException Locked="false" Priority="63" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium Shading 1 Accent 1"/>
+<w:LsdException Locked="false" Priority="64" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium Shading 2 Accent 1"/>
+<w:LsdException Locked="false" Priority="65" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium List 1 Accent 1"/>
+<w:LsdException Locked="false" UnhideWhenUsed="false" Name="Revision"/>
+<w:LsdException Locked="false" Priority="34" SemiHidden="false"
+ UnhideWhenUsed="false" QFormat="true" Name="List Paragraph"/>
+<w:LsdException Locked="false" Priority="29" SemiHidden="false"
+ UnhideWhenUsed="false" QFormat="true" Name="Quote"/>
+<w:LsdException Locked="false" Priority="30" SemiHidden="false"
+ UnhideWhenUsed="false" QFormat="true" Name="Intense Quote"/>
+<w:LsdException Locked="false" Priority="66" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium List 2 Accent 1"/>
+<w:LsdException Locked="false" Priority="67" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium Grid 1 Accent 1"/>
+<w:LsdException Locked="false" Priority="68" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium Grid 2 Accent 1"/>
+<w:LsdException Locked="false" Priority="69" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium Grid 3 Accent 1"/>
+<w:LsdException Locked="false" Priority="70" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Dark List Accent 1"/>
+<w:LsdException Locked="false" Priority="71" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Colorful Shading Accent 1"/>
+<w:LsdException Locked="false" Priority="72" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Colorful List Accent 1"/>
+<w:LsdException Locked="false" Priority="73" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Colorful Grid Accent 1"/>
+<w:LsdException Locked="false" Priority="60" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Light Shading Accent 2"/>
+<w:LsdException Locked="false" Priority="61" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Light List Accent 2"/>
+<w:LsdException Locked="false" Priority="62" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Light Grid Accent 2"/>
+<w:LsdException Locked="false" Priority="63" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium Shading 1 Accent 2"/>
+<w:LsdException Locked="false" Priority="64" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium Shading 2 Accent 2"/>
+<w:LsdException Locked="false" Priority="65" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium List 1 Accent 2"/>
+<w:LsdException Locked="false" Priority="66" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium List 2 Accent 2"/>
+<w:LsdException Locked="false" Priority="67" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium Grid 1 Accent 2"/>
+<w:LsdException Locked="false" Priority="68" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium Grid 2 Accent 2"/>
+<w:LsdException Locked="false" Priority="69" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium Grid 3 Accent 2"/>
+<w:LsdException Locked="false" Priority="70" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Dark List Accent 2"/>
+<w:LsdException Locked="false" Priority="71" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Colorful Shading Accent 2"/>
+<w:LsdException Locked="false" Priority="72" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Colorful List Accent 2"/>
+<w:LsdException Locked="false" Priority="73" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Colorful Grid Accent 2"/>
+<w:LsdException Locked="false" Priority="60" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Light Shading Accent 3"/>
+<w:LsdException Locked="false" Priority="61" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Light List Accent 3"/>
+<w:LsdException Locked="false" Priority="62" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Light Grid Accent 3"/>
+<w:LsdException Locked="false" Priority="63" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium Shading 1 Accent 3"/>
+<w:LsdException Locked="false" Priority="64" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium Shading 2 Accent 3"/>
+<w:LsdException Locked="false" Priority="65" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium List 1 Accent 3"/>
+<w:LsdException Locked="false" Priority="66" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium List 2 Accent 3"/>
+<w:LsdException Locked="false" Priority="67" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium Grid 1 Accent 3"/>
+<w:LsdException Locked="false" Priority="68" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium Grid 2 Accent 3"/>
+<w:LsdException Locked="false" Priority="69" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium Grid 3 Accent 3"/>
+<w:LsdException Locked="false" Priority="70" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Dark List Accent 3"/>
+<w:LsdException Locked="false" Priority="71" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Colorful Shading Accent 3"/>
+<w:LsdException Locked="false" Priority="72" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Colorful List Accent 3"/>
+<w:LsdException Locked="false" Priority="73" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Colorful Grid Accent 3"/>
+<w:LsdException Locked="false" Priority="60" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Light Shading Accent 4"/>
+<w:LsdException Locked="false" Priority="61" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Light List Accent 4"/>
+<w:LsdException Locked="false" Priority="62" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Light Grid Accent 4"/>
+<w:LsdException Locked="false" Priority="63" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium Shading 1 Accent 4"/>
+<w:LsdException Locked="false" Priority="64" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium Shading 2 Accent 4"/>
+<w:LsdException Locked="false" Priority="65" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium List 1 Accent 4"/>
+<w:LsdException Locked="false" Priority="66" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium List 2 Accent 4"/>
+<w:LsdException Locked="false" Priority="67" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium Grid 1 Accent 4"/>
+<w:LsdException Locked="false" Priority="68" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium Grid 2 Accent 4"/>
+<w:LsdException Locked="false" Priority="69" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium Grid 3 Accent 4"/>
+<w:LsdException Locked="false" Priority="70" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Dark List Accent 4"/>
+<w:LsdException Locked="false" Priority="71" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Colorful Shading Accent 4"/>
+<w:LsdException Locked="false" Priority="72" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Colorful List Accent 4"/>
+<w:LsdException Locked="false" Priority="73" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Colorful Grid Accent 4"/>
+<w:LsdException Locked="false" Priority="60" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Light Shading Accent 5"/>
+<w:LsdException Locked="false" Priority="61" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Light List Accent 5"/>
+<w:LsdException Locked="false" Priority="62" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Light Grid Accent 5"/>
+<w:LsdException Locked="false" Priority="63" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium Shading 1 Accent 5"/>
+<w:LsdException Locked="false" Priority="64" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium Shading 2 Accent 5"/>
+<w:LsdException Locked="false" Priority="65" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium List 1 Accent 5"/>
+<w:LsdException Locked="false" Priority="66" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium List 2 Accent 5"/>
+<w:LsdException Locked="false" Priority="67" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium Grid 1 Accent 5"/>
+<w:LsdException Locked="false" Priority="68" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium Grid 2 Accent 5"/>
+<w:LsdException Locked="false" Priority="69" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium Grid 3 Accent 5"/>
+<w:LsdException Locked="false" Priority="70" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Dark List Accent 5"/>
+<w:LsdException Locked="false" Priority="71" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Colorful Shading Accent 5"/>
+<w:LsdException Locked="false" Priority="72" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Colorful List Accent 5"/>
+<w:LsdException Locked="false" Priority="73" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Colorful Grid Accent 5"/>
+<w:LsdException Locked="false" Priority="60" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Light Shading Accent 6"/>
+<w:LsdException Locked="false" Priority="61" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Light List Accent 6"/>
+<w:LsdException Locked="false" Priority="62" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Light Grid Accent 6"/>
+<w:LsdException Locked="false" Priority="63" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium Shading 1 Accent 6"/>
+<w:LsdException Locked="false" Priority="64" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium Shading 2 Accent 6"/>
+<w:LsdException Locked="false" Priority="65" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium List 1 Accent 6"/>
+<w:LsdException Locked="false" Priority="66" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium List 2 Accent 6"/>
+<w:LsdException Locked="false" Priority="67" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium Grid 1 Accent 6"/>
+<w:LsdException Locked="false" Priority="68" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium Grid 2 Accent 6"/>
+<w:LsdException Locked="false" Priority="69" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Medium Grid 3 Accent 6"/>
+<w:LsdException Locked="false" Priority="70" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Dark List Accent 6"/>
+<w:LsdException Locked="false" Priority="71" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Colorful Shading Accent 6"/>
+<w:LsdException Locked="false" Priority="72" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Colorful List Accent 6"/>
+<w:LsdException Locked="false" Priority="73" SemiHidden="false"
+ UnhideWhenUsed="false" Name="Colorful Grid Accent 6"/>
+<w:LsdException Locked="false" Priority="19" SemiHidden="false"
+ UnhideWhenUsed="false" QFormat="true" Name="Subtle Emphasis"/>
+<w:LsdException Locked="false" Priority="21" SemiHidden="false"
+ UnhideWhenUsed="false" QFormat="true" Name="Intense Emphasis"/>
+<w:LsdException Locked="false" Priority="31" SemiHidden="false"
+ UnhideWhenUsed="false" QFormat="true" Name="Subtle Reference"/>
+<w:LsdException Locked="false" Priority="32" SemiHidden="false"
+ UnhideWhenUsed="false" QFormat="true" Name="Intense Reference"/>
+<w:LsdException Locked="false" Priority="33" SemiHidden="false"
+ UnhideWhenUsed="false" QFormat="true" Name="Book Title"/>
+<w:LsdException Locked="false" Priority="37" Name="Bibliography"/>
+<w:LsdException Locked="false" Priority="39" QFormat="true" Name="TOC Heading"/>
+</w:LatentStyles>
+</xml><![endif]-->
+<style>
+<!--
+ /* Font Definitions */
+@font-face
+{font-family:"Cambria Math";
+ panose-1:2 4 5 3 5 4 6 3 2 4;
+ mso-font-charset:1;
+ mso-generic-font-family:roman;
+ mso-font-format:other;
+ mso-font-pitch:variable;
+ mso-font-signature:0 0 0 0 0 0;}
+@font-face
+{font-family:Cambria;
+ panose-1:2 4 5 3 5 4 6 3 2 4;
+ mso-font-charset:0;
+ mso-generic-font-family:roman;
+ mso-font-pitch:variable;
+ mso-font-signature:-536870145 1073743103 0 0 415 0;}
+@font-face
+{font-family:Calibri;
+ panose-1:2 15 5 2 2 2 4 3 2 4;
+ mso-font-charset:0;
+ mso-generic-font-family:swiss;
+ mso-font-pitch:variable;
+ mso-font-signature:-520092929 1073786111 9 0 415 0;}
+ /* Style Definitions */
+p.MsoNormal, li.MsoNormal, div.MsoNormal
+{mso-style-unhide:no;
+ mso-style-qformat:yes;
+ mso-style-parent:"";
+ margin-top:0in;
+ margin-right:0in;
+ margin-bottom:10.0pt;
+ margin-left:0in;
+ line-height:115%;
+ mso-pagination:widow-orphan;
+ font-size:11.0pt;
+ font-family:"Calibri","sans-serif";
+ mso-ascii-font-family:Calibri;
+ mso-ascii-theme-font:minor-latin;
+ mso-fareast-font-family:Calibri;
+ mso-fareast-theme-font:minor-latin;
+ mso-hansi-font-family:Calibri;
+ mso-hansi-theme-font:minor-latin;
+ mso-bidi-font-family:"Times New Roman";
+ mso-bidi-theme-font:minor-bidi;}
+h1
+{mso-style-priority:9;
+ mso-style-unhide:no;
+ mso-style-qformat:yes;
+ mso-style-link:"Heading 1 Char";
+ mso-style-next:Normal;
+ margin-top:24.0pt;
+ margin-right:0in;
+ margin-bottom:0in;
+ margin-left:0in;
+ margin-bottom:.0001pt;
+ line-height:115%;
+ mso-pagination:widow-orphan lines-together;
+ page-break-after:avoid;
+ mso-outline-level:1;
+ font-size:14.0pt;
+ font-family:"Cambria","serif";
+ mso-ascii-font-family:Cambria;
+ mso-ascii-theme-font:major-latin;
+ mso-fareast-font-family:"Times New Roman";
+ mso-fareast-theme-font:major-fareast;
+ mso-hansi-font-family:Cambria;
+ mso-hansi-theme-font:major-latin;
+ mso-bidi-font-family:"Times New Roman";
+ mso-bidi-theme-font:major-bidi;
+ color:#365F91;
+ mso-themecolor:accent1;
+ mso-themeshade:191;
+ mso-font-kerning:0pt;}
+p.MsoTitle, li.MsoTitle, div.MsoTitle
+{mso-style-priority:10;
+ mso-style-unhide:no;
+ mso-style-qformat:yes;
+ mso-style-link:"Title Char";
+ mso-style-next:Normal;
+ margin-top:0in;
+ margin-right:0in;
+ margin-bottom:15.0pt;
+ margin-left:0in;
+ mso-add-space:auto;
+ mso-pagination:widow-orphan;
+ border:none;
+ mso-border-bottom-alt:solid #4F81BD 1.0pt;
+ mso-border-bottom-themecolor:accent1;
+ padding:0in;
+ mso-padding-alt:0in 0in 4.0pt 0in;
+ font-size:26.0pt;
+ font-family:"Cambria","serif";
+ mso-ascii-font-family:Cambria;
+ mso-ascii-theme-font:major-latin;
+ mso-fareast-font-family:"Times New Roman";
+ mso-fareast-theme-font:major-fareast;
+ mso-hansi-font-family:Cambria;
+ mso-hansi-theme-font:major-latin;
+ mso-bidi-font-family:"Times New Roman";
+ mso-bidi-theme-font:major-bidi;
+ color:#17365D;
+ mso-themecolor:text2;
+ mso-themeshade:191;
+ letter-spacing:.25pt;
+ mso-font-kerning:14.0pt;}
+p.MsoTitleCxSpFirst, li.MsoTitleCxSpFirst, div.MsoTitleCxSpFirst
+{mso-style-priority:10;
+ mso-style-unhide:no;
+ mso-style-qformat:yes;
+ mso-style-link:"Title Char";
+ mso-style-next:Normal;
+ mso-style-type:export-only;
+ margin:0in;
+ margin-bottom:.0001pt;
+ mso-add-space:auto;
+ mso-pagination:widow-orphan;
+ border:none;
+ mso-border-bottom-alt:solid #4F81BD 1.0pt;
+ mso-border-bottom-themecolor:accent1;
+ padding:0in;
+ mso-padding-alt:0in 0in 4.0pt 0in;
+ font-size:26.0pt;
+ font-family:"Cambria","serif";
+ mso-ascii-font-family:Cambria;
+ mso-ascii-theme-font:major-latin;
+ mso-fareast-font-family:"Times New Roman";
+ mso-fareast-theme-font:major-fareast;
+ mso-hansi-font-family:Cambria;
+ mso-hansi-theme-font:major-latin;
+ mso-bidi-font-family:"Times New Roman";
+ mso-bidi-theme-font:major-bidi;
+ color:#17365D;
+ mso-themecolor:text2;
+ mso-themeshade:191;
+ letter-spacing:.25pt;
+ mso-font-kerning:14.0pt;}
+p.MsoTitleCxSpMiddle, li.MsoTitleCxSpMiddle, div.MsoTitleCxSpMiddle
+{mso-style-priority:10;
+ mso-style-unhide:no;
+ mso-style-qformat:yes;
+ mso-style-link:"Title Char";
+ mso-style-next:Normal;
+ mso-style-type:export-only;
+ margin:0in;
+ margin-bottom:.0001pt;
+ mso-add-space:auto;
+ mso-pagination:widow-orphan;
+ border:none;
+ mso-border-bottom-alt:solid #4F81BD 1.0pt;
+ mso-border-bottom-themecolor:accent1;
+ padding:0in;
+ mso-padding-alt:0in 0in 4.0pt 0in;
+ font-size:26.0pt;
+ font-family:"Cambria","serif";
+ mso-ascii-font-family:Cambria;
+ mso-ascii-theme-font:major-latin;
+ mso-fareast-font-family:"Times New Roman";
+ mso-fareast-theme-font:major-fareast;
+ mso-hansi-font-family:Cambria;
+ mso-hansi-theme-font:major-latin;
+ mso-bidi-font-family:"Times New Roman";
+ mso-bidi-theme-font:major-bidi;
+ color:#17365D;
+ mso-themecolor:text2;
+ mso-themeshade:191;
+ letter-spacing:.25pt;
+ mso-font-kerning:14.0pt;}
+p.MsoTitleCxSpLast, li.MsoTitleCxSpLast, div.MsoTitleCxSpLast
+{mso-style-priority:10;
+ mso-style-unhide:no;
+ mso-style-qformat:yes;
+ mso-style-link:"Title Char";
+ mso-style-next:Normal;
+ mso-style-type:export-only;
+ margin-top:0in;
+ margin-right:0in;
+ margin-bottom:15.0pt;
+ margin-left:0in;
+ mso-add-space:auto;
+ mso-pagination:widow-orphan;
+ border:none;
+ mso-border-bottom-alt:solid #4F81BD 1.0pt;
+ mso-border-bottom-themecolor:accent1;
+ padding:0in;
+ mso-padding-alt:0in 0in 4.0pt 0in;
+ font-size:26.0pt;
+ font-family:"Cambria","serif";
+ mso-ascii-font-family:Cambria;
+ mso-ascii-theme-font:major-latin;
+ mso-fareast-font-family:"Times New Roman";
+ mso-fareast-theme-font:major-fareast;
+ mso-hansi-font-family:Cambria;
+ mso-hansi-theme-font:major-latin;
+ mso-bidi-font-family:"Times New Roman";
+ mso-bidi-theme-font:major-bidi;
+ color:#17365D;
+ mso-themecolor:text2;
+ mso-themeshade:191;
+ letter-spacing:.25pt;
+ mso-font-kerning:14.0pt;}
+span.TitleChar
+{mso-style-name:"Title Char";
+ mso-style-priority:10;
+ mso-style-unhide:no;
+ mso-style-locked:yes;
+ mso-style-link:Title;
+ mso-ansi-font-size:26.0pt;
+ mso-bidi-font-size:26.0pt;
+ font-family:"Cambria","serif";
+ mso-ascii-font-family:Cambria;
+ mso-ascii-theme-font:major-latin;
+ mso-fareast-font-family:"Times New Roman";
+ mso-fareast-theme-font:major-fareast;
+ mso-hansi-font-family:Cambria;
+ mso-hansi-theme-font:major-latin;
+ mso-bidi-font-family:"Times New Roman";
+ mso-bidi-theme-font:major-bidi;
+ color:#17365D;
+ mso-themecolor:text2;
+ mso-themeshade:191;
+ letter-spacing:.25pt;
+ mso-font-kerning:14.0pt;}
+span.Heading1Char
+{mso-style-name:"Heading 1 Char";
+ mso-style-priority:9;
+ mso-style-unhide:no;
+ mso-style-locked:yes;
+ mso-style-link:"Heading 1";
+ mso-ansi-font-size:14.0pt;
+ mso-bidi-font-size:14.0pt;
+ font-family:"Cambria","serif";
+ mso-ascii-font-family:Cambria;
+ mso-ascii-theme-font:major-latin;
+ mso-fareast-font-family:"Times New Roman";
+ mso-fareast-theme-font:major-fareast;
+ mso-hansi-font-family:Cambria;
+ mso-hansi-theme-font:major-latin;
+ mso-bidi-font-family:"Times New Roman";
+ mso-bidi-theme-font:major-bidi;
+ color:#365F91;
+ mso-themecolor:accent1;
+ mso-themeshade:191;
+ font-weight:bold;}
+.MsoChpDefault
+{mso-style-type:export-only;
+ mso-default-props:yes;
+ font-family:"Calibri","sans-serif";
+ mso-ascii-font-family:Calibri;
+ mso-ascii-theme-font:minor-latin;
+ mso-fareast-font-family:Calibri;
+ mso-fareast-theme-font:minor-latin;
+ mso-hansi-font-family:Calibri;
+ mso-hansi-theme-font:minor-latin;
+ mso-bidi-font-family:"Times New Roman";
+ mso-bidi-theme-font:minor-bidi;}
+.MsoPapDefault
+{mso-style-type:export-only;
+ margin-bottom:10.0pt;
+ line-height:115%;}
+@page WordSection1
+{size:8.5in 11.0in;
+ margin:1.0in 1.0in 1.0in 1.0in;
+ mso-header-margin:.5in;
+ mso-footer-margin:.5in;
+ mso-paper-source:0;}
+div.WordSection1
+{page:WordSection1;}
+-->
+</style>
+<!--[if gte mso 10]>
+<style>
+ /* Style Definitions */
+ table.MsoNormalTable
+ {mso-style-name:"Table Normal";
+ mso-tstyle-rowband-size:0;
+ mso-tstyle-colband-size:0;
+ mso-style-noshow:yes;
+ mso-style-priority:99;
+ mso-style-parent:"";
+ mso-padding-alt:0in 5.4pt 0in 5.4pt;
+ mso-para-margin-top:0in;
+ mso-para-margin-right:0in;
+ mso-para-margin-bottom:10.0pt;
+ mso-para-margin-left:0in;
+ line-height:115%;
+ mso-pagination:widow-orphan;
+ font-size:11.0pt;
+ font-family:"Calibri","sans-serif";
+ mso-ascii-font-family:Calibri;
+ mso-ascii-theme-font:minor-latin;
+ mso-hansi-font-family:Calibri;
+ mso-hansi-theme-font:minor-latin;
+ mso-bidi-font-family:"Times New Roman";
+ mso-bidi-theme-font:minor-bidi;}
+</style>
+<![endif]--><!--[if gte mso 9]><xml>
+ <o:shapedefaults v:ext="edit" spidmax="1026"/>
+</xml><![endif]--><!--[if gte mso 9]><xml>
+ <o:shapelayout v:ext="edit">
+ <o:idmap v:ext="edit" data="1"/>
+ </o:shapelayout></xml><![endif]-->
+</head>
+
+<body lang=EN-US style='tab-interval:.5in'>
+
+<div class=WordSection1>
+
+ <div style='mso-element:para-border-div;border:none;border-bottom:solid #4F81BD 1.0pt;
+mso-border-bottom-themecolor:accent1;padding:0in 0in 4.0pt 0in'>
+
+ <p class=MsoTitle>This is a test</p>
+
+ </div>
+
+</div>
+
+</body>
+
+</html>
+
Modified: lucene/dev/branches/branch_3x/lucene/contrib/icu/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/icu/build.xml?rev=1235308&r1=1235307&r2=1235308&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/icu/build.xml (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/icu/build.xml Tue Jan 24 15:51:55 2012
@@ -103,7 +103,24 @@ are part of the ICU4C package. See http:
</assertions>
</java>
</target>
-
+
+ <property name="html.strip.charfilter.supp.macros.output.file"
+ location="../analyzers/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro"/>
+
+ <!-- Runs GenerateHTMLStripCharFilterSupplementaryMacros in a forked JVM, after
+ compile-tools has run, and writes the tool's output to
+ ${html.strip.charfilter.supp.macros.output.file}. failonerror="true" makes the
+ build fail if the tool fails. -->
+ <target name="gen-html-strip-charfilter-supp-macros" depends="compile-tools">
+ <java
+ classname="org.apache.lucene.analysis.icu.GenerateHTMLStripCharFilterSupplementaryMacros"
+ dir="."
+ fork="true"
+ failonerror="true"
+ output="${html.strip.charfilter.supp.macros.output.file}">
+ <classpath>
+ <path refid="additional.dependencies"/>
+ <pathelement location="${build.dir}/classes/tools"/>
+ </classpath>
+ </java>
+ </target>
+
<target name="compile-tools" depends="common.compile-tools">
<compile
srcdir="src/tools/java"
Added: lucene/dev/branches/branch_3x/lucene/contrib/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateHTMLStripCharFilterSupplementaryMacros.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateHTMLStripCharFilterSupplementaryMacros.java?rev=1235308&view=auto
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateHTMLStripCharFilterSupplementaryMacros.java (added)
+++ lucene/dev/branches/branch_3x/lucene/contrib/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateHTMLStripCharFilterSupplementaryMacros.java Tue Jan 24 15:51:55 2012
@@ -0,0 +1,110 @@
+package org.apache.lucene.analysis.icu;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.text.DateFormat;
+import java.util.*;
+
+import com.ibm.icu.text.UnicodeSet;
+import com.ibm.icu.text.UnicodeSetIterator;
+import com.ibm.icu.util.VersionInfo;
+
+/** creates a macro to augment jflex's unicode support for > BMP */
+public class GenerateHTMLStripCharFilterSupplementaryMacros {
+ private static final UnicodeSet BMP = new UnicodeSet("[\u0000-\uFFFF]");
+ private static final String NL = System.getProperty("line.separator");
+ private static final DateFormat DATE_FORMAT = DateFormat.getDateTimeInstance
+ (DateFormat.FULL, DateFormat.FULL, Locale.US);
+ static {
+ DATE_FORMAT.setTimeZone(TimeZone.getTimeZone("UTC"));
+ }
+
+ private static final String APACHE_LICENSE
+ = "/*" + NL
+ + " * Copyright 2010 The Apache Software Foundation." + NL
+ + " *" + NL
+ + " * Licensed under the Apache License, Version 2.0 (the \"License\");" + NL
+ + " * you may not use this file except in compliance with the License." + NL
+ + " * You may obtain a copy of the License at" + NL
+ + " *" + NL
+ + " * http://www.apache.org/licenses/LICENSE-2.0" + NL
+ + " *" + NL
+ + " * Unless required by applicable law or agreed to in writing, software" + NL
+ + " * distributed under the License is distributed on an \"AS IS\" BASIS," + NL
+ + " * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." + NL
+ + " * See the License for the specific language governing permissions and" + NL
+ + " * limitations under the License." + NL
+ + " */" + NL + NL;
+
+
+ public static void main(String args[]) throws Exception {
+ outputHeader();
+ outputMacro("ID_Start_Supp", "[:ID_Start:]");
+ outputMacro("ID_Continue_Supp", "[:ID_Continue:]");
+ }
+
+ static void outputHeader() {
+ System.out.print(APACHE_LICENSE);
+ System.out.print("// Generated using ICU4J " + VersionInfo.ICU_VERSION.toString() + " on ");
+ System.out.println(DATE_FORMAT.format(new Date()));
+ System.out.println("// by " + GenerateHTMLStripCharFilterSupplementaryMacros.class.getName());
+ System.out.print(NL + NL);
+ }
+
+ // we have to carefully output the possibilities as compact utf-16
+ // range expressions, or jflex will OOM!
+ static void outputMacro(String name, String pattern) {
+ UnicodeSet set = new UnicodeSet(pattern);
+ set.removeAll(BMP);
+ System.out.println(name + " = (");
+ // if the set is empty, we have to do this or jflex will barf
+ if (set.isEmpty()) {
+ System.out.println("\t []");
+ }
+
+ HashMap<Character,UnicodeSet> utf16ByLead = new HashMap<Character,UnicodeSet>();
+ for (UnicodeSetIterator it = new UnicodeSetIterator(set); it.next();) {
+ char utf16[] = Character.toChars(it.codepoint);
+ UnicodeSet trails = utf16ByLead.get(utf16[0]);
+ if (trails == null) {
+ trails = new UnicodeSet();
+ utf16ByLead.put(utf16[0], trails);
+ }
+ trails.add(utf16[1]);
+ }
+
+ Map<String,UnicodeSet> utf16ByTrail = new HashMap<String,UnicodeSet>();
+ for (Map.Entry<Character,UnicodeSet> entry : utf16ByLead.entrySet()) {
+ String trail = entry.getValue().getRegexEquivalent();
+ UnicodeSet leads = utf16ByTrail.get(trail);
+ if (leads == null) {
+ leads = new UnicodeSet();
+ utf16ByTrail.put(trail, leads);
+ }
+ leads.add(entry.getKey());
+ }
+
+ boolean isFirst = true;
+ for (Map.Entry<String,UnicodeSet> entry : utf16ByTrail.entrySet()) {
+ System.out.print( isFirst ? "\t " : "\t| ");
+ isFirst = false;
+ System.out.println(entry.getValue().getRegexEquivalent() + entry.getKey());
+ }
+ System.out.println(")");
+ }
+}
Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/BaseCharFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/BaseCharFilter.java?rev=1235308&r1=1235307&r2=1235308&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/BaseCharFilter.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/BaseCharFilter.java Tue Jan 24 15:51:55 2012
@@ -19,11 +19,46 @@ package org.apache.lucene.analysis;
import org.apache.lucene.util.ArrayUtil;
+import java.util.Arrays;
+
/**
- * Base utility class for implementing a {@link CharFilter}.
- * You subclass this, and then record mappings by calling
- * {@link #addOffCorrectMap}, and then invoke the correct
- * method to correct an offset.
+ * <p>
+ * Base utility class for implementing a {@link CharFilter}.
+ * You subclass this, and then record mappings by calling
+ * {@link #addOffCorrectMap}, and then invoke the correct
+ * method to correct an offset.
+ * </p>
+ * <p>
+ * CharFilters modify an input stream via a series of substring
+ * replacements (including deletions and insertions) to produce an output
+ * stream. There are three possible replacement cases: the replacement
+ * string has the same length as the original substring; the replacement
+ * is shorter; and the replacement is longer. In the latter two cases
+ * (when the replacement has a different length than the original),
+ * one or more offset correction mappings are required.
+ * </p>
+ * <p>
+ * When the replacement is shorter than the original (e.g. when the
+ * replacement is the empty string), a single offset correction mapping
+ * should be added at the replacement's end offset in the output stream.
+ * The <code>cumulativeDiff</code> parameter to the
+ * <code>addOffCorrectMap()</code> method will be the sum of all
+ * previous replacement offset adjustments, with the addition of the
+ * difference between the lengths of the original substring and the
+ * replacement string (a positive value).
+ * </p>
+ * <p>
+ * When the replacement is longer than the original (e.g. when the
+ * original is the empty string), you should add as many offset
+ * correction mappings as the difference between the lengths of the
+ * replacement string and the original substring, starting at the
+ * end offset the original substring would have had in the output stream.
+ * The <code>cumulativeDiff</code> parameter to the
+ * <code>addOffCorrectMap()</code> method will be the sum of all
+ * previous replacement offset adjustments, with the addition of the
+ * difference between the lengths of the original substring and the
+ * replacement string so far (a negative value).
+ * </p>
*/
public abstract class BaseCharFilter extends CharFilter {
@@ -70,6 +105,19 @@ public abstract class BaseCharFilter ext
0 : diffs[size-1];
}
+ /**
+ * <p>
+ * Adds an offset correction mapping at the given output stream offset.
+ * </p>
+ * <p>
+ * Assumption: the offset given with each successive call to this method
+ * will not be smaller than the offset given at the previous invocation.
+ * </p>
+ *
+ * @param off The output stream offset at which to apply the correction
+ * @param cumulativeDiff The input offset is given by adding this
+ * to the output offset
+ */
protected void addOffCorrectMap(int off, int cumulativeDiff) {
if (offsets == null) {
offsets = new int[64];
@@ -79,7 +127,15 @@ public abstract class BaseCharFilter ext
diffs = ArrayUtil.grow(diffs);
}
- offsets[size] = off;
- diffs[size++] = cumulativeDiff;
+ assert (size == 0 || off >= offsets[size - 1]) // compare with the last recorded slot, not the unwritten one
+ : "Offset #" + size + "(" + off + ") is less than the last recorded offset "
+ + offsets[size - 1] + "\n" + Arrays.toString(offsets) + "\n" + Arrays.toString(diffs);
+
+ if (size == 0 || off != offsets[size - 1]) { // New offset: append a mapping
+ offsets[size] = off;
+ diffs[size++] = cumulativeDiff;
+ } else { // Overwrite the diff at the last recorded offset
+ diffs[size - 1] = cumulativeDiff;
+ }
}
}
Modified: lucene/dev/branches/branch_3x/lucene/src/test-framework/java/org/apache/lucene/util/_TestUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/test-framework/java/org/apache/lucene/util/_TestUtil.java?rev=1235308&r1=1235307&r2=1235308&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/test-framework/java/org/apache/lucene/util/_TestUtil.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/test-framework/java/org/apache/lucene/util/_TestUtil.java Tue Jan 24 15:51:55 2012
@@ -266,7 +266,42 @@ public class _TestUtil {
}
}
- // TODO: make this more evil
+ private static final String[] HTML_CHAR_ENTITIES = { // bare entity names, without the leading "&" or trailing ";"
+ "AElig", "Aacute", "Acirc", "Agrave", "Alpha", "AMP", "Aring", "Atilde",
+ "Auml", "Beta", "COPY", "Ccedil", "Chi", "Dagger", "Delta", "ETH",
+ "Eacute", "Ecirc", "Egrave", "Epsilon", "Eta", "Euml", "Gamma", "GT",
+ "Iacute", "Icirc", "Igrave", "Iota", "Iuml", "Kappa", "Lambda", "LT",
+ "Mu", "Ntilde", "Nu", "OElig", "Oacute", "Ocirc", "Ograve", "Omega",
+ "Omicron", "Oslash", "Otilde", "Ouml", "Phi", "Pi", "Prime", "Psi",
+ "QUOT", "REG", "Rho", "Scaron", "Sigma", "THORN", "Tau", "Theta",
+ "Uacute", "Ucirc", "Ugrave", "Upsilon", "Uuml", "Xi", "Yacute", "Yuml",
+ "Zeta", "aacute", "acirc", "acute", "aelig", "agrave", "alefsym",
+ "alpha", "amp", "and", "ang", "apos", "aring", "asymp", "atilde",
+ "auml", "bdquo", "beta", "brvbar", "bull", "cap", "ccedil", "cedil",
+ "cent", "chi", "circ", "clubs", "cong", "copy", "crarr", "cup",
+ "curren", "dArr", "dagger", "darr", "deg", "delta", "diams", "divide",
+ "eacute", "ecirc", "egrave", "empty", "emsp", "ensp", "epsilon",
+ "equiv", "eta", "eth", "euml", "euro", "exist", "fnof", "forall",
+ "frac12", "frac14", "frac34", "frasl", "gamma", "ge", "gt", "hArr",
+ "harr", "hearts", "hellip", "iacute", "icirc", "iexcl", "igrave",
+ "image", "infin", "int", "iota", "iquest", "isin", "iuml", "kappa",
+ "lArr", "lambda", "lang", "laquo", "larr", "lceil", "ldquo", "le",
+ "lfloor", "lowast", "loz", "lrm", "lsaquo", "lsquo", "lt", "macr",
+ "mdash", "micro", "middot", "minus", "mu", "nabla", "nbsp", "ndash",
+ "ne", "ni", "not", "notin", "nsub", "ntilde", "nu", "oacute", "ocirc",
+ "oelig", "ograve", "oline", "omega", "omicron", "oplus", "or", "ordf",
+ "ordm", "oslash", "otilde", "otimes", "ouml", "para", "part", "permil",
+ "perp", "phi", "pi", "piv", "plusmn", "pound", "prime", "prod", "prop",
+ "psi", "quot", "rArr", "radic", "rang", "raquo", "rarr", "rceil",
+ "rdquo", "real", "reg", "rfloor", "rho", "rlm", "rsaquo", "rsquo",
+ "sbquo", "scaron", "sdot", "sect", "shy", "sigma", "sigmaf", "sim",
+ "spades", "sub", "sube", "sum", "sup", "sup1", "sup2", "sup3", "supe",
+ "szlig", "tau", "there4", "theta", "thetasym", "thinsp", "thorn",
+ "tilde", "times", "trade", "uArr", "uacute", "uarr", "ucirc", "ugrave",
+ "uml", "upsih", "upsilon", "uuml", "weierp", "xi", "yacute", "yen",
+ "yuml", "zeta", "zwj", "zwnj"
+ };
+
public static String randomHtmlishString(Random random, int numElements) {
final int end = random.nextInt(numElements);
if (end == 0) {
@@ -275,17 +310,80 @@ public class _TestUtil {
}
StringBuilder sb = new StringBuilder();
for (int i = 0; i < end; i++) {
- int val = random.nextInt(10);
+ int val = random.nextInt(25); // 0-22 have explicit cases; 23 and 24 take the default
switch(val) {
case 0: sb.append("<p>"); break;
- case 1: sb.append("</p>"); break;
- case 2: sb.append("<!--"); break;
- case 3: sb.append("-->"); break;
- case 4: sb.append("&#"); break;
- case 5: sb.append(";"); break;
- case 6: sb.append((char)_TestUtil.nextInt(random, '0', '9')); break;
- default:
- sb.append((char)_TestUtil.nextInt(random, 'a', 'z'));
+ case 1: { // opening tag: padding, name, random attributes, optional "/" and ">"
+ sb.append("<");
+ sb.append("    ".substring(nextInt(random, 0, 4))); // 0-4 spaces; literal must be 4 long
+ sb.append(randomSimpleString(random));
+ for (int j = 0 ; j < nextInt(random, 0, 10) ; ++j) {
+ sb.append(' ');
+ sb.append(randomSimpleString(random));
+ sb.append(" ".substring(nextInt(random, 0, 1)));
+ sb.append('=');
+ sb.append(" ".substring(nextInt(random, 0, 1)));
+ sb.append("\"".substring(nextInt(random, 0, 1)));
+ sb.append(randomSimpleString(random));
+ sb.append("\"".substring(nextInt(random, 0, 1)));
+ }
+ sb.append("    ".substring(nextInt(random, 0, 4)));
+ sb.append("/".substring(nextInt(random, 0, 1)));
+ sb.append(">".substring(nextInt(random, 0, 1)));
+ break;
+ }
+ case 2: { // closing tag with optional padding and optional ">"
+ sb.append("</");
+ sb.append("    ".substring(nextInt(random, 0, 4)));
+ sb.append(randomSimpleString(random));
+ sb.append("    ".substring(nextInt(random, 0, 4)));
+ sb.append(">".substring(nextInt(random, 0, 1)));
+ break;
+ }
+ case 3: sb.append(">"); break;
+ case 4: sb.append("</p>"); break;
+ case 5: sb.append("<!--"); break;
+ case 6: sb.append("<!--#"); break;
+ case 7: sb.append("<script><!-- f('"); break;
+ case 8: sb.append("</script>"); break;
+ case 9: sb.append("<?"); break;
+ case 10: sb.append("?>"); break;
+ case 11: sb.append("\""); break;
+ case 12: sb.append("\\\""); break;
+ case 13: sb.append("'"); break;
+ case 14: sb.append("\\'"); break;
+ case 15: sb.append("-->"); break;
+ case 16: { // "&" followed by a random name, a known entity name, or nothing
+ sb.append("&");
+ switch(nextInt(random, 0, 2)) {
+ case 0: sb.append(randomSimpleString(random)); break;
+ case 1: sb.append(HTML_CHAR_ENTITIES[random.nextInt(HTML_CHAR_ENTITIES.length)]); break;
+ }
+ sb.append(";".substring(nextInt(random, 0, 1)));
+ break;
+ }
+ case 17: { // decimal numeric entity, or just the "&#" prefix
+ sb.append("&#");
+ if (0 == nextInt(random, 0, 1)) {
+ sb.append(nextInt(random, 0, Integer.MAX_VALUE - 1));
+ sb.append(";".substring(nextInt(random, 0, 1)));
+ }
+ break;
+ }
+ case 18: { // hexadecimal numeric entity, or just the "&#x" prefix
+ sb.append("&#x");
+ if (0 == nextInt(random, 0, 1)) {
+ sb.append(Integer.toString(nextInt(random, 0, Integer.MAX_VALUE - 1), 16));
+ sb.append(";".substring(nextInt(random, 0, 1)));
+ }
+ break;
+ }
+
+ case 19: sb.append(";"); break;
+ case 20: sb.append(nextInt(random, 0, Integer.MAX_VALUE - 1)); break;
+ case 21: sb.append("\n"); break; // break added: no fall-through into case 22
+ case 22: sb.append("          ".substring(nextInt(random, 0, 10))); break; // 0-10 spaces; break added
+ default: sb.append(randomSimpleString(random));
}
}
return sb.toString();
Modified: lucene/dev/branches/branch_3x/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/CHANGES.txt?rev=1235308&r1=1235307&r2=1235308&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/solr/CHANGES.txt Tue Jan 24 15:51:55 2012
@@ -28,6 +28,14 @@ Upgrading from Solr 3.5
* As doGet() methods in SimplePostTool was changed to static, the client applications of this
class need to be recompiled.
+* In Solr version 3.5 and earlier, HTMLStripCharFilter had known bugs in the
+ character offsets it provided, triggering e.g. exceptions in highlighting.
+ HTMLStripCharFilter has been re-implemented, addressing this and other
+ issues. See the entry for LUCENE-3690 in the Bug Fixes section below for a
+ detailed list of changes. For people who depend on the behavior of
+ HTMLStripCharFilter in Solr version 3.5 and earlier: the old implementation
+ (bugs and all) is preserved as LegacyHTMLStripCharFilter.
+
New Features
----------------------
* SOLR-2904: BinaryUpdateRequestHandler should be able to accept multiple update requests from
@@ -119,6 +127,47 @@ Bug Fixes
* SOLR-2970: CSV ResponseWriter returns fields defined as stored=false in schema (janhoy)
+* LUCENE-3690, LUCENE-2208, SOLR-882, SOLR-42: Re-implemented
+ HTMLStripCharFilter as a JFlex-generated scanner and moved it to
+ lucene/contrib/analyzers/common/. See below for a list of bug fixes and
+ other changes. To get the same behavior as HTMLStripCharFilter in Solr
+ version 3.5 and earlier (including the bugs), use LegacyHTMLStripCharFilter,
+ which is the previous implementation.
+
+ Behavior changes from the previous version:
+
+ - Known offset bugs are fixed.
+ - The "Mark invalid" exceptions reported in SOLR-1283 are no longer
+ triggered (the bug is still present in LegacyHTMLStripCharFilter).
+ - The character entity "&apos;" is now always properly decoded.
+ - More cases of <script> tags are now properly stripped.
+ - CDATA sections are now handled properly.
+ - Valid tag name characters now include the supplementary Unicode characters
+ from Unicode character classes [:ID_Start:] and [:ID_Continue:].
+ - Uppercase character entities "&QUOT;", "&COPY;", "&GT;", "&LT;", "&REG;",
+ and "&AMP;" are now recognized and handled as if they were in lowercase.
+ - The REPLACEMENT CHARACTER U+FFFD is now used to replace numeric character
+ entities for unpaired UTF-16 low and high surrogates (in the range
+ [U+D800-U+DFFF]).
+ - Properly paired numeric character entities for UTF-16 surrogates are now
+ converted to the corresponding code units.
+ - Opening tags with unbalanced quotation marks are now properly stripped.
+ - Literal "<" and ">" characters in opening tags, regardless of whether they
+ appear inside quotation marks, now inhibit recognition (and stripping) of
+ the tags. The only exception to this is for values of event-handler
+ attributes, e.g. "onClick", "onLoad", "onSelect".
+ - A newline '\n' is substituted instead of a space for stripped HTML markup.
+ - Nothing is substituted for opening and closing inline tags - they are
+ simply removed. The list of inline tags is (case insensitively): <a>,
+ <abbr>, <acronym>, <b>, <basefont>, <bdo>, <big>, <cite>, <code>, <dfn>,
+ <em>, <font>, <i>, <img>, <input>, <kbd>, <label>, <q>, <s>, <samp>,
+ <select>, <small>, <span>, <strike>, <strong>, <sub>, <sup>, <textarea>,
+ <tt>, <u>, and <var>.
+ - HTMLStripCharFilterFactory now handles HTMLStripCharFilter's "escapedTags"
+ feature: opening and closing tags with the given names, including any
+ attributes and their values, are left intact in the output.
+ (Steve Rowe)
+
* LUCENE-3717: Fixed offset bugs in TrimFilter, WordDelimiterFilter, and
HyphenatedWordsFilter where they would create invalid offsets in
some situations, leading to problems in highlighting. (Robert Muir)