You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2012/01/22 06:20:47 UTC
svn commit: r1234452 [5/5] - in /lucene/dev/trunk: lucene/
lucene/src/test-framework/java/org/apache/lucene/util/
modules/analysis/common/
modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/
modules/analysis/common/src/test/org/apac...
Copied: lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/LegacyHTMLStripCharFilterTest.java (from r1233197, lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java)
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/LegacyHTMLStripCharFilterTest.java?p2=lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/LegacyHTMLStripCharFilterTest.java&p1=lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java&r1=1233197&r2=1234452&rev=1234452&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java (original)
+++ lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/LegacyHTMLStripCharFilterTest.java Sun Jan 22 05:20:46 2012
@@ -1,4 +1,4 @@
-package org.apache.lucene.analysis.charfilter;
+package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -31,9 +31,10 @@ import org.apache.lucene.analysis.BaseTo
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.util._TestUtil;
import org.junit.Ignore;
-public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
+public class LegacyHTMLStripCharFilterTest extends BaseTokenStreamTestCase {
//this is some text here is a link and another link . This is an entity: & plus a <. Here is an &
//
@@ -44,7 +45,7 @@ public class HTMLStripCharFilterTest ext
String gold = " this is some text here is a link and " +
"another link . " +
"This is an entity: & plus a <. Here is an &. ";
- HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new StringReader(html)));
+ LegacyHTMLStripCharFilter reader = new LegacyHTMLStripCharFilter(CharReader.get(new StringReader(html)));
StringBuilder builder = new StringBuilder();
int ch = -1;
char [] goldArray = gold.toCharArray();
@@ -62,7 +63,7 @@ public class HTMLStripCharFilterTest ext
//Some sanity checks, but not a full-fledged check
public void testHTML() throws Exception {
InputStream stream = getClass().getResourceAsStream("htmlStripReaderTest.html");
- HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new InputStreamReader(stream, "UTF-8")));
+ LegacyHTMLStripCharFilter reader = new LegacyHTMLStripCharFilter(CharReader.get(new InputStreamReader(stream, "UTF-8")));
StringBuilder builder = new StringBuilder();
int ch = -1;
while ((ch = reader.read()) != -1){
@@ -82,7 +83,7 @@ public class HTMLStripCharFilterTest ext
String gold = "\u0393";
Set<String> set = new HashSet<String>();
set.add("reserved");
- Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
+ Reader reader = new LegacyHTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
StringBuilder builder = new StringBuilder();
int ch = 0;
while ((ch = reader.read()) != -1){
@@ -99,7 +100,7 @@ public class HTMLStripCharFilterTest ext
String gold = " <foo> \u00DCbermensch = \u0393 bar \u0393";
Set<String> set = new HashSet<String>();
set.add("reserved");
- Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
+ Reader reader = new LegacyHTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
StringBuilder builder = new StringBuilder();
int ch = 0;
while ((ch = reader.read()) != -1){
@@ -116,7 +117,7 @@ public class HTMLStripCharFilterTest ext
String gold = " <junk/> ! @ and â";
Set<String> set = new HashSet<String>();
set.add("reserved");
- Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
+ Reader reader = new LegacyHTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
StringBuilder builder = new StringBuilder();
int ch = 0;
while ((ch = reader.read()) != -1){
@@ -132,7 +133,7 @@ public class HTMLStripCharFilterTest ext
String test = "aaa bbb <reserved ccc=\"ddddd\"> eeee </reserved> ffff <reserved ggg=\"hhhh\"/> <other/>";
Set<String> set = new HashSet<String>();
set.add("reserved");
- Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
+ Reader reader = new LegacyHTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
StringBuilder builder = new StringBuilder();
int ch = 0;
while ((ch = reader.read()) != -1){
@@ -149,7 +150,7 @@ public class HTMLStripCharFilterTest ext
public void testMalformedHTML() throws Exception {
String test = "a <a hr<ef=aa<a>> </close</a>";
String gold = "a <a hr<ef=aa > </close ";
- Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+ Reader reader = new LegacyHTMLStripCharFilter(CharReader.get(new StringReader(test)));
StringBuilder builder = new StringBuilder();
int ch = 0;
while ((ch = reader.read()) != -1){
@@ -162,27 +163,27 @@ public class HTMLStripCharFilterTest ext
}
public void testBufferOverflow() throws Exception {
- StringBuilder testBuilder = new StringBuilder(HTMLStripCharFilter.DEFAULT_READ_AHEAD + 50);
+ StringBuilder testBuilder = new StringBuilder(LegacyHTMLStripCharFilter.DEFAULT_READ_AHEAD + 50);
testBuilder.append("ah<?> ??????");
- appendChars(testBuilder, HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
+ appendChars(testBuilder, LegacyHTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
processBuffer(testBuilder.toString(), "Failed on pseudo proc. instr.");//processing instructions
testBuilder.setLength(0);
testBuilder.append("<!--");//comments
- appendChars(testBuilder, 3*HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);//comments have two lookaheads
+ appendChars(testBuilder, 3*LegacyHTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);//comments have two lookaheads
testBuilder.append("-->foo");
processBuffer(testBuilder.toString(), "Failed w/ comment");
testBuilder.setLength(0);
testBuilder.append("<?");
- appendChars(testBuilder, HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
+ appendChars(testBuilder, LegacyHTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
testBuilder.append("?>");
processBuffer(testBuilder.toString(), "Failed with proc. instr.");
testBuilder.setLength(0);
testBuilder.append("<b ");
- appendChars(testBuilder, HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
+ appendChars(testBuilder, LegacyHTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
testBuilder.append("/>");
processBuffer(testBuilder.toString(), "Failed on tag");
@@ -191,14 +192,14 @@ public class HTMLStripCharFilterTest ext
private void appendChars(StringBuilder testBuilder, int numChars) {
int i1 = numChars / 2;
for (int i = 0; i < i1; i++){
- testBuilder.append('a').append(' ');//tack on enough to go beyond the mark readahead limit, since <?> makes HTMLStripCharFilter think it is a processing instruction
+ testBuilder.append('a').append(' ');//tack on enough to go beyond the mark readahead limit, since <?> makes LegacyHTMLStripCharFilter think it is a processing instruction
}
}
private void processBuffer(String test, String assertMsg) throws IOException {
// System.out.println("-------------------processBuffer----------");
- Reader reader = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(test))));//force the use of BufferedReader
+ Reader reader = new LegacyHTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(test))));//force the use of BufferedReader
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
@@ -215,7 +216,7 @@ public class HTMLStripCharFilterTest ext
String test = "<!--- three dashes, still a valid comment ---> ";
String gold = " ";
- Reader reader = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(test))));//force the use of BufferedReader
+ Reader reader = new LegacyHTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(test))));//force the use of BufferedReader
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
@@ -230,7 +231,7 @@ public class HTMLStripCharFilterTest ext
public void doTestOffsets(String in) throws Exception {
- HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(in))));
+ LegacyHTMLStripCharFilter reader = new LegacyHTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(in))));
int ch = 0;
int off = 0; // offset in the reader
int strOff = -1; // offset in the original string
@@ -267,11 +268,54 @@ public class HTMLStripCharFilterTest ext
@Override
protected Reader initReader(Reader reader) {
- return new HTMLStripCharFilter(CharReader.get(new BufferedReader(reader)));
+ return new LegacyHTMLStripCharFilter(CharReader.get(new BufferedReader(reader)));
}
};
int numRounds = RANDOM_MULTIPLIER * 10000;
checkRandomData(random, analyzer, numRounds);
}
+
+  /**
+   * Smoke test: pushes randomly generated HTML-like text through
+   * {@link LegacyHTMLStripCharFilter} and reads it to exhaustion. Nothing is
+   * asserted about the filtered output; the test fails only if construction
+   * or reading throws.
+   */
+  public void testRandomBrokenHTML() throws Exception {
+    final int elementLimit = 10000;
+    final String htmlish = _TestUtil.randomHtmlishString(random, elementLimit);
+    final Reader filter
+        = new LegacyHTMLStripCharFilter(CharReader.get(new StringReader(htmlish)));
+    int c;
+    do {
+      c = filter.read(); // output is discarded on purpose
+    } while (c != -1);
+  }
+
+  /**
+   * Smoke test: builds a space-separated run of random words and reads it
+   * through {@link LegacyHTMLStripCharFilter} until end of input. Nothing is
+   * asserted about the filtered output; the test fails only if reading throws.
+   *
+   * One word generator is chosen per run from a selector drawn from
+   * {@code _TestUtil.nextInt(random, 0, 4)}: 0 uses randomUnicodeString,
+   * 1 uses randomRealisticUnicodeString, and any other value uses
+   * randomSimpleString.
+   */
+  public void testRandomText() throws Exception {
+    final int minNumWords = 10;
+    final int maxNumWords = 10000;
+    final int minWordLength = 3;
+    final int maxWordLength = 20;
+
+    // Draw order matters for reproducing a seed: word count first, then the selector.
+    final int wordCount = _TestUtil.nextInt(random, minNumWords, maxNumWords);
+    final int flavor = _TestUtil.nextInt(random, 0, 4);
+
+    final StringBuilder words = new StringBuilder();
+    for (int i = 0; i < wordCount; i++) {
+      final String word;
+      if (flavor == 0) {
+        word = _TestUtil.randomUnicodeString(random, maxWordLength);
+      } else if (flavor == 1) {
+        word = _TestUtil.randomRealisticUnicodeString(random, minWordLength, maxWordLength);
+      } else {
+        // NOTE(review): the earlier comment here read "ASCII 50% of the time"; with a
+        // selector drawn from 0..4 that holds only if the upper bound is exclusive.
+        // If both bounds are inclusive this branch is taken for 3 of 5 values - TODO confirm.
+        word = _TestUtil.randomSimpleString(random);
+      }
+      words.append(word).append(' ');
+    }
+
+    final Reader filter = new LegacyHTMLStripCharFilter
+        (CharReader.get(new StringReader(words.toString())));
+    int c;
+    do {
+      c = filter.read(); // output is discarded on purpose
+    } while (c != -1);
+  }
}
Added: lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestHTMLStripCharFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestHTMLStripCharFilterFactory.java?rev=1234452&view=auto
==============================================================================
--- lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestHTMLStripCharFilterFactory.java (added)
+++ lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestHTMLStripCharFilterFactory.java Sun Jan 22 05:20:46 2012
@@ -0,0 +1,130 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.CharReader;
+import org.apache.lucene.analysis.CharStream;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Simple tests to ensure this factory is working
+ */
+public class TestHTMLStripCharFilterFactory extends BaseTokenTestCase {
+
+
+ public void testNothingChanged() throws IOException {
+ // 11111111112
+ // 012345678901234567890
+ final String text = "this is only a test.";
+ HTMLStripCharFilterFactory factory = new HTMLStripCharFilterFactory();
+ Map<String,String> args = new HashMap<String,String>();
+ args.put("escapedTags", "a, Title");
+ factory.init(args);
+ CharStream cs = factory.create(CharReader.get(new StringReader(text)));
+ TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
+ assertTokenStreamContents(ts,
+ new String[] { "this", "is", "only", "a", "test." },
+ new int[] { 0, 5, 8, 13, 15 },
+ new int[] { 4, 7, 12, 14, 20 });
+ }
+
+ public void testNoEscapedTags() throws IOException {
+ // 11111111112222222222333333333344
+ // 012345678901234567890123456789012345678901
+ final String text = "<u>this</u> is <b>only</b> a <I>test</I>.";
+ HTMLStripCharFilterFactory factory = new HTMLStripCharFilterFactory();
+ Map<String,String> args = new HashMap<String,String>();
+ factory.init(args);
+ CharStream cs = factory.create(CharReader.get(new StringReader(text)));
+ TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
+ assertTokenStreamContents(ts,
+ new String[] { "this", "is", "only", "a", "test." },
+ new int[] { 3, 12, 18, 27, 32 },
+ new int[] { 11, 14, 26, 28, 41 });
+ }
+
+ public void testEscapedTags() throws IOException {
+ // 11111111112222222222333333333344
+ // 012345678901234567890123456789012345678901
+ final String text = "<u>this</u> is <b>only</b> a <I>test</I>.";
+ HTMLStripCharFilterFactory factory = new HTMLStripCharFilterFactory();
+ Map<String,String> args = new HashMap<String,String>();
+ args.put("escapedTags", "U i");
+ factory.init(args);
+ CharStream cs = factory.create(CharReader.get(new StringReader(text)));
+ TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
+ assertTokenStreamContents(ts,
+ new String[] { "<u>this</u>", "is", "only", "a", "<I>test</I>." },
+ new int[] { 0, 12, 18, 27, 29 },
+ new int[] { 11, 14, 26, 28, 41 });
+ }
+
+ public void testSeparatorOnlyEscapedTags() throws IOException {
+ // 11111111112222222222333333333344
+ // 012345678901234567890123456789012345678901
+ final String text = "<u>this</u> is <b>only</b> a <I>test</I>.";
+ HTMLStripCharFilterFactory factory = new HTMLStripCharFilterFactory();
+ Map<String,String> args = new HashMap<String,String>();
+ args.put("escapedTags", ",, , ");
+ factory.init(args);
+ CharStream cs = factory.create(CharReader.get(new StringReader(text)));
+ TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
+ assertTokenStreamContents(ts,
+ new String[] { "this", "is", "only", "a", "test." },
+ new int[] { 3, 12, 18, 27, 32 },
+ new int[] { 11, 14, 26, 28, 41 });
+ }
+
+ public void testEmptyEscapedTags() throws IOException {
+ // 11111111112222222222333333333344
+ // 012345678901234567890123456789012345678901
+ final String text = "<u>this</u> is <b>only</b> a <I>test</I>.";
+ HTMLStripCharFilterFactory factory = new HTMLStripCharFilterFactory();
+ Map<String,String> args = new HashMap<String,String>();
+ args.put("escapedTags", "");
+ factory.init(args);
+ CharStream cs = factory.create(CharReader.get(new StringReader(text)));
+ TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
+ assertTokenStreamContents(ts,
+ new String[] { "this", "is", "only", "a", "test." },
+ new int[] { 3, 12, 18, 27, 32 },
+ new int[] { 11, 14, 26, 28, 41 });
+ }
+
+ public void testSingleEscapedTag() throws IOException {
+ // 11111111112222222222333333333344
+ // 012345678901234567890123456789012345678901
+ final String text = "<u>this</u> is <b>only</b> a <I>test</I>.";
+ HTMLStripCharFilterFactory factory = new HTMLStripCharFilterFactory();
+ Map<String,String> args = new HashMap<String,String>();
+ args.put("escapedTags", ", B\r\n\t");
+ factory.init(args);
+ CharStream cs = factory.create(CharReader.get(new StringReader(text)));
+ TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
+ assertTokenStreamContents(ts,
+ new String[] { "this", "is", "<b>only</b>", "a", "test." },
+ new int[] { 3, 12, 15, 27, 32 },
+ new int[] { 11, 14, 26, 28, 41 });
+ }
+}
Modified: lucene/dev/trunk/solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java?rev=1234452&r1=1234451&r2=1234452&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java (original)
+++ lucene/dev/trunk/solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java Sun Jan 22 05:20:46 2012
@@ -326,8 +326,8 @@ public class FieldAnalysisRequestHandler
NamedList indexPart = textType.get("index");
assertNotNull("expecting an index token analysis for field type 'charfilthtmlmap'", indexPart);
- assertEquals(" whátëvêr ", indexPart.get("org.apache.lucene.analysis.charfilter.HTMLStripCharFilter"));
- assertEquals(" whatever ", indexPart.get("org.apache.lucene.analysis.charfilter.MappingCharFilter"));
+ assertEquals("\n\nwhátëvêr\n\n", indexPart.get("org.apache.lucene.analysis.charfilter.HTMLStripCharFilter"));
+ assertEquals("\n\nwhatever\n\n", indexPart.get("org.apache.lucene.analysis.charfilter.MappingCharFilter"));
List<NamedList> tokenList = (List<NamedList>)indexPart.get(MockTokenizer.class.getName());
assertNotNull("Expecting MockTokenizer analysis breakdown", tokenList);