You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2013/12/28 02:14:27 UTC
svn commit: r1553774 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/html/HtmlParser.java
test/java/org/apache/tika/parser/html/HtmlParserTest.java
Author: jukka
Date: Sat Dec 28 01:14:27 2013
New Revision: 1553774
URL: http://svn.apache.org/r1553774
Log:
TIKA-1193: Allow access to HtmlParser's HtmlSchema
Patch by Markus Jelsma
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=1553774&r1=1553773&r2=1553774&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java Sat Dec 28 01:14:27 2013
@@ -92,9 +92,12 @@ public class HtmlParser extends Abstract
org.ccil.cowan.tagsoup.Parser parser =
new org.ccil.cowan.tagsoup.Parser();
+ // Use schema from context or default
+ Schema schema = context.get(Schema.class, HTML_SCHEMA);
+
// TIKA-528: Reuse shared schema to avoid heavy instantiation
parser.setProperty(
- org.ccil.cowan.tagsoup.Parser.schemaProperty, HTML_SCHEMA);
+ org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
// TIKA-599: Shared schema is thread-safe only if bogons are ignored
parser.setFeature(
org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=1553774&r1=1553773&r2=1553774&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Sat Dec 28 01:14:27 2013
@@ -42,7 +42,10 @@ import org.apache.tika.metadata.Metadata
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.LinkContentHandler;
import org.apache.tika.sax.TeeContentHandler;
+import org.ccil.cowan.tagsoup.HTMLSchema;
+import org.ccil.cowan.tagsoup.Schema;
import org.junit.Ignore;
import org.junit.Test;
import org.xml.sax.Attributes;
@@ -890,4 +893,36 @@ public class HtmlParserTest {
assertTrue("testing: " +fileName, content.contains(hit));
}
}
+
+ // TIKA-1193: a Schema supplied through the ParseContext is used by HtmlParser instead of its default schema
+ @Test
+ public void testCustomHtmlSchema() throws Exception {
+ // Default schema does not allow tables inside anchors. NOTE(review): the markup below closes </tr> twice and never closes <td> -- confirm this is intentional
+ String test = "<html><body><a><table><tr><td>text</tr></tr></table></a></body></html>";
+
+ Metadata metadata = new Metadata();
+ LinkContentHandler linkContentHandler = new LinkContentHandler();
+
+ new HtmlParser().parse ( // first pass: empty ParseContext, so no custom Schema is supplied
+ new ByteArrayInputStream(test.getBytes("ISO-8859-1")),
+ linkContentHandler, metadata, new ParseContext());
+
+ // Expect no anchor text: with the default schema the first collected link has an empty text
+ assertEquals("", linkContentHandler.getLinks().get(0).getText());
+
+ // We'll change the schema to allow tables inside anchors by redefining the "a" element type on a private HTMLSchema instance
+ Schema schema = new HTMLSchema();
+ schema.elementType("a", HTMLSchema.M_ANY, 65535, 0); // NOTE(review): presumably (name, content model, member-of mask, flags) -- confirm against the TagSoup Schema API
+
+ ParseContext parseContext = new ParseContext();
+ parseContext.set(Schema.class, schema); // registered under Schema.class, the key the parser looks up
+ linkContentHandler = new LinkContentHandler(); // new handler for the second pass; the same Metadata instance is reused
+ new HtmlParser().parse ( // second pass: same input bytes, custom schema supplied via the context
+ new ByteArrayInputStream(test.getBytes("ISO-8859-1")),
+ linkContentHandler, metadata, parseContext);
+
+ // Expect anchor text: the table cell content is now reported as the text of the first link
+ assertEquals("\ttext\n\n", linkContentHandler.getLinks().get(0).getText());
+ }
+
}