You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2013/12/28 02:14:27 UTC

svn commit: r1553774 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/html/HtmlParser.java test/java/org/apache/tika/parser/html/HtmlParserTest.java

Author: jukka
Date: Sat Dec 28 01:14:27 2013
New Revision: 1553774

URL: http://svn.apache.org/r1553774
Log:
TIKA-1193: Allow access to HtmlParser's HtmlSchema

Patch by Markus Jelsma

Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=1553774&r1=1553773&r2=1553774&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java Sat Dec 28 01:14:27 2013
@@ -92,9 +92,12 @@ public class HtmlParser extends Abstract
             org.ccil.cowan.tagsoup.Parser parser =
                     new org.ccil.cowan.tagsoup.Parser();
 
+            // Use schema from context or default
+            Schema schema = context.get(Schema.class, HTML_SCHEMA);
+
             // TIKA-528: Reuse shared schema to avoid heavy instantiation
             parser.setProperty(
-                    org.ccil.cowan.tagsoup.Parser.schemaProperty, HTML_SCHEMA);
+                    org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
             // TIKA-599: Shared schema is thread-safe only if bogons are ignored
             parser.setFeature(
                     org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=1553774&r1=1553773&r2=1553774&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Sat Dec 28 01:14:27 2013
@@ -42,7 +42,10 @@ import org.apache.tika.metadata.Metadata
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.LinkContentHandler;
 import org.apache.tika.sax.TeeContentHandler;
+import org.ccil.cowan.tagsoup.HTMLSchema;
+import org.ccil.cowan.tagsoup.Schema;
 import org.junit.Ignore;
 import org.junit.Test;
 import org.xml.sax.Attributes;
@@ -890,4 +893,36 @@ public class HtmlParserTest {
           assertTrue("testing: " +fileName, content.contains(hit));
        }
     }
+
+    // TIKA-1193: a Schema placed in the ParseContext is used by HtmlParser instead of its default schema
+    @Test
+    public void testCustomHtmlSchema() throws Exception {
+        // Default schema does not allow tables inside anchors. NOTE(review): the cell is closed with </tr></tr>, not </td></tr> - confirm this is intended
+        String test = "<html><body><a><table><tr><td>text</tr></tr></table></a></body></html>";
+
+        Metadata metadata = new Metadata();
+        LinkContentHandler linkContentHandler = new LinkContentHandler();
+
+        new HtmlParser().parse (
+                new ByteArrayInputStream(test.getBytes("ISO-8859-1")),
+                linkContentHandler, metadata, new ParseContext());
+
+        // The parse above used an empty ParseContext, i.e. the default schema: expect no anchor text on the first link
+        assertEquals("", linkContentHandler.getLinks().get(0).getText());
+
+        // We'll change the schema to allow tables inside anchors: re-register "a" on a private HTMLSchema instance
+        Schema schema = new HTMLSchema();
+        schema.elementType("a", HTMLSchema.M_ANY, 65535, 0);
+
+        ParseContext parseContext = new ParseContext();
+        parseContext.set(Schema.class, schema);
+        linkContentHandler = new LinkContentHandler();
+        new HtmlParser().parse (
+                new ByteArrayInputStream(test.getBytes("ISO-8859-1")),
+                linkContentHandler, metadata, parseContext);
+
+        // With the custom schema from the ParseContext, expect the anchor text to carry the table cell content
+        assertEquals("\ttext\n\n", linkContentHandler.getLinks().get(0).getText());
+    }
+
 }