You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2011/03/09 19:18:42 UTC

svn commit: r1079915 - /tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java

Author: jukka
Date: Wed Mar  9 18:18:42 2011
New Revision: 1079915

URL: http://svn.apache.org/viewvc?rev=1079915&view=rev
Log:
TIKA-599: Thread issue with autodetect parser

The shared HTMLSchema instance introduced in TIKA-528 is actually only
thread-safe (and reusable) when the ignoreBogons option is enabled.

Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=1079915&r1=1079914&r2=1079915&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java Wed Mar  9 18:18:42 2011
@@ -21,7 +21,6 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.nio.charset.Charset;
-import java.nio.charset.IllegalCharsetNameException;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashSet;
@@ -67,6 +66,9 @@ public class HtmlParser implements Parse
                     "Content-Type['\\\"]\\s+content\\s*=\\s*['\\\"]" +
                     "([^'\\\"]+)['\\\"]");
 
+    /**
+     * HTML schema singleton used to amortize the heavy instantiation time.
+     */
     private static final Schema HTML_SCHEMA = new HTMLSchema();
 
     public Set<MediaType> getSupportedTypes(ParseContext context) {
@@ -190,9 +192,13 @@ public class HtmlParser implements Parse
         org.ccil.cowan.tagsoup.Parser parser =
             new org.ccil.cowan.tagsoup.Parser();
 
-        // Instantiating HTMLSchema is heavy, therefore reuse a cached instance
-        parser.setProperty(org.ccil.cowan.tagsoup.Parser.schemaProperty, HTML_SCHEMA);
-        
+        // TIKA-528: Reuse shared schema to avoid heavy instantiation
+        parser.setProperty(
+                org.ccil.cowan.tagsoup.Parser.schemaProperty, HTML_SCHEMA);
+        // TIKA-599: Shared schema is thread-safe only if bogons are ignored
+        parser.setFeature(
+                org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);
+
         parser.setContentHandler(new XHTMLDowngradeHandler(
                 new HtmlHandler(mapper, handler, metadata)));
         parser.parse(source);