You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2011/03/09 19:18:42 UTC
svn commit: r1079915 -
/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
Author: jukka
Date: Wed Mar 9 18:18:42 2011
New Revision: 1079915
URL: http://svn.apache.org/viewvc?rev=1079915&view=rev
Log:
TIKA-599: Thread issue with autodetect parser
The shared HTMLSchema instance introduced in TIKA-528 is actually only
thread-safe (and reusable) when the ignoreBogons option is enabled.
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=1079915&r1=1079914&r2=1079915&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java Wed Mar 9 18:18:42 2011
@@ -21,7 +21,6 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
-import java.nio.charset.IllegalCharsetNameException;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
@@ -67,6 +66,9 @@ public class HtmlParser implements Parse
"Content-Type['\\\"]\\s+content\\s*=\\s*['\\\"]" +
"([^'\\\"]+)['\\\"]");
+ /**
+ * HTML schema singleton used to amortize the heavy instantiation time.
+ */
private static final Schema HTML_SCHEMA = new HTMLSchema();
public Set<MediaType> getSupportedTypes(ParseContext context) {
@@ -190,9 +192,13 @@ public class HtmlParser implements Parse
org.ccil.cowan.tagsoup.Parser parser =
new org.ccil.cowan.tagsoup.Parser();
- // Instantiating HTMLSchema is heavy, therefore reuse a cached instance
- parser.setProperty(org.ccil.cowan.tagsoup.Parser.schemaProperty, HTML_SCHEMA);
-
+ // TIKA-528: Reuse shared schema to avoid heavy instantiation
+ parser.setProperty(
+ org.ccil.cowan.tagsoup.Parser.schemaProperty, HTML_SCHEMA);
+ // TIKA-599: Shared schema is thread-safe only if bogons are ignored
+ parser.setFeature(
+ org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);
+
parser.setContentHandler(new XHTMLDowngradeHandler(
new HtmlHandler(mapper, handler, metadata)));
parser.parse(source);