You are viewing a plain-text version of this content. The hyperlink to the canonical version was not preserved in this text copy.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/12/16 00:56:04 UTC
svn commit: r891074 - in /lucene/tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/html/HtmlParser.java
test/java/org/apache/tika/parser/html/HtmlParserTest.java
Author: jukka
Date: Tue Dec 15 23:56:03 2009
New Revision: 891074
URL: http://svn.apache.org/viewvc?rev=891074&view=rev
Log:
TIKA-349: HtmlParser's http-equiv code needs to be more flexible
Patch by Ken Krugler
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=891074&r1=891073&r2=891074&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java Tue Dec 15 23:56:03 2009
@@ -45,10 +45,11 @@
// Use the widest, most common charset as our default.
private static final String DEFAULT_CHARSET = "windows-1252";
private static final int META_TAG_BUFFER_SIZE = 4096;
- private static final Pattern HTTP_EQUIV_CHARSET_PATTERN = Pattern.compile(
- "(?is)<meta\\s+http-equiv\\s*=\\s*['\"]\\s*Content-Type['\"]\\s+"
- + "content\\s*=\\s*['\"][^;]+;\\s*charset\\s*=\\s*([^'\"]+)\"");
-
+ private static final Pattern HTTP_EQUIV_PATTERN = Pattern.compile(
+ "(?is)<meta\\s+http-equiv\\s*=\\s*['\\\"]\\s*" +
+ "Content-Type['\\\"]\\s+content\\s*=\\s*['\\\"]" +
+ "([^'\\\"]+)['\\\"]\\s*/>");
+
private static final Pattern CONTENT_TYPE_PATTERN =
Pattern.compile("(?i);\\s*charset\\s*=\\s*(.*)");
@@ -67,12 +68,20 @@
if (bufferSize != -1) {
String metaString = new String(buffer, 0, bufferSize);
- Matcher m = HTTP_EQUIV_CHARSET_PATTERN.matcher(metaString);
+ Matcher m = HTTP_EQUIV_PATTERN.matcher(metaString);
if (m.find()) {
- String charset = m.group(1);
- if (Charset.isSupported(charset)) {
- metadata.set(Metadata.CONTENT_ENCODING, charset);
- return charset;
+ // TIKA-349: flexible handling of attributes
+ // We have one or more x or x=y attributes, separated by ';'
+ String[] attrs = m.group(1).split(";");
+ for (String attr : attrs) {
+ String[] keyValue = attr.trim().split("=");
+ if ((keyValue.length == 2) && keyValue[0].equalsIgnoreCase("charset")) {
+ String charset = keyValue[1];
+ if (Charset.isSupported(charset)) {
+ metadata.set(Metadata.CONTENT_ENCODING, charset);
+ return charset;
+ }
+ }
}
}
}
Modified: lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=891074&r1=891073&r2=891074&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original)
+++ lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Tue Dec 15 23:56:03 2009
@@ -252,7 +252,7 @@
/**
* Test case for TIKA-341
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-XXX">TIKA-XXX</a>
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-341">TIKA-341</a>
*/
public void testUsingCharsetInContentTypeHeader() throws Exception {
final String test =
@@ -307,4 +307,33 @@
assertEquals("en", metadata.get(Metadata.CONTENT_LANGUAGE));
}
+ /**
+ * Test case for TIKA-349
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-349">TIKA-349</a>
+ */
+ public void testHttpEquivCharsetFunkyAttributes() throws Exception {
+ String test1 =
+ "<html><head><meta http-equiv=\"content-type\""
+ + " content=\"text/html; charset=ISO-8859-1; charset=iso-8859-1\" />"
+ + "<title>the name is \u00e1ndre</title>"
+ + "</head><body></body></html>";
+ Metadata metadata = new Metadata();
+ new HtmlParser().parse (
+ new ByteArrayInputStream(test1.getBytes("UTF-8")),
+ new BodyContentHandler(), metadata, new ParseContext());
+ assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
+
+ // Some HTML pages have errors like ';;' versus '; ' as separator
+ String test2 =
+ "<html><head><meta http-equiv=\"content-type\""
+ + " content=\"text/html;;charset=ISO-8859-1\" />"
+ + "<title>the name is \u00e1ndre</title>"
+ + "</head><body></body></html>";
+ metadata = new Metadata();
+ new HtmlParser().parse (
+ new ByteArrayInputStream(test2.getBytes("UTF-8")),
+ new BodyContentHandler(), metadata, new ParseContext());
+ assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
+ }
+
}