You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@nutch.apache.org by pk...@apache.org on 2006/03/09 22:13:44 UTC

svn commit: r384617 - in /lucene/nutch/branches/branch-0.7: CHANGES.txt src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java

Author: pkosiorowski
Date: Thu Mar  9 13:13:42 2006
New Revision: 384617

URL: http://svn.apache.org/viewcvs?rev=384617&view=rev
Log:
NUTCH-91 - empty encoding causes exception. (Michael Nebel)

Modified:
    lucene/nutch/branches/branch-0.7/CHANGES.txt
    lucene/nutch/branches/branch-0.7/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java

Modified: lucene/nutch/branches/branch-0.7/CHANGES.txt
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/branch-0.7/CHANGES.txt?rev=384617&r1=384616&r2=384617&view=diff
==============================================================================
--- lucene/nutch/branches/branch-0.7/CHANGES.txt (original)
+++ lucene/nutch/branches/branch-0.7/CHANGES.txt Thu Mar  9 13:13:42 2006
@@ -16,6 +16,8 @@
  6. Fixed TestFetcher JUnit test failing due to changes in www.nutch.org
 website.
 
+ 7. NUTCH-91 - empty encoding causes exception. (Michael Nebel).
+
 
 Release 0.7.1 - 2005-10-01
 

Modified: lucene/nutch/branches/branch-0.7/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/branch-0.7/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java?rev=384617&r1=384616&r2=384617&view=diff
==============================================================================
--- lucene/nutch/branches/branch-0.7/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (original)
+++ lucene/nutch/branches/branch-0.7/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java Thu Mar  9 13:13:42 2006
@@ -120,7 +120,7 @@
       byte[] contentInOctets = content.getContent();
       InputSource input = new InputSource(new ByteArrayInputStream(contentInOctets));
       String encoding = StringUtil.parseCharacterEncoding(contentType);
-      if (encoding!=null) {
+      if ((encoding != null) && !("".equals(encoding))) {
         metadata.put("OriginalCharEncoding", encoding);
         if ((encoding = StringUtil.resolveEncodingAlias(encoding)) != null) {
           metadata.put("CharEncodingForConversion", encoding);
@@ -129,7 +129,7 @@
       }
 
       // sniff out 'charset' value from the beginning of a document
-      if (encoding == null) {
+      if ((encoding == null) || ("".equals(encoding))) {
         encoding = sniffCharacterEncoding(contentInOctets);
         if (encoding!=null) {
           metadata.put("OriginalCharEncoding", encoding);