You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2015/12/08 22:45:47 UTC

svn commit: r1718718 - in /nutch: branches/2.x/CHANGES.txt branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java trunk/CHANGES.txt trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java

Author: snagel
Date: Tue Dec  8 21:45:47 2015
New Revision: 1718718

URL: http://svn.apache.org/viewvc?rev=1718718&view=rev
Log:
NUTCH-2042 parse-html increase chunk size used to detect charset

Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1718718&r1=1718717&r2=1718718&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Tue Dec  8 21:45:47 2015
@@ -3,6 +3,8 @@ Nutch Change Log
 Nutch 2.3.1 Release 22092015 (ddmmyyyy)
 Release Report - http://s.apache.org/nutch_2.3.1
 
+* NUTCH-2042 parse-html increase chunk size used to detect charset (snagel)
+
 * NUTCH-2107 plugin.xml to validate against plugin.dtd (snagel)
 
 * NUTCH-2130 copyField rawcontent creates error within schema.xml (Sherban Drulea, lewismc, snagel)

Modified: nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java?rev=1718718&r1=1718717&r2=1718718&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (original)
+++ nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java Tue Dec  8 21:45:47 2015
@@ -27,6 +27,7 @@ import java.net.MalformedURLException;
 import java.net.URL;
 import java.nio.ByteBuffer;
 import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
@@ -67,7 +68,8 @@ public class HtmlParser implements Parse
   // I used 1000 bytes at first, but found that some documents have
   // meta tag well past the first 1000 bytes.
   // (e.g. http://cn.promo.yahoo.com/customcare/music.html)
-  private static final int CHUNK_SIZE = 2000;
+  // NUTCH-2042 (cf. TIKA-357): increased to 8 kB
+  private static final int CHUNK_SIZE = 8192;
 
   // NUTCH-1006 Meta equiv with single quotes not accepted
   private static Pattern metaPattern = Pattern.compile(
@@ -111,14 +113,8 @@ public class HtmlParser implements Parse
     // to just inflate each byte to a 16-bit value by padding.
     // For instance, the sequence {0x41, 0x82, 0xb7} will be turned into
     // {U+0041, U+0082, U+00B7}.
-    String str = "";
-    try {
-      str = new String(content.array(), content.arrayOffset()
-          + content.position(), length, Charset.forName("ASCII").toString());
-    } catch (UnsupportedEncodingException e) {
-      // code should never come here, but just in case...
-      return null;
-    }
+    String str = new String(content.array(), content.arrayOffset()
+        + content.position(), length, StandardCharsets.US_ASCII);
 
     Matcher metaMatcher = metaPattern.matcher(str);
     String encoding = null;

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1718718&r1=1718717&r2=1718718&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Dec  8 21:45:47 2015
@@ -1,5 +1,7 @@
 Nutch Change Log
 
+* NUTCH-2042 parse-html increase chunk size used to detect charset (snagel)
+
 * NUTCH-2172 index-more: document format of contenttype-mapping.txt (Nicola Tonellotto, snagel)
 
 Nutch 1.11 Release 03/12/2015 (dd/mm/yyyy)

Modified: nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java?rev=1718718&r1=1718717&r2=1718718&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (original)
+++ nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java Tue Dec  8 21:45:47 2015
@@ -21,7 +21,7 @@ import java.util.ArrayList;
 import java.util.Map;
 import java.net.URL;
 import java.net.MalformedURLException;
-import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
 import java.io.*;
 import java.util.regex.*;
 
@@ -30,10 +30,8 @@ import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;
 import org.w3c.dom.*;
 import org.apache.html.dom.*;
-
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.metadata.Nutch;
 import org.apache.nutch.protocol.Content;
@@ -48,7 +46,8 @@ public class HtmlParser implements Parse
   // I used 1000 bytes at first, but found that some documents have
   // meta tag well past the first 1000 bytes.
   // (e.g. http://cn.promo.yahoo.com/customcare/music.html)
-  private static final int CHUNK_SIZE = 2000;
+  // NUTCH-2042 (cf. TIKA-357): increased to 8 kB
+  private static final int CHUNK_SIZE = 8192;
 
   // NUTCH-1006 Meta equiv with single quotes not accepted
   private static Pattern metaPattern = Pattern.compile(
@@ -86,13 +85,7 @@ public class HtmlParser implements Parse
     // to just inflate each byte to a 16-bit value by padding.
     // For instance, the sequence {0x41, 0x82, 0xb7} will be turned into
     // {U+0041, U+0082, U+00B7}.
-    String str = "";
-    try {
-      str = new String(content, 0, length, Charset.forName("ASCII").toString());
-    } catch (UnsupportedEncodingException e) {
-      // code should never come here, but just in case...
-      return null;
-    }
+    String str = new String(content, 0, length, StandardCharsets.US_ASCII);
 
     Matcher metaMatcher = metaPattern.matcher(str);
     String encoding = null;