You are viewing a plain-text version of this content. The link to the canonical version was not preserved in this copy.
Posted to commits@nutch.apache.org by sn...@apache.org on 2015/12/08 22:45:47 UTC
svn commit: r1718718 - in /nutch: branches/2.x/CHANGES.txt
branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
trunk/CHANGES.txt
trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
Author: snagel
Date: Tue Dec 8 21:45:47 2015
New Revision: 1718718
URL: http://svn.apache.org/viewvc?rev=1718718&view=rev
Log:
NUTCH-2042 parse-html increase chunk size used to detect charset
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
nutch/trunk/CHANGES.txt
nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1718718&r1=1718717&r2=1718718&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Tue Dec 8 21:45:47 2015
@@ -3,6 +3,8 @@ Nutch Change Log
Nutch 2.3.1 Release 22092015 (ddmmyyyy)
Release Report - http://s.apache.org/nutch_2.3.1
+* NUTCH-2042 parse-html increase chunk size used to detect charset (snagel)
+
* NUTCH-2107 plugin.xml to validate against plugin.dtd (snagel)
* NUTCH-2130 copyField rawcontent creates error within schema.xml (Sherban Drulea, lewismc, snagel)
Modified: nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java?rev=1718718&r1=1718717&r2=1718718&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (original)
+++ nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java Tue Dec 8 21:45:47 2015
@@ -27,6 +27,7 @@ import java.net.MalformedURLException;
import java.net.URL;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
@@ -67,7 +68,8 @@ public class HtmlParser implements Parse
// I used 1000 bytes at first, but found that some documents have
// meta tag well past the first 1000 bytes.
// (e.g. http://cn.promo.yahoo.com/customcare/music.html)
- private static final int CHUNK_SIZE = 2000;
+ // NUTCH-2042 (cf. TIKA-357): increased to 8 kB
+ private static final int CHUNK_SIZE = 8192;
// NUTCH-1006 Meta equiv with single quotes not accepted
private static Pattern metaPattern = Pattern.compile(
@@ -111,14 +113,8 @@ public class HtmlParser implements Parse
// to just inflate each byte to a 16-bit value by padding.
// For instance, the sequence {0x41, 0x82, 0xb7} will be turned into
// {U+0041, U+0082, U+00B7}.
- String str = "";
- try {
- str = new String(content.array(), content.arrayOffset()
- + content.position(), length, Charset.forName("ASCII").toString());
- } catch (UnsupportedEncodingException e) {
- // code should never come here, but just in case...
- return null;
- }
+ String str = new String(content.array(), content.arrayOffset()
+ + content.position(), length, StandardCharsets.US_ASCII);
Matcher metaMatcher = metaPattern.matcher(str);
String encoding = null;
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1718718&r1=1718717&r2=1718718&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Dec 8 21:45:47 2015
@@ -1,5 +1,7 @@
Nutch Change Log
+* NUTCH-2042 parse-html increase chunk size used to detect charset (snagel)
+
* NUTCH-2172 index-more: document format of contenttype-mapping.txt (Nicola Tonellotto, snagel)
Nutch 1.11 Release 03/12/2015 (dd/mm/yyyy)
Modified: nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java?rev=1718718&r1=1718717&r2=1718718&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (original)
+++ nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java Tue Dec 8 21:45:47 2015
@@ -21,7 +21,7 @@ import java.util.ArrayList;
import java.util.Map;
import java.net.URL;
import java.net.MalformedURLException;
-import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
import java.io.*;
import java.util.regex.*;
@@ -30,10 +30,8 @@ import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.w3c.dom.*;
import org.apache.html.dom.*;
-
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.protocol.Content;
@@ -48,7 +46,8 @@ public class HtmlParser implements Parse
// I used 1000 bytes at first, but found that some documents have
// meta tag well past the first 1000 bytes.
// (e.g. http://cn.promo.yahoo.com/customcare/music.html)
- private static final int CHUNK_SIZE = 2000;
+ // NUTCH-2042 (cf. TIKA-357): increased to 8 kB
+ private static final int CHUNK_SIZE = 8192;
// NUTCH-1006 Meta equiv with single quotes not accepted
private static Pattern metaPattern = Pattern.compile(
@@ -86,13 +85,7 @@ public class HtmlParser implements Parse
// to just inflate each byte to a 16-bit value by padding.
// For instance, the sequence {0x41, 0x82, 0xb7} will be turned into
// {U+0041, U+0082, U+00B7}.
- String str = "";
- try {
- str = new String(content, 0, length, Charset.forName("ASCII").toString());
- } catch (UnsupportedEncodingException e) {
- // code should never come here, but just in case...
- return null;
- }
+ String str = new String(content, 0, length, StandardCharsets.US_ASCII);
Matcher metaMatcher = metaPattern.matcher(str);
String encoding = null;