You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jn...@apache.org on 2011/01/31 11:51:55 UTC

svn commit: r1065552 - in /nutch/branches/branch-1.3: CHANGES.txt src/plugin/parse-ext/plugin.xml src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java

Author: jnioche
Date: Mon Jan 31 10:51:55 2011
New Revision: 1065552

URL: http://svn.apache.org/viewvc?rev=1065552&view=rev
Log:
NUTCH-951 : backport changes from 2.0 into 1.3 : NUTCH-564 done

Modified:
    nutch/branches/branch-1.3/CHANGES.txt
    nutch/branches/branch-1.3/src/plugin/parse-ext/plugin.xml
    nutch/branches/branch-1.3/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java

Modified: nutch/branches/branch-1.3/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.3/CHANGES.txt?rev=1065552&r1=1065551&r2=1065552&view=diff
==============================================================================
--- nutch/branches/branch-1.3/CHANGES.txt (original)
+++ nutch/branches/branch-1.3/CHANGES.txt Mon Jan 31 10:51:55 2011
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Release 1.3 - Current Development
 
+* NUTCH-564 External parser supports encoding attribute (Antony Bowesman, mattmann)
+
 * NUTCH-964 Upgraded Xerces to 2.91, ERROR conf.Configuration - Failed to set setXIncludeAware (markus)
 
 * NUTCH-927 Fetcher.timelimit.mins is invalid when depth is greater than 1 (Wade Lau via jnioche)

Modified: nutch/branches/branch-1.3/src/plugin/parse-ext/plugin.xml
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/parse-ext/plugin.xml?rev=1065552&r1=1065551&r2=1065552&view=diff
==============================================================================
--- nutch/branches/branch-1.3/src/plugin/parse-ext/plugin.xml (original)
+++ nutch/branches/branch-1.3/src/plugin/parse-ext/plugin.xml Mon Jan 31 10:51:55 2011
@@ -41,6 +41,8 @@
         <parameter name="pathSuffix"  value=""/>
         <parameter name="command"     value="./build/plugins/parse-ext/command"/>
         <parameter name="timeout"     value="10"/>
+        <!-- can optionally specify an encoding parameter now, see NUTCH-564-->
+        <!-- <parameter name="encoding" value="UTF-8"/> -->
       </implementation>
 
       <implementation id="ExtParser"
@@ -49,6 +51,8 @@
         <parameter name="pathSuffix"  value=""/>
         <parameter name="command"     value="./build/plugins/parse-ext/command"/>
         <parameter name="timeout"     value="20"/>
+        <!-- can optionally specify an encoding parameter now, see NUTCH-564-->
+        <!-- <parameter name="encoding" value="UTF-8"/> -->
       </implementation>
 
    </extension>

Modified: nutch/branches/branch-1.3/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java?rev=1065552&r1=1065551&r2=1065552&view=diff
==============================================================================
--- nutch/branches/branch-1.3/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java (original)
+++ nutch/branches/branch-1.3/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java Mon Jan 31 10:51:55 2011
@@ -42,6 +42,7 @@ import java.util.Hashtable;
 
 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
+import java.nio.charset.Charset;
 
 /**
  * A wrapper that invokes external command to do real parsing job.
@@ -57,7 +58,7 @@ public class ExtParser implements Parser
 
   static final int TIMEOUT_DEFAULT = 30; // in seconds
 
-  // handy map from String contentType to String[] {command, timeoutString}
+  // handy map from String contentType to String[] {command, timeoutString, encoding}
   Hashtable TYPE_PARAMS_MAP = new Hashtable();
 
   private Configuration conf;  
@@ -77,6 +78,7 @@ public class ExtParser implements Parser
 
     String command = params[0];
     int timeout = Integer.parseInt(params[1]);
+    String encoding = params[2];
 
     if (LOG.isTraceEnabled()) {
       LOG.trace("Use "+command+ " with timeout="+timeout+"secs");
@@ -117,7 +119,7 @@ public class ExtParser implements Parser
                         "External command " + command
                         + " failed with error: " + es.toString()).getEmptyParseResult(content.getUrl(), getConf());
 
-      text = os.toString();
+      text = os.toString(encoding);
 
     } catch (Exception e) { // run time exception
       return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
@@ -143,7 +145,7 @@ public class ExtParser implements Parser
     Extension[] extensions = PluginRepository.get(conf).getExtensionPoint(
         "org.apache.nutch.parse.Parser").getExtensions();
 
-    String contentType, command, timeoutString;
+    String contentType, command, timeoutString, encoding;
 
     for (int i = 0; i < extensions.length; i++) {
       Extension extension = extensions[i];
@@ -160,11 +162,16 @@ public class ExtParser implements Parser
       if (command == null || command.equals(""))
         continue;
 
+      // null encoding means default
+      encoding = extension.getAttribute("encoding");
+      if (encoding == null)
+          encoding = Charset.defaultCharset().name();
+
       timeoutString = extension.getAttribute("timeout");
       if (timeoutString == null || timeoutString.equals(""))
         timeoutString = "" + TIMEOUT_DEFAULT;
 
-      TYPE_PARAMS_MAP.put(contentType, new String[] { command, timeoutString });
+      TYPE_PARAMS_MAP.put(contentType, new String[] { command, timeoutString, encoding });
     }
   }