You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jn...@apache.org on 2011/01/31 11:51:55 UTC
svn commit: r1065552 - in /nutch/branches/branch-1.3: CHANGES.txt
src/plugin/parse-ext/plugin.xml
src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
Author: jnioche
Date: Mon Jan 31 10:51:55 2011
New Revision: 1065552
URL: http://svn.apache.org/viewvc?rev=1065552&view=rev
Log:
NUTCH-951 : backport changes from 2.0 into 1.3 : NUTCH-564 done
Modified:
nutch/branches/branch-1.3/CHANGES.txt
nutch/branches/branch-1.3/src/plugin/parse-ext/plugin.xml
nutch/branches/branch-1.3/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
Modified: nutch/branches/branch-1.3/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.3/CHANGES.txt?rev=1065552&r1=1065551&r2=1065552&view=diff
==============================================================================
--- nutch/branches/branch-1.3/CHANGES.txt (original)
+++ nutch/branches/branch-1.3/CHANGES.txt Mon Jan 31 10:51:55 2011
@@ -2,6 +2,8 @@ Nutch Change Log
Release 1.3 - Current Development
+* NUTCH-564 External parser supports encoding attribute (Antony Bowesman, mattmann)
+
* NUTCH-964 Upgraded Xerces to 2.91, ERROR conf.Configuration - Failed to set setXIncludeAware (markus)
* NUTCH-927 Fetcher.timelimit.mins is invalid when depth is greater than 1 (Wade Lau via jnioche)
Modified: nutch/branches/branch-1.3/src/plugin/parse-ext/plugin.xml
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/parse-ext/plugin.xml?rev=1065552&r1=1065551&r2=1065552&view=diff
==============================================================================
--- nutch/branches/branch-1.3/src/plugin/parse-ext/plugin.xml (original)
+++ nutch/branches/branch-1.3/src/plugin/parse-ext/plugin.xml Mon Jan 31 10:51:55 2011
@@ -41,6 +41,8 @@
<parameter name="pathSuffix" value=""/>
<parameter name="command" value="./build/plugins/parse-ext/command"/>
<parameter name="timeout" value="10"/>
+ <!-- Optional: add an "encoding" parameter to name the charset used to decode the command's output; if omitted, the platform default charset is used (see NUTCH-564). -->
+ <!-- <parameter name="encoding" value="UTF-8"/> -->
</implementation>
<implementation id="ExtParser"
@@ -49,6 +51,8 @@
<parameter name="pathSuffix" value=""/>
<parameter name="command" value="./build/plugins/parse-ext/command"/>
<parameter name="timeout" value="20"/>
+ <!-- Optional: add an "encoding" parameter to name the charset used to decode the command's output; if omitted, the platform default charset is used (see NUTCH-564). -->
+ <!-- <parameter name="encoding" value="UTF-8"/> -->
</implementation>
</extension>
Modified: nutch/branches/branch-1.3/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java?rev=1065552&r1=1065551&r2=1065552&view=diff
==============================================================================
--- nutch/branches/branch-1.3/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java (original)
+++ nutch/branches/branch-1.3/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java Mon Jan 31 10:51:55 2011
@@ -42,6 +42,7 @@ import java.util.Hashtable;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
+import java.nio.charset.Charset;
/**
* A wrapper that invokes external command to do real parsing job.
@@ -57,7 +58,7 @@ public class ExtParser implements Parser
static final int TIMEOUT_DEFAULT = 30; // in seconds
- // handy map from String contentType to String[] {command, timeoutString}
+ // handy map from String contentType to String[] {command, timeoutString, encoding}
Hashtable TYPE_PARAMS_MAP = new Hashtable();
private Configuration conf;
@@ -77,6 +78,7 @@ public class ExtParser implements Parser
String command = params[0];
int timeout = Integer.parseInt(params[1]);
+ String encoding = params[2];
if (LOG.isTraceEnabled()) {
LOG.trace("Use "+command+ " with timeout="+timeout+"secs");
@@ -117,7 +119,7 @@ public class ExtParser implements Parser
"External command " + command
+ " failed with error: " + es.toString()).getEmptyParseResult(content.getUrl(), getConf());
- text = os.toString();
+ text = os.toString(encoding);
} catch (Exception e) { // run time exception
return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
@@ -143,7 +145,7 @@ public class ExtParser implements Parser
Extension[] extensions = PluginRepository.get(conf).getExtensionPoint(
"org.apache.nutch.parse.Parser").getExtensions();
- String contentType, command, timeoutString;
+ String contentType, command, timeoutString, encoding;
for (int i = 0; i < extensions.length; i++) {
Extension extension = extensions[i];
@@ -160,11 +162,16 @@ public class ExtParser implements Parser
if (command == null || command.equals(""))
continue;
+ // no "encoding" attribute configured: fall back to the JVM's default charset
+ encoding = extension.getAttribute("encoding");
+ if (encoding == null)
+ encoding = Charset.defaultCharset().name();
+
timeoutString = extension.getAttribute("timeout");
if (timeoutString == null || timeoutString.equals(""))
timeoutString = "" + TIMEOUT_DEFAULT;
- TYPE_PARAMS_MAP.put(contentType, new String[] { command, timeoutString });
+ TYPE_PARAMS_MAP.put(contentType, new String[] { command, timeoutString, encoding });
}
}