You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by si...@apache.org on 2007/05/15 20:29:50 UTC
svn commit: r538273 - in /lucene/nutch/trunk: CHANGES.txt
src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java
Author: siren
Date: Tue May 15 11:29:49 2007
New Revision: 538273
URL: http://svn.apache.org/viewvc?view=rev&rev=538273
Log:
NUTCH-161 Change Plain text parser to use parser.character.encoding.default property for fall back encoding
spotted by KuroSaka TeruHiko
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java
Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=538273&r1=538272&r2=538273
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Tue May 15 11:29:49 2007
@@ -19,6 +19,10 @@
7. NUTCH-483 - Remove redundant commons-logging jar from ontology plugin
(siren)
+
+ 8. NUTCH-161 - Change Plain text parser to
+ use parser.character.encoding.default property for fall back encoding
+ (KuroSaka TeruHiko, siren)
Release 0.9 - 2007-04-02
Modified: lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java?view=diff&rev=538273&r1=538272&r2=538273
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java Tue May 15 11:29:49 2007
@@ -24,35 +24,42 @@
import org.apache.hadoop.conf.Configuration;
public class TextParser implements Parser {
+
private Configuration conf;
+
+ /**
+ * Encoding to be used when the character set isn't specified
+ * in the HTTP header.
+ */
+ private String defaultEncoding;
+ /**
+ * Parses a plain text document. Uses the configured default encoding
+ * {@code parser.character.encoding.default} if the character set isn't
+ * specified in the HTTP header. FIXME: implement charset detector
+ */
public ParseResult getParse(Content content) {
- // ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", new
- // Outlink[0], metadata);
-
String encoding = StringUtil.parseCharacterEncoding(content
.getContentType());
String text;
- if (encoding != null) { // found an encoding header
- try { // try to use named encoding
- text = new String(content.getContent(), encoding);
- } catch (java.io.UnsupportedEncodingException e) {
- return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
- }
- } else {
- // FIXME: implement charset detector. This code causes problem when
- // character set isn't specified in HTTP header.
- text = new String(content.getContent()); // use default encoding
+ try {
+ text = new String(content.getContent(), encoding != null ? encoding
+ : defaultEncoding);
+ } catch (java.io.UnsupportedEncodingException e) {
+ return new ParseStatus(e)
+ .getEmptyParseResult(content.getUrl(), getConf());
}
+
ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "",
OutlinkExtractor.getOutlinks(text, getConf()), content.getMetadata());
parseData.setConf(this.conf);
return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
-
}
public void setConf(Configuration conf) {
+ defaultEncoding = conf.get("parser.character.encoding.default",
+ "windows-1252");
this.conf = conf;
}