You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by si...@apache.org on 2007/05/15 20:29:50 UTC

svn commit: r538273 - in /lucene/nutch/trunk: CHANGES.txt src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java

Author: siren
Date: Tue May 15 11:29:49 2007
New Revision: 538273

URL: http://svn.apache.org/viewvc?view=rev&rev=538273
Log:
NUTCH-161 Change the plain text parser to use the parser.character.encoding.default property as the fallback encoding
spotted by KuroSaka TeruHiko

Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=538273&r1=538272&r2=538273
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Tue May 15 11:29:49 2007
@@ -19,6 +19,10 @@
  
  7. NUTCH-483 - Remove redundant commons-logging jar from ontology plugin
     (siren)
+    
+ 8. NUTCH-161 - Change Plain text parser to
+    use parser.character.encoding.default property for fall back encoding
+    (KuroSaka TeruHiko, siren)
   
 
 Release 0.9 - 2007-04-02

Modified: lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java?view=diff&rev=538273&r1=538272&r2=538273
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java Tue May 15 11:29:49 2007
@@ -24,35 +24,42 @@
 import org.apache.hadoop.conf.Configuration;
 
 public class TextParser implements Parser {
+
   private Configuration conf;
+  
+  /**
+   * Encoding to be used when the character set isn't specified
+   * in the HTTP header.
+   */
+  private String defaultEncoding;
 
+  /**
+   * Parses a plain text document. Uses the configured default encoding
+   * ({@code parser.character.encoding.default}) if the character set isn't
+   * specified in the HTTP header. FIXME: implement charset detector
+   */
   public ParseResult getParse(Content content) {
 
-    // ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", new
-    // Outlink[0], metadata);
-
     String encoding = StringUtil.parseCharacterEncoding(content
         .getContentType());
     String text;
-    if (encoding != null) { // found an encoding header
-      try { // try to use named encoding
-        text = new String(content.getContent(), encoding);
-      } catch (java.io.UnsupportedEncodingException e) {
-        return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
-      }
-    } else {
-      // FIXME: implement charset detector. This code causes problem when
-      // character set isn't specified in HTTP header.
-      text = new String(content.getContent()); // use default encoding
+    try {
+      text = new String(content.getContent(), encoding != null ? encoding
+          : defaultEncoding);
+    } catch (java.io.UnsupportedEncodingException e) {
+      return new ParseStatus(e)
+          .getEmptyParseResult(content.getUrl(), getConf());
     }
+    
     ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "",
         OutlinkExtractor.getOutlinks(text, getConf()), content.getMetadata());
     parseData.setConf(this.conf);
     return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
-    
   }
 
   public void setConf(Configuration conf) {
+    defaultEncoding = conf.get("parser.character.encoding.default",
+        "windows-1252");
     this.conf = conf;
   }