You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2013/02/22 06:05:25 UTC

svn commit: r1448897 - in /nutch/branches/2.x: CHANGES.txt src/java/org/apache/nutch/util/StringUtil.java src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java

Author: lewismc
Date: Fri Feb 22 05:05:24 2013
New Revision: 1448897

URL: http://svn.apache.org/r1448897
Log:
Revert previous commit (NUTCH-1420: removes StringUtil.cleanField and its use in BasicIndexingFilter)

Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/src/java/org/apache/nutch/util/StringUtil.java
    nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1448897&r1=1448896&r2=1448897&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Fri Feb 22 05:05:24 2013
@@ -2,8 +2,6 @@ Nutch Change Log
 
 Release 2.2 - Current Development
 
-* NUTCH-1420 Get rid of the dreaded � (markus via lewismc)
-
 * NUTCH-XX remove unused db.max.inlinks property in nutch-default.xml (lewismc)
 
 * NUTCH-1284 Add site fetcher.max.crawl.delay as log output by default (Tejas Patil)

Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/StringUtil.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/StringUtil.java?rev=1448897&r1=1448896&r2=1448897&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/StringUtil.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/StringUtil.java Fri Feb 22 05:05:24 2013
@@ -125,13 +125,6 @@ public class StringUtil {
   public static boolean isEmpty(String str) {
     return (str == null) || (str.equals(""));
   }
-  
-  /**
-   * Simple character substitution which cleans all � chars from a given String.
-   */
-  public static String cleanField(String value) {
-    return value.replaceAll("�", "");
-  }
 
   public static void main(String[] args) {
     if (args.length != 1)

Modified: nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java?rev=1448897&r1=1448896&r2=1448897&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java (original)
+++ nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java Fri Feb 22 05:05:24 2013
@@ -33,7 +33,6 @@ import org.apache.nutch.indexer.NutchDoc
 import org.apache.nutch.metadata.Nutch;
 import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.util.Bytes;
-import org.apache.nutch.util.StringUtil;
 import org.apache.nutch.util.TableUtil;
 import org.apache.solr.common.util.DateUtil;
 
@@ -106,8 +105,7 @@ public class BasicIndexingFilter impleme
     }
 
     // content is indexed, so that it's searchable, but not stored in index
-    String content = TableUtil.toString(page.getText());
-    doc.add("content", StringUtil.cleanField(content));
+    doc.add("content", TableUtil.toString(page.getText()));
 
     // title
     String title = TableUtil.toString(page.getTitle());
@@ -116,7 +114,7 @@ public class BasicIndexingFilter impleme
     }
     if (title.length() > 0) {
       // NUTCH-1004 Do not index empty values for title field
-      doc.add("title", StringUtil.cleanField(title));
+      doc.add("title", title);
     }
     // add cached content/summary display policy, if available
     ByteBuffer cachingRaw = page