You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2014/11/01 19:44:27 UTC

svn commit: r1636010 - in /nutch/branches/2.x: CHANGES.txt conf/schema.xml src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java

Author: lewismc
Date: Sat Nov  1 18:44:26 2014
New Revision: 1636010

URL: http://svn.apache.org/r1636010
Log:
remove field 'orig' which duplicates 'id'

Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/conf/schema.xml
    nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
    nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1636010&r1=1636009&r2=1636010&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Sat Nov  1 18:44:26 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development 2.3-SNAPSHOT
 
+* NUTCH-1820 remove field "orig" which duplicates "id" (lewismc, snagel)
+
 * NUTCH-1843 Upgrade to Gora 0.5 (talat, lewismc, Kiril Menshikov, drazzib)
 
 * NUTCH-1883 bin/crawl: use function to run bin/nutch and check exit value (snagel)

Modified: nutch/branches/2.x/conf/schema.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/schema.xml?rev=1636010&r1=1636009&r2=1636010&view=diff
==============================================================================
--- nutch/branches/2.x/conf/schema.xml (original)
+++ nutch/branches/2.x/conf/schema.xml Sat Nov  1 18:44:26 2014
@@ -317,7 +317,6 @@
     <!-- fields for index-basic plugin -->
     <field name="host" type="url" stored="false" indexed="true"/>
     <field name="url" type="url" stored="true" indexed="true"/>
-    <field name="orig" type="url" stored="true" indexed="true" />
     <!-- stored=true for highlighting, use term vectors  and positions for fast highlighting -->
     <field name="content" type="text_general" stored="true" indexed="true"/>
     <field name="title" type="text_general" stored="true" indexed="true" multiValued="true"/>

Modified: nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java?rev=1636010&r1=1636009&r2=1636010&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java (original)
+++ nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java Sat Nov  1 18:44:26 2014
@@ -40,7 +40,6 @@ import java.util.HashSet;
  * host - add host as un-stored, indexed and tokenized
  * url - url is both stored and indexed, so it's both searchable and returned. 
  * This is also a required field.
- * orig - also store original url as both stored and indexed
  * content - content is indexed, so that it's searchable, but not stored in index
  * title - title is stored and indexed
  * cache - add cached content/summary display policy, if available
@@ -99,11 +98,6 @@ public class BasicIndexingFilter impleme
     // url is both stored and indexed, so it's both searchable and returned
     doc.add("url", reprUrl == null ? url : reprUrl);
 
-    if (reprUrl != null) {
-      // also store original url as both stored and indexed
-      doc.add("orig", url);
-    }
-
     // content is indexed, so that it's searchable, but not stored in index
     doc.add("content", TableUtil.toString(page.getText()));
 

Modified: nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java?rev=1636010&r1=1636009&r2=1636010&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java (original)
+++ nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java Sat Nov  1 18:44:26 2014
@@ -30,7 +30,7 @@ import static org.junit.Assert.*;
 
 /**
  * JUnit test case which tests
- * 1. that the host, url, orig, content, title, cache and tstamp fields 
+ * 1. that the host, url, content, title, cache and tstamp fields 
  * are obtained by the filter.
  * 2. that configurable maximum length functionality for titles actually works. .
  * This property defaults at 100 characters @see {@code indexer.max.title.length} 
@@ -65,7 +65,6 @@ public class TestBasicIndexingFilter {
 	assertNotNull(doc);
 	assertTrue("check for host field ", doc.getFieldNames().contains("host"));
 	assertTrue("check for url field", doc.getFieldNames().contains("url"));
-	assertTrue("check for orig field", doc.getFieldNames().contains("orig"));
 	assertTrue("check for content field", doc.getFieldNames().contains("content"));
 	assertTrue("check for title field", doc.getFieldNames().contains("title"));
 	assertTrue("check for cache field", doc.getFieldNames().contains("cache"));