You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2014/11/01 19:44:27 UTC
svn commit: r1636010 - in /nutch/branches/2.x: CHANGES.txt conf/schema.xml
src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java
Author: lewismc
Date: Sat Nov 1 18:44:26 2014
New Revision: 1636010
URL: http://svn.apache.org/r1636010
Log:
NUTCH-1820 remove field "orig" which duplicates "id"
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/conf/schema.xml
nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java
Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1636010&r1=1636009&r2=1636010&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Sat Nov 1 18:44:26 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Current Development 2.3-SNAPSHOT
+* NUTCH-1820 remove field "orig" which duplicates "id" (lewismc, snagel)
+
* NUTCH-1843 Upgrade to Gora 0.5 (talat, lewismc, Kiril Menshikov, drazzib)
* NUTCH-1883 bin/crawl: use function to run bin/nutch and check exit value (snagel)
Modified: nutch/branches/2.x/conf/schema.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/schema.xml?rev=1636010&r1=1636009&r2=1636010&view=diff
==============================================================================
--- nutch/branches/2.x/conf/schema.xml (original)
+++ nutch/branches/2.x/conf/schema.xml Sat Nov 1 18:44:26 2014
@@ -317,7 +317,6 @@
<!-- fields for index-basic plugin -->
<field name="host" type="url" stored="false" indexed="true"/>
<field name="url" type="url" stored="true" indexed="true"/>
- <field name="orig" type="url" stored="true" indexed="true" />
<!-- stored=true for highlighting, use term vectors and positions for fast highlighting -->
<field name="content" type="text_general" stored="true" indexed="true"/>
<field name="title" type="text_general" stored="true" indexed="true" multiValued="true"/>
Modified: nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java?rev=1636010&r1=1636009&r2=1636010&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java (original)
+++ nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java Sat Nov 1 18:44:26 2014
@@ -40,7 +40,6 @@ import java.util.HashSet;
* host - add host as un-stored, indexed and tokenized
* url - url is both stored and indexed, so it's both searchable and returned.
* This is also a required field.
- * orig - also store original url as both stored and indexed
* content - content is indexed, so that it's searchable, but not stored in index
* title - title is stored and indexed
* cache - add cached content/summary display policy, if available
@@ -99,11 +98,6 @@ public class BasicIndexingFilter impleme
// url is both stored and indexed, so it's both searchable and returned
doc.add("url", reprUrl == null ? url : reprUrl);
- if (reprUrl != null) {
- // also store original url as both stored and indexed
- doc.add("orig", url);
- }
-
// content is indexed, so that it's searchable, but not stored in index
doc.add("content", TableUtil.toString(page.getText()));
Modified: nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java?rev=1636010&r1=1636009&r2=1636010&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java (original)
+++ nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java Sat Nov 1 18:44:26 2014
@@ -30,7 +30,7 @@ import static org.junit.Assert.*;
/**
* JUnit test case which tests
- * 1. that the host, url, orig, content, title, cache and tstamp fields
+ * 1. that the host, url, content, title, cache and tstamp fields
* are obtained by the filter.
* 2. that configurable maximum length functionality for titles actually works.
* This property defaults to 100 characters @see {@code indexer.max.title.length}
@@ -65,7 +65,6 @@ public class TestBasicIndexingFilter {
assertNotNull(doc);
assertTrue("check for host field ", doc.getFieldNames().contains("host"));
assertTrue("check for url field", doc.getFieldNames().contains("url"));
- assertTrue("check for orig field", doc.getFieldNames().contains("orig"));
assertTrue("check for content field", doc.getFieldNames().contains("content"));
assertTrue("check for title field", doc.getFieldNames().contains("title"));
assertTrue("check for cache field", doc.getFieldNames().contains("cache"));