You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2014/07/29 17:13:21 UTC

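svn commit: r1614375 - in /nutch: branches/2.x/ branches/2.x/conf/ branches/2.x/src/java/org/apache/nutch/indexer/ branches/2.x/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ trunk/ trunk/conf/ trunk/src/java/org/apache/nutch/indexer/ trunk/src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/ trunk/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/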

Author: snagel
Date: Tue Jul 29 15:13:20 2014
New Revision: 1614375

URL: http://svn.apache.org/r1614375
Log:
NUTCH-1708 use same id when indexing and deleting redirects

Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/conf/schema.xml
    nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
    nutch/branches/2.x/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
    nutch/trunk/CHANGES.txt
    nutch/trunk/conf/schema-solr4.xml
    nutch/trunk/conf/schema.xml
    nutch/trunk/conf/solrindex-mapping.xml
    nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
    nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
    nutch/trunk/src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/DummyIndexWriter.java
    nutch/trunk/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1614375&r1=1614374&r2=1614375&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Tue Jul 29 15:13:20 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development
 
+* NUTCH-1708 use same id when indexing and deleting redirects (snagel)
+
 * NUTCH-1817 Remove pom.xml from source (jnioche)
 
 * NUTCH-1811 bin/nutch junit to use junit 4 test runner (snagel)

Modified: nutch/branches/2.x/conf/schema.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/schema.xml?rev=1614375&r1=1614374&r2=1614375&view=diff
==============================================================================
--- nutch/branches/2.x/conf/schema.xml (original)
+++ nutch/branches/2.x/conf/schema.xml Tue Jul 29 15:13:20 2014
@@ -307,7 +307,7 @@
     to include it as performance improvements are minimal. -->
     <field name="_version_" type="long" indexed="true" stored="true"/>
     
-    <field name="id" type="string" stored="true" indexed="true"/>
+    <field name="id" type="string" stored="true" indexed="true" required="true"/>
 
     <!-- core fields -->
     <field name="batchId" type="string" stored="true" indexed="false"/>
@@ -316,7 +316,7 @@
 
     <!-- fields for index-basic plugin -->
     <field name="host" type="url" stored="false" indexed="true"/>
-    <field name="url" type="url" stored="true" indexed="true" required="true"/>
+    <field name="url" type="url" stored="true" indexed="true"/>
     <field name="orig" type="url" stored="true" indexed="true" />
     <!-- stored=true for highlighting, use term vectors  and positions for fast highlighting -->
     <field name="content" type="text_general" stored="true" indexed="true"/>

Modified: nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1614375&r1=1614374&r2=1614375&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Tue Jul 29 15:13:20 2014
@@ -123,6 +123,7 @@ public class IndexingFiltersChecker exte
     }
 
     NutchDocument doc = new NutchDocument();
+    doc.add("id", url);
     doc.add("digest", StringUtil.toHexString(page.getSignature()));
 
     try {

Modified: nutch/branches/2.x/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java?rev=1614375&r1=1614374&r2=1614375&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java (original)
+++ nutch/branches/2.x/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java Tue Jul 29 15:13:20 2014
@@ -121,7 +121,7 @@ public class ElasticIndexWriter implemen
 
   @Override
   public void write(NutchDocument doc) throws IOException {
-    String id = (String) doc.getFieldValue("url");
+    String id = (String) doc.getFieldValue("id");
     String type = doc.getDocumentMeta().get("type");
     if (type == null)
       type = "doc";

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1614375&r1=1614374&r2=1614375&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Jul 29 15:13:20 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development
 
+* NUTCH-1708 use same id when indexing and deleting redirects (snagel)
+
 * NUTCH-1818 Add deps-test-compile task for building plugins (jnioche)
 
 * NUTCH-1817 Remove pom.xml from source (jnioche)

Modified: nutch/trunk/conf/schema-solr4.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/schema-solr4.xml?rev=1614375&r1=1614374&r2=1614375&view=diff
==============================================================================
--- nutch/trunk/conf/schema-solr4.xml (original)
+++ nutch/trunk/conf/schema-solr4.xml Tue Jul 29 15:13:20 2014
@@ -301,7 +301,7 @@
  </types>
 
  <fields>
-    <field name="id" type="string" stored="true" indexed="true"/>
+    <field name="id" type="string" stored="true" indexed="true" required="true"/>
 
     <!-- core fields -->
     <field name="segment" type="string" stored="true" indexed="false"/>
@@ -310,7 +310,7 @@
 
     <!-- fields for index-basic plugin -->
     <field name="host" type="url" stored="false" indexed="true"/>
-    <field name="url" type="url" stored="true" indexed="true" required="true"/>
+    <field name="url" type="url" stored="true" indexed="true"/>
     <!-- stored=true for highlighting, use term vectors  and positions for fast highlighting -->
     <field name="content" type="text_general" stored="true" indexed="true"/>
     <field name="title" type="text_general" stored="true" indexed="true"/>

Modified: nutch/trunk/conf/schema.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/schema.xml?rev=1614375&r1=1614374&r2=1614375&view=diff
==============================================================================
--- nutch/trunk/conf/schema.xml (original)
+++ nutch/trunk/conf/schema.xml Tue Jul 29 15:13:20 2014
@@ -66,7 +66,8 @@
         </fieldType>
     </types>
     <fields>
-        <field name="id" type="string" stored="true" indexed="true"/>
+        <field name="id" type="string" stored="true" indexed="true"
+            required="true"/>
 
         <!-- core fields -->
         <field name="segment" type="string" stored="true" indexed="false"/>
@@ -75,8 +76,7 @@
 
         <!-- fields for index-basic plugin -->
         <field name="host" type="string" stored="false" indexed="true"/>
-        <field name="url" type="url" stored="true" indexed="true"
-            required="true"/>
+        <field name="url" type="url" stored="true" indexed="true"/>
         <field name="content" type="text" stored="false" indexed="true"/>
         <field name="title" type="text" stored="true" indexed="true"/>
         <field name="cache" type="string" stored="true" indexed="false"/>

Modified: nutch/trunk/conf/solrindex-mapping.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/solrindex-mapping.xml?rev=1614375&r1=1614374&r2=1614375&view=diff
==============================================================================
--- nutch/trunk/conf/solrindex-mapping.xml (original)
+++ nutch/trunk/conf/solrindex-mapping.xml Tue Jul 29 15:13:20 2014
@@ -38,8 +38,6 @@
 		<field dest="boost" source="boost"/>
 		<field dest="digest" source="digest"/>
 		<field dest="tstamp" source="tstamp"/>
-		<field dest="id" source="url"/>
-		<copyField source="url" dest="url"/>
 	</fields>
 	<uniqueKey>id</uniqueKey>
 </mapping>

Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java?rev=1614375&r1=1614374&r2=1614375&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Tue Jul 29 15:13:20 2014
@@ -262,6 +262,8 @@ implements Mapper<Text, Writable, Text, 
     }
 
     NutchDocument doc = new NutchDocument();
+    doc.add("id", key.toString());
+
     final Metadata metadata = parseData.getContentMeta();
 
     // add segment, used to map from merged index back to segment files

Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1614375&r1=1614374&r2=1614375&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Tue Jul 29 15:13:20 2014
@@ -130,6 +130,7 @@ public class IndexingFiltersChecker exte
     ParseResult parseResult = new ParseUtil(conf).parse(content);
 
     NutchDocument doc = new NutchDocument();
+    doc.add("id", url);
     Text urlText = new Text(url);
 
     Inlinks inlinks = null;

Modified: nutch/trunk/src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/DummyIndexWriter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/DummyIndexWriter.java?rev=1614375&r1=1614374&r2=1614375&view=diff
==============================================================================
--- nutch/trunk/src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/DummyIndexWriter.java (original)
+++ nutch/trunk/src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/DummyIndexWriter.java Tue Jul 29 15:13:20 2014
@@ -53,12 +53,12 @@ public class DummyIndexWriter implements
 
   @Override
   public void update(NutchDocument doc) throws IOException {
-    writer.write("update\t" + doc.getFieldValue("url") + "\n");
+    writer.write("update\t" + doc.getFieldValue("id") + "\n");
   }
 
   @Override
   public void write(NutchDocument doc) throws IOException {
-    writer.write("add\t" + doc.getFieldValue("url") + "\n");
+    writer.write("add\t" + doc.getFieldValue("id") + "\n");
   }
 
   public void close() throws IOException {

Modified: nutch/trunk/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java?rev=1614375&r1=1614374&r2=1614375&view=diff
==============================================================================
--- nutch/trunk/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java (original)
+++ nutch/trunk/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java Tue Jul 29 15:13:20 2014
@@ -123,7 +123,7 @@ public class ElasticIndexWriter implemen
 
   @Override
   public void write(NutchDocument doc) throws IOException {
-    String id = (String) doc.getFieldValue("url");
+    String id = (String) doc.getFieldValue("id");
     String type = doc.getDocumentMeta().get("type");
     if (type == null)
       type = "doc";