You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2014/07/29 17:13:21 UTC
svn commit: r1614375 - in /nutch: branches/2.x/ branches/2.x/conf/
branches/2.x/src/java/org/apache/nutch/indexer/
branches/2.x/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/
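trunk/ trunk/conf/ trunk/src/java/org/apache/nutch/indexer/
trunk/src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/
trunk/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/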
Author: snagel
Date: Tue Jul 29 15:13:20 2014
New Revision: 1614375
URL: http://svn.apache.org/r1614375
Log:
NUTCH-1708 use same id when indexing and deleting redirects
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/conf/schema.xml
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
nutch/branches/2.x/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
nutch/trunk/CHANGES.txt
nutch/trunk/conf/schema-solr4.xml
nutch/trunk/conf/schema.xml
nutch/trunk/conf/solrindex-mapping.xml
nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
nutch/trunk/src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/DummyIndexWriter.java
nutch/trunk/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1614375&r1=1614374&r2=1614375&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Tue Jul 29 15:13:20 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Current Development
+* NUTCH-1708 use same id when indexing and deleting redirects (snagel)
+
* NUTCH-1817 Remove pom.xml from source (jnioche)
* NUTCH-1811 bin/nutch junit to use junit 4 test runner (snagel)
Modified: nutch/branches/2.x/conf/schema.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/schema.xml?rev=1614375&r1=1614374&r2=1614375&view=diff
==============================================================================
--- nutch/branches/2.x/conf/schema.xml (original)
+++ nutch/branches/2.x/conf/schema.xml Tue Jul 29 15:13:20 2014
@@ -307,7 +307,7 @@
to include it as performance improvements are minimal. -->
<field name="_version_" type="long" indexed="true" stored="true"/>
- <field name="id" type="string" stored="true" indexed="true"/>
+ <field name="id" type="string" stored="true" indexed="true" required="true"/>
<!-- core fields -->
<field name="batchId" type="string" stored="true" indexed="false"/>
@@ -316,7 +316,7 @@
<!-- fields for index-basic plugin -->
<field name="host" type="url" stored="false" indexed="true"/>
- <field name="url" type="url" stored="true" indexed="true" required="true"/>
+ <field name="url" type="url" stored="true" indexed="true"/>
<field name="orig" type="url" stored="true" indexed="true" />
<!-- stored=true for highlighting, use term vectors and positions for fast highlighting -->
<field name="content" type="text_general" stored="true" indexed="true"/>
Modified: nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1614375&r1=1614374&r2=1614375&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Tue Jul 29 15:13:20 2014
@@ -123,6 +123,7 @@ public class IndexingFiltersChecker exte
}
NutchDocument doc = new NutchDocument();
+ doc.add("id", url);
doc.add("digest", StringUtil.toHexString(page.getSignature()));
try {
Modified: nutch/branches/2.x/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java?rev=1614375&r1=1614374&r2=1614375&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java (original)
+++ nutch/branches/2.x/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java Tue Jul 29 15:13:20 2014
@@ -121,7 +121,7 @@ public class ElasticIndexWriter implemen
@Override
public void write(NutchDocument doc) throws IOException {
- String id = (String) doc.getFieldValue("url");
+ String id = (String) doc.getFieldValue("id");
String type = doc.getDocumentMeta().get("type");
if (type == null)
type = "doc";
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1614375&r1=1614374&r2=1614375&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Jul 29 15:13:20 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development
+* NUTCH-1708 use same id when indexing and deleting redirects (snagel)
+
* NUTCH-1818 Add deps-test-compile task for building plugins (jnioche)
* NUTCH-1817 Remove pom.xml from source (jnioche)
Modified: nutch/trunk/conf/schema-solr4.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/schema-solr4.xml?rev=1614375&r1=1614374&r2=1614375&view=diff
==============================================================================
--- nutch/trunk/conf/schema-solr4.xml (original)
+++ nutch/trunk/conf/schema-solr4.xml Tue Jul 29 15:13:20 2014
@@ -301,7 +301,7 @@
</types>
<fields>
- <field name="id" type="string" stored="true" indexed="true"/>
+ <field name="id" type="string" stored="true" indexed="true" required="true"/>
<!-- core fields -->
<field name="segment" type="string" stored="true" indexed="false"/>
@@ -310,7 +310,7 @@
<!-- fields for index-basic plugin -->
<field name="host" type="url" stored="false" indexed="true"/>
- <field name="url" type="url" stored="true" indexed="true" required="true"/>
+ <field name="url" type="url" stored="true" indexed="true"/>
<!-- stored=true for highlighting, use term vectors and positions for fast highlighting -->
<field name="content" type="text_general" stored="true" indexed="true"/>
<field name="title" type="text_general" stored="true" indexed="true"/>
Modified: nutch/trunk/conf/schema.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/schema.xml?rev=1614375&r1=1614374&r2=1614375&view=diff
==============================================================================
--- nutch/trunk/conf/schema.xml (original)
+++ nutch/trunk/conf/schema.xml Tue Jul 29 15:13:20 2014
@@ -66,7 +66,8 @@
</fieldType>
</types>
<fields>
- <field name="id" type="string" stored="true" indexed="true"/>
+ <field name="id" type="string" stored="true" indexed="true"
+ required="true"/>
<!-- core fields -->
<field name="segment" type="string" stored="true" indexed="false"/>
@@ -75,8 +76,7 @@
<!-- fields for index-basic plugin -->
<field name="host" type="string" stored="false" indexed="true"/>
- <field name="url" type="url" stored="true" indexed="true"
- required="true"/>
+ <field name="url" type="url" stored="true" indexed="true"/>
<field name="content" type="text" stored="false" indexed="true"/>
<field name="title" type="text" stored="true" indexed="true"/>
<field name="cache" type="string" stored="true" indexed="false"/>
Modified: nutch/trunk/conf/solrindex-mapping.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/solrindex-mapping.xml?rev=1614375&r1=1614374&r2=1614375&view=diff
==============================================================================
--- nutch/trunk/conf/solrindex-mapping.xml (original)
+++ nutch/trunk/conf/solrindex-mapping.xml Tue Jul 29 15:13:20 2014
@@ -38,8 +38,6 @@
<field dest="boost" source="boost"/>
<field dest="digest" source="digest"/>
<field dest="tstamp" source="tstamp"/>
- <field dest="id" source="url"/>
- <copyField source="url" dest="url"/>
</fields>
<uniqueKey>id</uniqueKey>
</mapping>
Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java?rev=1614375&r1=1614374&r2=1614375&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Tue Jul 29 15:13:20 2014
@@ -262,6 +262,8 @@ implements Mapper<Text, Writable, Text,
}
NutchDocument doc = new NutchDocument();
+ doc.add("id", key.toString());
+
final Metadata metadata = parseData.getContentMeta();
// add segment, used to map from merged index back to segment files
Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1614375&r1=1614374&r2=1614375&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Tue Jul 29 15:13:20 2014
@@ -130,6 +130,7 @@ public class IndexingFiltersChecker exte
ParseResult parseResult = new ParseUtil(conf).parse(content);
NutchDocument doc = new NutchDocument();
+ doc.add("id", url);
Text urlText = new Text(url);
Inlinks inlinks = null;
Modified: nutch/trunk/src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/DummyIndexWriter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/DummyIndexWriter.java?rev=1614375&r1=1614374&r2=1614375&view=diff
==============================================================================
--- nutch/trunk/src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/DummyIndexWriter.java (original)
+++ nutch/trunk/src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/DummyIndexWriter.java Tue Jul 29 15:13:20 2014
@@ -53,12 +53,12 @@ public class DummyIndexWriter implements
@Override
public void update(NutchDocument doc) throws IOException {
- writer.write("update\t" + doc.getFieldValue("url") + "\n");
+ writer.write("update\t" + doc.getFieldValue("id") + "\n");
}
@Override
public void write(NutchDocument doc) throws IOException {
- writer.write("add\t" + doc.getFieldValue("url") + "\n");
+ writer.write("add\t" + doc.getFieldValue("id") + "\n");
}
public void close() throws IOException {
Modified: nutch/trunk/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java?rev=1614375&r1=1614374&r2=1614375&view=diff
==============================================================================
--- nutch/trunk/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java (original)
+++ nutch/trunk/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java Tue Jul 29 15:13:20 2014
@@ -123,7 +123,7 @@ public class ElasticIndexWriter implemen
@Override
public void write(NutchDocument doc) throws IOException {
- String id = (String) doc.getFieldValue("url");
+ String id = (String) doc.getFieldValue("id");
String type = doc.getDocumentMeta().get("type");
if (type == null)
type = "doc";