You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2014/01/15 13:01:47 UTC
svn commit: r1558349 [1/2] - in /nutch/branches/2.x: ./ conf/ ivy/ src/bin/
src/java/org/apache/nutch/api/impl/ src/java/org/apache/nutch/indexer/
src/java/org/apache/nutch/indexer/elastic/
src/java/org/apache/nutch/indexer/solr/ src/plugin/ src/plugin...
Author: lewismc
Date: Wed Jan 15 12:01:45 2014
New Revision: 1558349
URL: http://svn.apache.org/r1558349
Log:
NUTCH-1568 port pluggable indexing architecture to 2.x
Added:
nutch/branches/2.x/src/java/org/apache/nutch/indexer/CleaningJob.java
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexWriter.java
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexWriters.java
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingJob.java
nutch/branches/2.x/src/plugin/indexer-solr/
nutch/branches/2.x/src/plugin/indexer-solr/build.xml
nutch/branches/2.x/src/plugin/indexer-solr/ivy.xml
nutch/branches/2.x/src/plugin/indexer-solr/plugin.xml
nutch/branches/2.x/src/plugin/indexer-solr/src/
nutch/branches/2.x/src/plugin/indexer-solr/src/java/
nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/
nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/
nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/
nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/
nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/
nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java
nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java
nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrMappingReader.java
nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java
Removed:
nutch/branches/2.x/conf/schema-solr4.xml
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexCleanerJob.java
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexerJob.java
nutch/branches/2.x/src/java/org/apache/nutch/indexer/NutchIndexWriter.java
nutch/branches/2.x/src/java/org/apache/nutch/indexer/NutchIndexWriterFactory.java
nutch/branches/2.x/src/java/org/apache/nutch/indexer/elastic/ElasticConstants.java
nutch/branches/2.x/src/java/org/apache/nutch/indexer/elastic/ElasticIndexerJob.java
nutch/branches/2.x/src/java/org/apache/nutch/indexer/elastic/ElasticWriter.java
nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrClean.java
nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrIndexerJob.java
nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrMappingReader.java
nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrWriter.java
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/build.xml
nutch/branches/2.x/conf/log4j.properties
nutch/branches/2.x/conf/nutch-default.xml
nutch/branches/2.x/conf/schema.xml
nutch/branches/2.x/default.properties
nutch/branches/2.x/ivy/ivy.xml
nutch/branches/2.x/pom.xml
nutch/branches/2.x/src/bin/nutch
nutch/branches/2.x/src/java/org/apache/nutch/api/impl/RAMJobManager.java
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexerOutputFormat.java
nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java
nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrUtils.java
nutch/branches/2.x/src/plugin/build.xml
nutch/branches/2.x/src/plugin/nutch-extensionpoints/plugin.xml
Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1558349&r1=1558348&r2=1558349&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Wed Jan 15 12:01:45 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Current Development
+* NUTCH-1568 port pluggable indexing architecture to 2.x (Talat UYARER via lewismc)
+
* NUTCH-1672 Inlinks are added twice in DbUpdateReducer (Tien Nguyen Manh via lewismc)
* NUTCH-1667 Updatedb always ignore batchId (Tien Nguyen Manh via lewismc)
Modified: nutch/branches/2.x/build.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/build.xml?rev=1558349&r1=1558348&r2=1558349&view=diff
==============================================================================
--- nutch/branches/2.x/build.xml (original)
+++ nutch/branches/2.x/build.xml Wed Jan 15 12:01:45 2014
@@ -154,6 +154,7 @@
<packageset dir="${plugins.dir}/index-anchor/src/java"/>
<packageset dir="${plugins.dir}/index-basic/src/java"/>
<packageset dir="${plugins.dir}/index-more/src/java"/>
+ <packageset dir="${plugins.dir}/indexer-solr/src/java"/>
<packageset dir="${plugins.dir}/language-identifier/src/java"/>
<packageset dir="${plugins.dir}/lib-http/src/java"/>
<packageset dir="${plugins.dir}/lib-regex-filter/src/java"/>
@@ -201,6 +202,7 @@
<group title="Scoring Plugins" packages="${plugins.scoring}" />
<group title="Parse Plugins" packages="${plugins.parse}" />
<group title="Indexing Filter Plugins" packages="${plugins.index}" />
+ <group title="Indexer Plugins" packages="${plugins.indexer}"/>
<group title="Misc. Plugins" packages="${plugins.misc}" />
</javadoc>
<jar jarfile="${maven-javadoc-jar}">
@@ -566,6 +568,7 @@
<packageset dir="${plugins.dir}/index-anchor/src/java"/>
<packageset dir="${plugins.dir}/index-basic/src/java"/>
<packageset dir="${plugins.dir}/index-more/src/java"/>
+ <packageset dir="${plugins.dir}/indexer-solr/src/java"/>
<packageset dir="${plugins.dir}/language-identifier/src/java"/>
<packageset dir="${plugins.dir}/lib-http/src/java"/>
<packageset dir="${plugins.dir}/lib-regex-filter/src/java"/>
@@ -613,6 +616,7 @@
<group title="Scoring Plugins" packages="${plugins.scoring}" />
<group title="Parse Plugins" packages="${plugins.parse}" />
<group title="Indexing Filter Plugins" packages="${plugins.index}" />
+ <group title="Indexer Plugins" packages="${plugins.indexer}"/>
<group title="Misc. Plugins" packages="${plugins.misc}" />
</javadoc>
<!-- Copy the plugin.dtd file to the plugin doc-files dir -->
@@ -928,7 +932,8 @@
<!-- feed is currently disabled
<source path="${basedir}/src/plugin/feed/src/java/" />
<source path="${basedir}/src/plugin/feed/src/test/" /> -->
- <source path="${basedir}/src/plugin/index-anchor/src/java/" />
+ <source path="${basedir}/src/plugin/indexer-solr/src/java/" />
+ <source path="${basedir}/src/plugin/index-anchor/src/java/" />
<source path="${basedir}/src/plugin/index-anchor/src/test/" />
<source path="${basedir}/src/plugin/index-basic/src/java/" />
<source path="${basedir}/src/plugin/index-basic/src/test/" />
Modified: nutch/branches/2.x/conf/log4j.properties
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/log4j.properties?rev=1558349&r1=1558348&r2=1558349&view=diff
==============================================================================
--- nutch/branches/2.x/conf/log4j.properties (original)
+++ nutch/branches/2.x/conf/log4j.properties Wed Jan 15 12:01:45 2014
@@ -32,12 +32,9 @@ log4j.logger.org.apache.nutch.crawl.DbUp
log4j.logger.org.apache.nutch.host.HostDbUpdateJob=INFO,cmdstdout
log4j.logger.org.apache.nutch.fetcher.FetcherJob=INFO,cmdstdout
log4j.logger.org.apache.nutch.parse.ParserJob=INFO,cmdstdout
-log4j.logger.org.apache.nutch.indexer.IndexerJob=INFO,cmdstdout
-log4j.logger.org.apache.nutch.indexer.solr.SolrIndexerJob=INFO,cmdstdout
-log4j.logger.org.apache.nutch.indexer.solr.SolrWriter=INFO,cmdstdout
+log4j.logger.org.apache.nutch.indexer.IndexingJob=INFO,cmdstdout
log4j.logger.org.apache.nutch.indexer.DeleteDuplicates=INFO,cmdstdout
-log4j.logger.org.apache.nutch.indexer.solr.SolrClean=INFO,cmdstdout
-log4j.logger.org.apache.nutch.indexer.IndexCleanerJob=INFO,cmdstdout
+log4j.logger.org.apache.nutch.indexer.CleaningJob=INFO,cmdstdout
log4j.logger.org.apache.nutch.crawl.WebTableReader=INFO,cmdstdout
log4j.logger.org.apache.nutch.host.HostDbReader=INFO,cmdstdout
log4j.logger.org.apache.nutch.parse.ParserChecker=INFO,cmdstdout
Modified: nutch/branches/2.x/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/nutch-default.xml?rev=1558349&r1=1558348&r2=1558349&view=diff
==============================================================================
--- nutch/branches/2.x/conf/nutch-default.xml (original)
+++ nutch/branches/2.x/conf/nutch-default.xml Wed Jan 15 12:01:45 2014
@@ -859,7 +859,7 @@
<property>
<name>plugin.includes</name>
- <value>protocol-http|urlfilter-regex|parse-(html|tika)|index-(basic|anchor)|urlnormalizer-(pass|regex|basic)|scoring-opic</value>
+ <value>protocol-http|urlfilter-regex|parse-(html|tika)|index-(basic|anchor)|indexer-solr|urlnormalizer-(pass|regex|basic)|scoring-opic</value>
<description>Regular expression naming plugin directory names to
include. Any plugin not matching this expression is excluded.
In any case you need at least include the nutch-extensionpoints plugin. By
Modified: nutch/branches/2.x/conf/schema.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/schema.xml?rev=1558349&r1=1558348&r2=1558349&view=diff
==============================================================================
--- nutch/branches/2.x/conf/schema.xml (original)
+++ nutch/branches/2.x/conf/schema.xml Wed Jan 15 12:01:45 2014
@@ -1,124 +1,373 @@
<?xml version="1.0" encoding="UTF-8" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!--
+ Description: This document contains Solr 4.x schema definition to
+ be used with Solr integration currently built into Nutch.
+ This schema is not minimal, there are some useful field type definitions left,
+ and the set of fields and their flags (indexed/stored/term vectors) can be
+ further optimized depending on needs. See
+ http://svn.apache.org/viewvc/lucene/dev/trunk/solr/example/solr/conf/schema.xml?view=markup
+ for more info.
+-->
+
+<schema name="nutch" version="1.5">
+
+ <types>
+
+ <!-- The StrField type is not analyzed, but indexed/stored verbatim. -->
+ <fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
+
+
<!--
- Licensed to the Apache Software Foundation (ASF) under one or
- more contributor license agreements. See the NOTICE file
- distributed with this work for additional information regarding
- copyright ownership. The ASF licenses this file to You under the
- Apache License, Version 2.0 (the "License"); you may not use
- this file except in compliance with the License. You may obtain
- a copy of the License at
- http://www.apache.org/licenses/LICENSE-2.0 Unless required by
- applicable law or agreed to in writing, software distributed
- under the License is distributed on an "AS IS" BASIS, WITHOUT
- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions
- and limitations under the License.
+ Default numeric field types. For faster range queries, consider the tint/tfloat/tlong/tdouble types.
-->
+ <fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="long" class="solr.TrieLongField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+
<!--
- Description: This document contains Solr 3.1 schema definition to
- be used with Solr integration currently build into Nutch. See
- https://issues.apache.org/jira/browse/NUTCH-442
- https://issues.apache.org/jira/browse/NUTCH-699
- https://issues.apache.org/jira/browse/NUTCH-994
- https://issues.apache.org/jira/browse/NUTCH-997
- https://issues.apache.org/jira/browse/NUTCH-1058
- https://issues.apache.org/jira/browse/NUTCH-1394
- and
- http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/
- example/solr/conf/schema.xml?view=markup
- for more info.
+ Numeric field types that index each value at various levels of precision
+ to accelerate range queries when the number of values between the range
+ endpoints is large. See the javadoc for NumericRangeQuery for internal
+ implementation details.
+
+ Smaller precisionStep values (specified in bits) will lead to more tokens
+ indexed per value, slightly larger index size, and faster range queries.
+ A precisionStep of 0 disables indexing at different precision levels.
-->
-<schema name="nutch" version="1.5">
- <types>
- <fieldType name="string" class="solr.StrField" sortMissingLast="true"
- omitNorms="true"/>
- <fieldType name="long" class="solr.TrieLongField" precisionStep="0"
- omitNorms="true" positionIncrementGap="0"/>
- <fieldType name="float" class="solr.TrieFloatField" precisionStep="0"
- omitNorms="true" positionIncrementGap="0"/>
- <fieldType name="date" class="solr.TrieDateField" precisionStep="0"
- omitNorms="true" positionIncrementGap="0"/>
-
- <fieldType name="text" class="solr.TextField"
- positionIncrementGap="100">
- <analyzer>
- <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- <filter class="solr.StopFilterFactory"
- ignoreCase="true" words="stopwords.txt"/>
- <filter class="solr.WordDelimiterFilterFactory"
- generateWordParts="1" generateNumberParts="1"
- catenateWords="1" catenateNumbers="1" catenateAll="0"
- splitOnCaseChange="1"/>
- <filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.EnglishPorterFilterFactory"
- protected="protwords.txt"/>
- <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
- </analyzer>
- </fieldType>
- <fieldType name="url" class="solr.TextField"
- positionIncrementGap="100">
- <analyzer>
- <tokenizer class="solr.StandardTokenizerFactory"/>
- <filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.WordDelimiterFilterFactory"
- generateWordParts="1" generateNumberParts="1"/>
- </analyzer>
- </fieldType>
- </types>
- <fields>
- <field name="id" type="string" stored="true" indexed="true"/>
-
- <!-- core fields -->
- <field name="batchId" type="string" stored="true" indexed="false"/>
- <field name="digest" type="string" stored="true" indexed="false"/>
- <field name="boost" type="float" stored="true" indexed="false"/>
-
- <!-- fields for index-basic plugin -->
- <field name="host" type="url" stored="false" indexed="true"/>
- <field name="url" type="url" stored="true" indexed="true"
- required="true"/>
- <field name="content" type="text" stored="false" indexed="true"/>
- <field name="title" type="text" stored="true" indexed="true"/>
- <field name="cache" type="string" stored="true" indexed="false"/>
- <field name="tstamp" type="date" stored="true" indexed="false"/>
-
- <!-- fields for index-anchor plugin -->
- <field name="anchor" type="string" stored="true" indexed="true"
- multiValued="true"/>
-
- <!-- fields for index-more plugin -->
- <field name="type" type="string" stored="true" indexed="true"
- multiValued="true"/>
- <field name="contentLength" type="long" stored="true"
- indexed="false"/>
- <field name="lastModified" type="date" stored="true"
- indexed="false"/>
- <field name="date" type="date" stored="true" indexed="true"/>
-
- <!-- fields for languageidentifier plugin -->
- <field name="lang" type="string" stored="true" indexed="true"/>
-
- <!-- fields for subcollection plugin -->
- <field name="subcollection" type="string" stored="true"
- indexed="true" multiValued="true"/>
-
- <!-- fields for feed plugin (tag is also used by microformats-reltag)-->
- <field name="author" type="string" stored="true" indexed="true"/>
- <field name="tag" type="string" stored="true" indexed="true" multiValued="true"/>
- <field name="feed" type="string" stored="true" indexed="true"/>
- <field name="publishedDate" type="date" stored="true"
- indexed="true"/>
- <field name="updatedDate" type="date" stored="true"
- indexed="true"/>
-
- <!-- fields for creativecommons plugin -->
- <field name="cc" type="string" stored="true" indexed="true"
- multiValued="true"/>
-
- <!-- fields for tld plugin -->
- <field name="tld" type="string" stored="false" indexed="false"/>
- </fields>
- <uniqueKey>id</uniqueKey>
- <defaultSearchField>content</defaultSearchField>
- <solrQueryParser defaultOperator="OR"/>
+ <fieldType name="tint" class="solr.TrieIntField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="tlong" class="solr.TrieLongField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+
+ <!-- The format for this date field is of the form 1995-12-31T23:59:59Z, and
+ is a more restricted form of the canonical representation of dateTime
+ http://www.w3.org/TR/xmlschema-2/#dateTime
+ The trailing "Z" designates UTC time and is mandatory.
+ Optional fractional seconds are allowed: 1995-12-31T23:59:59.999Z
+ All other components are mandatory.
+
+ Expressions can also be used to denote calculations that should be
+ performed relative to "NOW" to determine the value, e.g.
+
+ NOW/HOUR
+ ... Round to the start of the current hour
+ NOW-1DAY
+ ... Exactly 1 day prior to now
+ NOW/DAY+6MONTHS+3DAYS
+ ... 6 months and 3 days in the future from the start of
+ the current day
+
+ Consult the DateField javadocs for more information.
+
+ Note: For faster range queries, consider the tdate type
+ -->
+ <fieldType name="date" class="solr.TrieDateField" omitNorms="true" precisionStep="0" positionIncrementGap="0"/>
+
+ <!-- A Trie based date field for faster date range queries and date faceting. -->
+ <fieldType name="tdate" class="solr.TrieDateField" omitNorms="true" precisionStep="6" positionIncrementGap="0"/>
+
+
+ <!-- solr.TextField allows the specification of custom text analyzers
+ specified as a tokenizer and a list of token filters. Different
+ analyzers may be specified for indexing and querying.
+
+ The optional positionIncrementGap puts space between multiple fields of
+ this type on the same document, with the purpose of preventing false phrase
+ matching across fields.
+
+ For more info on customizing your analyzer chain, please see
+ http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters
+ -->
+
+ <!-- A general text field that has reasonable, generic
+ cross-language defaults: it tokenizes with StandardTokenizer,
+ removes stop words from case-insensitive "stopwords.txt"
+ (empty by default), and down cases. At query time only, it
+ also applies synonyms. -->
+ <fieldType name="text_general" class="solr.TextField" positionIncrementGap="100">
+ <analyzer type="index">
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
+ <!-- in this example, we will only use synonyms at query time
+ <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
+ -->
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- A text field with defaults appropriate for English: it
+ tokenizes with StandardTokenizer, removes English stop words
+ (stopwords.txt), down cases, protects words from protwords.txt, and
+ finally applies Porter's stemming. The query time analyzer
+ also applies synonyms from synonyms.txt. -->
+ <fieldType name="text_en" class="solr.TextField" positionIncrementGap="100">
+ <analyzer type="index">
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <!-- in this example, we will only use synonyms at query time
+ <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
+ -->
+ <!-- Case insensitive stop word removal.
+ add enablePositionIncrements=true in both the index and query
+ analyzers to leave a 'gap' for more accurate phrase queries.
+ -->
+ <filter class="solr.StopFilterFactory"
+ ignoreCase="true"
+ words="stopwords.txt"
+ enablePositionIncrements="true"
+ />
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.EnglishPossessiveFilterFactory"/>
+ <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+ <!-- Optionally you may want to use this less aggressive stemmer instead of PorterStemFilterFactory:
+ <filter class="solr.EnglishMinimalStemFilterFactory"/>
+ -->
+ <filter class="solr.PorterStemFilterFactory"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+ <filter class="solr.StopFilterFactory"
+ ignoreCase="true"
+ words="stopwords.txt"
+ enablePositionIncrements="true"
+ />
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.EnglishPossessiveFilterFactory"/>
+ <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+ <!-- Optionally you may want to use this less aggressive stemmer instead of PorterStemFilterFactory:
+ <filter class="solr.EnglishMinimalStemFilterFactory"/>
+ -->
+ <filter class="solr.PorterStemFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- A text field with defaults appropriate for English, plus
+ aggressive word-splitting and autophrase features enabled.
+ This field is just like text_en, except it adds
+ WordDelimiterFilter to enable splitting and matching of
+ words on case-change, alpha numeric boundaries, and
+ non-alphanumeric chars. This means certain compound word
+ cases will work, for example query "wi fi" will match
+ document "WiFi" or "wi-fi". However, other cases will still
+ not match, for example if the query is "wifi" and the
+ document is "wi fi" or if the query is "wi-fi" and the
+ document is "wifi".
+ -->
+ <fieldType name="text_en_splitting" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
+ <analyzer type="index">
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <!-- in this example, we will only use synonyms at query time
+ <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
+ -->
+ <!-- Case insensitive stop word removal.
+ add enablePositionIncrements=true in both the index and query
+ analyzers to leave a 'gap' for more accurate phrase queries.
+ -->
+ <filter class="solr.StopFilterFactory"
+ ignoreCase="true"
+ words="stopwords.txt"
+ enablePositionIncrements="true"
+ />
+ <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+ <filter class="solr.PorterStemFilterFactory"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+ <filter class="solr.StopFilterFactory"
+ ignoreCase="true"
+ words="stopwords.txt"
+ enablePositionIncrements="true"
+ />
+ <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+ <filter class="solr.PorterStemFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- Less flexible matching, but fewer false matches. Probably not ideal for product names,
+ but may be good for SKUs. Can insert dashes in the wrong place and still match. -->
+ <fieldType name="text_en_splitting_tight" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
+ <analyzer>
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
+ <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+ <filter class="solr.EnglishMinimalStemFilterFactory"/>
+ <!-- this filter can remove any duplicate tokens that appear at the same position - sometimes
+ possible with WordDelimiterFilter in conjunction with stemming. -->
+ <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- Just like text_general except it reverses the characters of
+ each token, to enable more efficient leading wildcard queries. -->
+ <fieldType name="text_general_rev" class="solr.TextField" positionIncrementGap="100">
+ <analyzer type="index">
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.ReversedWildcardFilterFactory" withOriginal="true"
+ maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <fieldtype name="phonetic" stored="false" indexed="true" class="solr.TextField" >
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/>
+ </analyzer>
+ </fieldtype>
+
+ <fieldtype name="payloads" stored="false" indexed="true" class="solr.TextField" >
+ <analyzer>
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <!--
+ The DelimitedPayloadTokenFilter can put payloads on tokens... for example,
+ a token of "foo|1.4" would be indexed as "foo" with a payload of 1.4f
+ Attributes of the DelimitedPayloadTokenFilterFactory :
+ "delimiter" - a one character delimiter. Default is | (pipe)
+ "encoder" - how to encode the following value into a payload
+ float -> org.apache.lucene.analysis.payloads.FloatEncoder,
+ integer -> o.a.l.a.p.IntegerEncoder
+ identity -> o.a.l.a.p.IdentityEncoder
+ Fully Qualified class name implementing PayloadEncoder, Encoder must have a no arg constructor.
+ -->
+ <filter class="solr.DelimitedPayloadTokenFilterFactory" encoder="float"/>
+ </analyzer>
+ </fieldtype>
+
+ <!-- lowercases the entire field value, keeping it as a single token. -->
+ <fieldType name="lowercase" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.KeywordTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory" />
+ </analyzer>
+ </fieldType>
+
+ <fieldType name="url" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1"/>
+ </analyzer>
+ </fieldType>
+
+
+ <fieldType name="text_path" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.PathHierarchyTokenizerFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- since fields of this type are by default not stored or indexed,
+ any data added to them will be ignored outright. -->
+ <fieldtype name="ignored" stored="false" indexed="false" multiValued="true" class="solr.StrField" />
+
+ </types>
+
+ <fields>
+ <!-- This field is used internally by Solr, for example by features
+ like partial update functionality and update log. It is NOT required
+ if updateLog is turned off in your updateHandler; however, it is advised
+ to include it, as the performance gain from omitting it is minimal. -->
+ <field name="_version_" type="long" indexed="true" stored="true"/>
+
+ <field name="id" type="string" stored="true" indexed="true"/>
+
+ <!-- core fields -->
+ <field name="batchId" type="string" stored="true" indexed="false"/>
+ <field name="digest" type="string" stored="true" indexed="false"/>
+ <field name="boost" type="float" stored="true" indexed="false"/>
+
+ <!-- fields for index-basic plugin -->
+ <field name="host" type="url" stored="false" indexed="true"/>
+ <field name="url" type="url" stored="true" indexed="true" required="true"/>
+ <field name="orig" type="url" stored="true" indexed="true" />
+ <!-- stored=true for highlighting, use term vectors and positions for fast highlighting -->
+ <field name="content" type="text_general" stored="true" indexed="true"/>
+ <field name="title" type="text_general" stored="true" indexed="true"/>
+ <field name="cache" type="string" stored="true" indexed="false"/>
+ <field name="tstamp" type="date" stored="true" indexed="false"/>
+
+ <!-- catch-all field -->
+ <field name="text" type="text_general" stored="false" indexed="true" multiValued="true"/>
+
+ <!-- fields for index-anchor plugin -->
+ <field name="anchor" type="text_general" stored="true" indexed="true"
+ multiValued="true"/>
+
+ <!-- fields for index-more plugin -->
+ <field name="type" type="string" stored="true" indexed="true" multiValued="true"/>
+ <field name="contentLength" type="string" stored="true" indexed="false"/>
+ <field name="lastModified" type="date" stored="true" indexed="false"/>
+ <field name="date" type="tdate" stored="true" indexed="true"/>
+
+ <!-- fields for languageidentifier plugin -->
+ <field name="lang" type="string" stored="true" indexed="true"/>
+
+ <!-- fields for subcollection plugin -->
+ <field name="subcollection" type="string" stored="true" indexed="true" multiValued="true"/>
+
+ <!-- fields for feed plugin (tag is also used by microformats-reltag)-->
+ <field name="author" type="string" stored="true" indexed="true"/>
+ <field name="tag" type="string" stored="true" indexed="true" multiValued="true"/>
+ <field name="feed" type="string" stored="true" indexed="true"/>
+ <field name="publishedDate" type="date" stored="true" indexed="true"/>
+ <field name="updatedDate" type="date" stored="true" indexed="true"/>
+
+ <!-- fields for creativecommons plugin -->
+ <field name="cc" type="string" stored="true" indexed="true" multiValued="true"/>
+
+ <!-- fields for tld plugin -->
+ <field name="tld" type="string" stored="false" indexed="false"/>
+ </fields>
+ <uniqueKey>id</uniqueKey>
+ <defaultSearchField>text</defaultSearchField>
+ <solrQueryParser defaultOperator="OR"/>
+
+ <!-- copyField commands copy one field to another at the time a document
+ is added to the index. It's used either to index the same field differently,
+ or to add multiple fields to the same field for easier/faster searching. -->
+
+ <copyField source="content" dest="text"/>
+ <copyField source="url" dest="text"/>
+ <copyField source="title" dest="text"/>
+ <copyField source="anchor" dest="text"/>
+ <copyField source="author" dest="text"/>
+
</schema>
Modified: nutch/branches/2.x/default.properties
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/default.properties?rev=1558349&r1=1558348&r2=1558349&view=diff
==============================================================================
--- nutch/branches/2.x/default.properties (original)
+++ nutch/branches/2.x/default.properties Wed Jan 15 12:01:45 2014
@@ -142,6 +142,11 @@ plugins.index=\
org.apache.nutch.indexer.subcollection*:\
org.apache.nutch.indexer.tld*
+# Indexing Backend Plugins
+#
+plugins.indexer=\
+ org.apache.nutch.indexwriter.solr*
+
#
# Misc. Plugins
#
Modified: nutch/branches/2.x/ivy/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/ivy/ivy.xml?rev=1558349&r1=1558348&r2=1558349&view=diff
==============================================================================
--- nutch/branches/2.x/ivy/ivy.xml (original)
+++ nutch/branches/2.x/ivy/ivy.xml Wed Jan 15 12:01:45 2014
@@ -35,7 +35,7 @@
<dependency org="org.elasticsearch" name="elasticsearch" rev="0.19.4"
conf="*->default"/>
- <dependency org="org.apache.solr" name="solr-solrj" rev="3.4.0"
+ <dependency org="org.apache.solr" name="solr-solrj" rev="4.6.0"
conf="*->default" />
<dependency org="org.slf4j" name="slf4j-log4j12" rev="1.6.1"
conf="*->master" />
@@ -44,8 +44,8 @@
conf="*->default" />
<dependency org="commons-collections" name="commons-collections"
rev="3.1" conf="*->default" />
- <dependency org="commons-httpclient" name="commons-httpclient"
- rev="3.1" conf="*->master" />
+ <dependency org="org.apache.httpcomponents" name="httpclient"
+ rev="4.2.5" conf="*->master" />
<dependency org="commons-codec" name="commons-codec" rev="1.3"
conf="*->default" />
@@ -116,7 +116,7 @@
<dependency org="mysql" name="mysql-connector-java" rev="5.1.18" conf="*->default"/>
-->
<!-- Uncomment this to use HBase as Gora backend. -->
- <!--
+ <!--
<dependency org="org.apache.gora" name="gora-hbase" rev="0.3" conf="*->default" />
-->
<!-- Uncomment this to use Accumulo as Gora backend. -->
@@ -124,9 +124,9 @@
<dependency org="org.apache.gora" name="gora-accumulo" rev="0.3" conf="*->default" />
-->
<!-- Uncomment this to use Cassandra as Gora backend. -->
- <!--
+
<dependency org="org.apache.gora" name="gora-cassandra" rev="0.3" conf="*->default" />
- -->
+
<!--global exclusion -->
<exclude module="ant" />
Modified: nutch/branches/2.x/pom.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/pom.xml?rev=1558349&r1=1558348&r2=1558349&view=diff
==============================================================================
--- nutch/branches/2.x/pom.xml (original)
+++ nutch/branches/2.x/pom.xml Wed Jan 15 12:01:45 2014
@@ -117,7 +117,7 @@
<dependency>
<groupId>org.apache.solr</groupId>
<artifactId>solr-solrj</artifactId>
- <version>3.4.0</version>
+ <version>4.6.0</version>
<optional>true</optional>
</dependency>
<dependency>
@@ -139,9 +139,9 @@
<optional>true</optional>
</dependency>
<dependency>
- <groupId>commons-httpclient</groupId>
- <artifactId>commons-httpclient</artifactId>
- <version>3.1</version>
+ <groupId>org.apache.httpcomponents</groupId>
+ <artifactId>httpclient</artifactId>
+ <version>4.2.5</version>
<optional>true</optional>
</dependency>
<dependency>
Modified: nutch/branches/2.x/src/bin/nutch
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/bin/nutch?rev=1558349&r1=1558348&r2=1558349&view=diff
==============================================================================
--- nutch/branches/2.x/src/bin/nutch (original)
+++ nutch/branches/2.x/src/bin/nutch Wed Jan 15 12:01:45 2014
@@ -56,10 +56,12 @@ if [ $# = 0 ]; then
echo " updatehostdb update host table after parsing"
echo " readdb read/dump records from page database"
echo " readhostdb display entries from the hostDB"
- echo " elasticindex run the elasticsearch indexer"
- echo " solrindex run the solr indexer on parsed batches"
+ echo " index run the plugin-based indexer on parsed batches"
+ echo " elasticindex run the elasticsearch indexer - DEPRECATED use the index command instead"
+ echo " solrindex run the solr indexer on parsed batches - DEPRECATED use the index command instead"
echo " solrdedup remove duplicates from solr"
- echo " solrclean configurable extension to remove various documents from solr"
+ echo " solrclean remove HTTP 301 and 404 documents from solr - DEPRECATED use the clean command instead"
+ echo " clean remove HTTP 301 and 404 documents and duplicates from indexing backends configured via plugins"
echo " parsechecker check the parser for a given url"
echo " indexchecker check the indexing filters for a given url"
echo " plugin load a plugin and run one of its classes main()"
@@ -207,11 +209,17 @@ CLASS=org.apache.nutch.host.HostDbReader
elif [ "$COMMAND" = "elasticindex" ] ; then
CLASS=org.apache.nutch.indexer.elastic.ElasticIndexerJob
elif [ "$COMMAND" = "solrindex" ] ; then
-CLASS=org.apache.nutch.indexer.solr.SolrIndexerJob
+CLASS="org.apache.nutch.indexer.IndexingJob -D solr.server.url=$1"
+shift
+elif [ "$COMMAND" = "index" ] ; then
+CLASS=org.apache.nutch.indexer.IndexingJob
elif [ "$COMMAND" = "solrdedup" ] ; then
CLASS=org.apache.nutch.indexer.solr.SolrDeleteDuplicates
elif [ "$COMMAND" = "solrclean" ] ; then
-CLASS=org.apache.nutch.indexer.solr.SolrClean
+ CLASS="org.apache.nutch.indexer.CleaningJob -D solr.server.url=$2 $1"
+ shift; shift
+elif [ "$COMMAND" = "clean" ] ; then
+ CLASS=org.apache.nutch.indexer.CleaningJob
elif [ "$COMMAND" = "parsechecker" ] ; then
CLASS=org.apache.nutch.parse.ParserChecker
elif [ "$COMMAND" = "indexchecker" ] ; then
Modified: nutch/branches/2.x/src/java/org/apache/nutch/api/impl/RAMJobManager.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/api/impl/RAMJobManager.java?rev=1558349&r1=1558348&r2=1558349&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/api/impl/RAMJobManager.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/api/impl/RAMJobManager.java Wed Jan 15 12:01:45 2014
@@ -38,7 +38,7 @@ import org.apache.nutch.crawl.GeneratorJ
import org.apache.nutch.crawl.InjectorJob;
import org.apache.nutch.crawl.WebTableReader;
import org.apache.nutch.fetcher.FetcherJob;
-import org.apache.nutch.indexer.solr.SolrIndexerJob;
+import org.apache.nutch.indexer.IndexingJob;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.parse.ParserJob;
import org.apache.nutch.util.NutchTool;
@@ -88,7 +88,7 @@ public class RAMJobManager implements Jo
static {
typeToClass.put(JobType.FETCH, FetcherJob.class);
typeToClass.put(JobType.GENERATE, GeneratorJob.class);
- typeToClass.put(JobType.INDEX, SolrIndexerJob.class);
+ typeToClass.put(JobType.INDEX, IndexingJob.class);
typeToClass.put(JobType.INJECT, InjectorJob.class);
typeToClass.put(JobType.PARSE, ParserJob.class);
typeToClass.put(JobType.UPDATEDB, DbUpdaterJob.class);
Added: nutch/branches/2.x/src/java/org/apache/nutch/indexer/CleaningJob.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/CleaningJob.java?rev=1558349&view=auto
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/indexer/CleaningJob.java (added)
+++ nutch/branches/2.x/src/java/org/apache/nutch/indexer/CleaningJob.java Wed Jan 15 12:01:45 2014
@@ -0,0 +1,185 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Map;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.gora.mapreduce.GoraMapper;
+import org.apache.gora.mapreduce.StringComparator;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.RawComparator;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.hadoop.util.Tool;
+import org.apache.nutch.crawl.CrawlStatus;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.storage.StorageUtils;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.NutchTool;
+import org.apache.nutch.util.ToolUtil;
+
+public class CleaningJob extends NutchTool implements Tool {
+
+ public static final String ARG_COMMIT = "commit";
+ public static final Logger LOG = LoggerFactory
+ .getLogger(CleaningJob.class);
+ private Configuration conf;
+
+ private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
+
+ static {
+ FIELDS.add(WebPage.Field.STATUS);
+ }
+
+ @Override
+ public Configuration getConf() {
+ return conf;
+ }
+
+ @Override
+ public void setConf(Configuration conf) {
+ this.conf = conf;
+ }
+
+ /**
+  * Collects the WebPage fields this job has to load: the static FIELDS set
+  * plus whatever the configured {@link IndexCleaningFilters} ask for.
+  *
+  * @param job job whose configuration is used to instantiate the filters
+  * @return a fresh, mutable set of fields
+  */
+ public Collection<WebPage.Field> getFields(Job job) {
+   Collection<WebPage.Field> wanted = new HashSet<WebPage.Field>(FIELDS);
+   IndexCleaningFilters cleaningFilters = new IndexCleaningFilters(
+       job.getConfiguration());
+   wanted.addAll(cleaningFilters.getFields());
+   return wanted;
+ }
+
+ /**
+  * Mapper that selects the pages to remove from the index: a page is emitted
+  * when its status is {@link CrawlStatus#STATUS_GONE} or when the configured
+  * {@link IndexCleaningFilters} vote for removal.
+  */
+ public static class CleanMapper extends
+     GoraMapper<String, WebPage, String, WebPage> {
+
+   private IndexCleaningFilters filters;
+
+   @Override
+   protected void setup(Context context) throws IOException {
+     filters = new IndexCleaningFilters(context.getConfiguration());
+   }
+
+   @Override
+   public void map(String key, WebPage page, Context context)
+       throws IOException, InterruptedException {
+     boolean shouldDelete;
+     try {
+       // The filters are only consulted when the page is not already gone.
+       shouldDelete = page.getStatus() == CrawlStatus.STATUS_GONE
+           || filters.remove(key, page);
+     } catch (IndexingException e) {
+       // A failing filter skips this page; the job carries on.
+       LOG.warn("Error indexing " + key + ": " + e);
+       return;
+     }
+     if (shouldDelete) {
+       context.write(key, page);
+     }
+   }
+ }
+
+ /**
+  * Reducer that turns every key emitted by {@link CleanMapper} into a delete
+  * request against all active index writers and, unless committing was
+  * disabled, commits once at the end of the task.
+  */
+ public static class CleanReducer extends
+     Reducer<String, WebPage, NullWritable, NullWritable> {
+   /** Number of delete requests issued by this task. */
+   private int numDeletes = 0;
+   private static final int NUM_MAX_DELETE_REQUEST = 1000;
+   /** Value of {@link CleaningJob#ARG_COMMIT}: true when deletes must be committed. */
+   private boolean commit;
+   IndexWriters writers = null;
+
+   /**
+    * Instantiates and opens the index writers and reads the commit flag
+    * (false when the key is absent from the configuration).
+    */
+   @Override
+   public void setup(Context job) throws IOException {
+     Configuration conf = job.getConfiguration();
+     writers = new IndexWriters(conf);
+     try {
+       writers.open(conf);
+     } catch (IOException e) {
+       throw new RuntimeException(e);
+     }
+     commit = conf.getBoolean(ARG_COMMIT, false);
+   }
+
+   /**
+    * Requests deletion of {@code key} from every writer; the values are not
+    * inspected.
+    */
+   @Override
+   public void reduce(String key, Iterable<WebPage> values, Context context)
+       throws IOException {
+     writers.delete(key);
+     numDeletes++;
+     context.getCounter("SolrClean", "DELETED").increment(1);
+   }
+
+   /**
+    * Closes the writers and commits the deletions when committing is enabled.
+    */
+   @Override
+   public void cleanup(Context context) throws IOException {
+     // Order kept from the original. NOTE(review): presumably close() flushes
+     // buffered delete requests so the commit below sees them - confirm
+     // against the writer plugins before reordering.
+     writers.close();
+     // The flag is true when the caller wants a commit; the previous
+     // "!commit" test committed only when -noCommit had been requested.
+     if (numDeletes > 0 && commit) {
+       writers.commit();
+     }
+     LOG.info("CleaningJob: deleted a total of " + numDeletes + " documents");
+   }
+ }
+
+
+ /**
+  * Configures and runs the cleaning MapReduce job, blocking until it finishes.
+  *
+  * @param args argument map; the {@link #ARG_COMMIT} entry is cast to Boolean
+  *          and copied into the job configuration
+  * @return the {@code results} map after the job status has been recorded
+  */
+ @Override
+ public Map<String, Object> run(Map<String, Object> args) throws Exception {
+ // NOTE(review): a missing ARG_COMMIT entry yields null here, which would
+ // fail on unboxing - confirm all callers always supply it.
+ getConf().setBoolean(ARG_COMMIT, (Boolean) args.get(ARG_COMMIT));
+ currentJob = new NutchJob(getConf(), "CleaningJob");
+ currentJob.getConfiguration().setClass(
+ "mapred.output.key.comparator.class", StringComparator.class,
+ RawComparator.class);
+
+ // Load only the fields needed by the mapper and the cleaning filters.
+ Collection<WebPage.Field> fields = getFields(currentJob);
+ StorageUtils.initMapperJob(currentJob, fields, String.class, WebPage.class,
+ CleanMapper.class);
+ currentJob.setReducerClass(CleanReducer.class);
+ // The reducer talks to the index writers directly, so the job itself
+ // produces no output.
+ currentJob.setOutputFormatClass(NullOutputFormat.class);
+ currentJob.waitForCompletion(true);
+ ToolUtil.recordJobStatus(null, currentJob, results);
+ return results;
+ }
+
+ /**
+  * Runs the cleaning job.
+  *
+  * @param commit whether deletions should be committed to the index
+  * @return always 0
+  */
+ public int delete(boolean commit) throws Exception {
+   LOG.info("CleaningJob: starting");
+   run(ToolUtil.toArgMap(ARG_COMMIT, commit));
+   LOG.info("CleaningJob: done");
+   return 0;
+ }
+
+ /**
+  * Command-line entry point: {@code [-crawlId <id>] [-noCommit]}.
+  *
+  * Options are recognised wherever they appear. Previously both were only
+  * honoured when exactly three arguments were passed, so "-crawlId <id>" on
+  * its own ran against the default crawl and "-noCommit" on its own was
+  * ignored. Unknown arguments are still ignored, and an empty argument list
+  * still prints the usage message.
+  *
+  * @return 1 when no arguments are given, otherwise the result of
+  *         {@link #delete(boolean)}
+  */
+ public int run(String[] args) throws Exception {
+   if (args.length < 1) {
+     System.err.println("Usage: CleaningJob [-crawlId <id>] [-noCommit]");
+     return 1;
+   }
+
+   boolean commit = true;
+   for (int i = 0; i < args.length; i++) {
+     if ("-noCommit".equals(args[i])) {
+       commit = false;
+     } else if ("-crawlId".equals(args[i]) && i + 1 < args.length) {
+       // Consume the option value so it is not parsed as a flag.
+       getConf().set(Nutch.CRAWL_ID_KEY, args[++i]);
+     }
+   }
+
+   return delete(commit);
+ }
+
+ /** Runs the tool through {@link ToolRunner} and exits with its result. */
+ public static void main(String[] args) throws Exception {
+   int result = ToolRunner.run(NutchConfiguration.create(), new CleaningJob(),
+       args);
+   System.exit(result);
+ }
+
+}
Added: nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexWriter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexWriter.java?rev=1558349&view=auto
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexWriter.java (added)
+++ nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexWriter.java Wed Jan 15 12:01:45 2014
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configurable;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.plugin.Pluggable;
+
+ /**
+  * Extension point implemented by indexing back-end plugins. Instances are
+  * looked up through {@link #X_POINT_ID} and driven via {@link IndexWriters},
+  * which forwards each call to every active implementation.
+  */
+ public interface IndexWriter extends Configurable, Pluggable {
+ /** The name of the extension point. */
+ final static String X_POINT_ID = IndexWriter.class.getName();
+
+ /**
+  * Prepares the writer for use with the given configuration. Callers in this
+  * package invoke it before any write, delete or commit.
+  */
+ public void open(Configuration job) throws IOException;
+
+ /** Sends a document to the back-end. */
+ public void write(NutchDocument doc) throws IOException;
+
+ /** Requests removal of the document identified by {@code key}. */
+ public void delete(String key) throws IOException;
+
+ /**
+  * Updates a document in the back-end.
+  * NOTE(review): how this differs from {@link #write} is left to the
+  * implementation - confirm against the plugins.
+  */
+ public void update(NutchDocument doc) throws IOException;
+
+ /** Commits pending changes to the back-end. */
+ public void commit() throws IOException;
+
+ /**
+  * Closes the writer.
+  * NOTE(review): the jobs in this package call commit() after close(), so
+  * implementations presumably must tolerate that order - confirm.
+  */
+ public void close() throws IOException;
+
+ /** Returns a String describing the IndexWriter instance and the specific parameters it can take */
+ public String describe();
+ }
Added: nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexWriters.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexWriters.java?rev=1558349&view=auto
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexWriters.java (added)
+++ nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexWriters.java Wed Jan 15 12:01:45 2014
@@ -0,0 +1,147 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer;
+
+import java.io.IOException;
+import java.util.HashMap;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.ExtensionPoint;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.plugin.PluginRuntimeException;
+import org.apache.nutch.util.ObjectCache;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+ /**
+  * Instantiates every plugin registered at the {@link IndexWriter} extension
+  * point, caches the instances in the configuration's {@link ObjectCache} and
+  * forwards each operation to all of them.
+  */
+ public class IndexWriters {
+
+   public final static Logger LOG = LoggerFactory
+       .getLogger(IndexWriters.class);
+
+   private IndexWriter[] indexWriters;
+
+   /**
+    * Looks the writers up in the object cache, creating and caching them on
+    * first use. At most one instance per implementation class is kept.
+    *
+    * @throws RuntimeException if the extension point is missing or a plugin
+    *           cannot be instantiated
+    */
+   public IndexWriters(Configuration conf) {
+     final String cacheKey = IndexWriter.class.getName();
+     ObjectCache cache = ObjectCache.get(conf);
+     synchronized (cache) {
+       this.indexWriters = (IndexWriter[]) cache.getObject(cacheKey);
+       if (this.indexWriters != null) {
+         return;
+       }
+       try {
+         ExtensionPoint point = PluginRepository.get(conf)
+             .getExtensionPoint(IndexWriter.X_POINT_ID);
+         if (point == null) {
+           throw new RuntimeException(IndexWriter.X_POINT_ID
+               + " not found.");
+         }
+         HashMap<String, IndexWriter> byClassName = new HashMap<String, IndexWriter>();
+         for (Extension extension : point.getExtensions()) {
+           IndexWriter writer = (IndexWriter) extension
+               .getExtensionInstance();
+           String className = writer.getClass().getName();
+           LOG.info("Adding " + className);
+           if (!byClassName.containsKey(className)) {
+             byClassName.put(className, writer);
+           }
+         }
+         cache.setObject(cacheKey, byClassName.values().toArray(
+             new IndexWriter[0]));
+       } catch (PluginRuntimeException e) {
+         throw new RuntimeException(e);
+       }
+       this.indexWriters = (IndexWriter[]) cache.getObject(cacheKey);
+     }
+   }
+
+   /** Opens every writer; stops at the first failure. */
+   public void open(Configuration conf) throws IOException {
+     for (IndexWriter writer : this.indexWriters) {
+       writer.open(conf);
+     }
+   }
+
+   /** Writes the document through every writer; stops at the first failure. */
+   public void write(NutchDocument doc) throws IOException {
+     for (IndexWriter writer : this.indexWriters) {
+       writer.write(doc);
+     }
+   }
+
+   /** Updates the document through every writer; stops at the first failure. */
+   public void update(NutchDocument doc) throws IOException {
+     for (IndexWriter writer : this.indexWriters) {
+       writer.update(doc);
+     }
+   }
+
+   /** Deletes the key through every writer; stops at the first failure. */
+   public void delete(String key) throws IOException {
+     for (IndexWriter writer : this.indexWriters) {
+       writer.delete(key);
+     }
+   }
+
+   /** Closes every writer; stops at the first failure. */
+   public void close() throws IOException {
+     for (IndexWriter writer : this.indexWriters) {
+       writer.close();
+     }
+   }
+
+   /** Commits every writer; stops at the first failure. */
+   public void commit() throws IOException {
+     for (IndexWriter writer : this.indexWriters) {
+       writer.commit();
+     }
+   }
+
+   /** Lists the active IndexWriters and their configuration. */
+   public String describe() throws IOException {
+     StringBuilder text = new StringBuilder();
+     text.append(this.indexWriters.length == 0
+         ? "No IndexWriters activated - check your configuration\n"
+         : "Active IndexWriters :\n");
+     for (IndexWriter writer : this.indexWriters) {
+       text.append(writer.describe()).append("\n");
+     }
+     return text.toString();
+   }
+
+ }
\ No newline at end of file
Modified: nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexerOutputFormat.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexerOutputFormat.java?rev=1558349&r1=1558348&r2=1558349&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexerOutputFormat.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexerOutputFormat.java Wed Jan 15 12:01:45 2014
@@ -30,29 +30,29 @@ public class IndexerOutputFormat extends
public RecordWriter<String, NutchDocument> getRecordWriter(
TaskAttemptContext job) throws IOException, InterruptedException {
- final NutchIndexWriter[] writers =
- NutchIndexWriterFactory.getNutchIndexWriters(job.getConfiguration());
-
- for (final NutchIndexWriter writer : writers) {
- writer.open(job);
- }
+ //final IndexWriter[] writers =
+ // NutchIndexWriterFactory.getNutchIndexWriters(job.getConfiguration());
+ final IndexWriters writers = new IndexWriters(job.getConfiguration());
+
+// for (final IndexWriter writer : writers) {
+// writer.open(job);
+// }
+ writers.open(job.getConfiguration());
+
return new RecordWriter<String, NutchDocument>() {
@Override
public void write(String key, NutchDocument doc) throws IOException {
- for (final NutchIndexWriter writer : writers) {
- writer.write(doc);
- }
+ // TODO: Check Write Status for delete or write.
+ writers.write(doc);
}
@Override
public void close(TaskAttemptContext context) throws IOException,
InterruptedException {
- for (final NutchIndexWriter writer : writers) {
- writer.close();
+ writers.close();
}
- }
};
}
Added: nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingJob.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingJob.java?rev=1558349&view=auto
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingJob.java (added)
+++ nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingJob.java Wed Jan 15 12:01:45 2014
@@ -0,0 +1,200 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Map;
+
+import org.apache.avro.util.Utf8;
+import org.apache.gora.mapreduce.GoraMapper;
+import org.apache.gora.mapreduce.StringComparator;
+import org.apache.gora.store.DataStore;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.RawComparator;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.crawl.GeneratorJob;
+import org.apache.nutch.indexer.solr.SolrConstants;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.parse.ParseStatusCodes;
+import org.apache.nutch.parse.ParseStatusUtils;
+import org.apache.nutch.scoring.ScoringFilters;
+import org.apache.nutch.storage.Mark;
+import org.apache.nutch.storage.ParseStatus;
+import org.apache.nutch.storage.StorageUtils;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.NutchTool;
+import org.apache.nutch.util.TableUtil;
+import org.apache.nutch.util.ToolUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class IndexingJob extends NutchTool implements Tool {
+
+ public static Logger LOG = LoggerFactory.getLogger(IndexingJob.class);
+
+ private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
+
+ private static final Utf8 REINDEX = new Utf8("-reindex");
+
+ static {
+ FIELDS.add(WebPage.Field.SIGNATURE);
+ FIELDS.add(WebPage.Field.PARSE_STATUS);
+ FIELDS.add(WebPage.Field.SCORE);
+ FIELDS.add(WebPage.Field.MARKERS);
+ }
+
+ /**
+  * Mapper that turns successfully parsed pages of the requested batch into
+  * {@link NutchDocument}s and stamps them with the index mark.
+  */
+ public static class IndexerMapper extends
+     GoraMapper<String, WebPage, String, NutchDocument> {
+   public IndexUtil indexUtil;
+   public DataStore<String, WebPage> store;
+
+   protected Utf8 batchId;
+
+   /** Reads the batch id and creates the index helper and the web store. */
+   @Override
+   public void setup(Context context) throws IOException {
+     Configuration conf = context.getConfiguration();
+     String configuredBatch = conf.get(GeneratorJob.BATCH_ID,
+         Nutch.ALL_BATCH_ID_STR);
+     batchId = new Utf8(configuredBatch);
+     indexUtil = new IndexUtil(conf);
+     try {
+       store = StorageUtils.createWebStore(conf, String.class, WebPage.class);
+     } catch (ClassNotFoundException e) {
+       throw new IOException(e);
+     }
+   }
+
+   /** Closes the web store opened in setup. */
+   protected void cleanup(Context context) throws IOException,
+       InterruptedException {
+     store.close();
+   }
+
+   @Override
+   public void map(String key, WebPage page, Context context)
+       throws IOException, InterruptedException {
+     ParseStatus pstatus = page.getParseStatus();
+     if (pstatus == null || !ParseStatusUtils.isSuccess(pstatus)
+         || pstatus.getMinorCode() == ParseStatusCodes.SUCCESS_REDIRECT) {
+       return; // filter urls not parsed
+     }
+
+     Utf8 mark = Mark.UPDATEDB_MARK.checkMark(page);
+     // The batch check is bypassed entirely when re-indexing.
+     if (!batchId.equals(REINDEX) && !NutchJob.shouldProcess(mark, batchId)) {
+       if (LOG.isDebugEnabled()) {
+         LOG.debug("Skipping " + TableUtil.unreverseUrl(key)
+             + "; different batch id (" + mark + ")");
+       }
+       return;
+     }
+
+     NutchDocument doc = indexUtil.index(key, page);
+     if (doc == null) {
+       return;
+     }
+     if (mark != null) {
+       // Record the batch on the page and persist it.
+       Mark.INDEX_MARK.putMark(page, Mark.UPDATEDB_MARK.checkMark(page));
+       store.put(key, page);
+     }
+     context.write(key, doc);
+     context.getCounter("IndexerJob", "DocumentCount").increment(1);
+   }
+ }
+
+ /**
+  * Collects the WebPage fields the job has to load: the static FIELDS set
+  * plus the fields requested by the indexing and scoring filters.
+  */
+ private static Collection<WebPage.Field> getFields(Job job) {
+   Configuration conf = job.getConfiguration();
+   Collection<WebPage.Field> wanted = new HashSet<WebPage.Field>(FIELDS);
+   wanted.addAll(new IndexingFilters(conf).getFields());
+   wanted.addAll(new ScoringFilters(conf).getFields());
+   return wanted;
+ }
+
+ /**
+  * Configures and runs the map-only indexing job, blocking until it finishes.
+  *
+  * @param args argument map; the {@link Nutch#ARG_BATCH} entry is cast to
+  *          String and stored in the configuration as the batch id
+  * @return the {@code results} map after the job status has been recorded
+  */
+ @Override
+ public Map<String, Object> run(Map<String, Object> args) throws Exception {
+ // NOTE(review): batchId is not null-checked before being stored.
+ String batchId = (String) args.get(Nutch.ARG_BATCH);
+
+ Configuration conf = getConf();
+ conf.set(GeneratorJob.BATCH_ID, batchId);
+
+ Job job = new NutchJob(conf, "Indexer");
+ // TODO: Figure out why this needs to be here
+ job.getConfiguration().setClass("mapred.output.key.comparator.class",
+ StringComparator.class, RawComparator.class);
+
+ Collection<WebPage.Field> fields = getFields(job);
+ StorageUtils.initMapperJob(job, fields, String.class, NutchDocument.class,
+ IndexerMapper.class);
+ // No reducers: documents go from the mapper straight to the output format.
+ job.setNumReduceTasks(0);
+ job.setOutputFormatClass(IndexerOutputFormat.class);
+
+ job.waitForCompletion(true);
+ ToolUtil.recordJobStatus(null, job, results);
+ return results;
+ }
+
+ /**
+  * Runs the indexing job for the given batch, then opens the index writers
+  * on the client side and commits once, unless
+  * {@code SolrConstants.COMMIT_INDEX} is set to false in the configuration.
+  *
+  * @param batchId batch to index, passed to the job as {@link Nutch#ARG_BATCH}
+  */
+ public void index(String batchId) throws Exception {
+ LOG.info("IndexingJob: starting");
+
+ run(ToolUtil.toArgMap(Nutch.ARG_BATCH, batchId));
+ // NOW PASSED ON THE COMMAND LINE AS A HADOOP PARAM
+ // do the commits once and for all the reducers in one go
+ // getConf().set(SolrConstants.SERVER_URL,solrUrl);
+
+ IndexWriters writers = new IndexWriters(getConf());
+ LOG.info(writers.describe());
+
+ // NOTE(review): the writers are opened here but never closed in this
+ // method - confirm whether a close() is needed after the commit.
+ writers.open(getConf());
+ if (getConf().getBoolean(SolrConstants.COMMIT_INDEX, true)) {
+ writers.commit();
+ }
+ LOG.info("IndexingJob: done.");
+ }
+
+ /**
+  * Command-line entry point:
+  * {@code (<batchId> | -all | -reindex) [-crawlId <id>]}.
+  *
+  * The first argument is passed to {@link #index(String)} as the batch id.
+  * The crawl id is only applied when exactly three arguments are given and
+  * the second one is "-crawlId".
+  *
+  * @return 0 on success, -1 on a usage error or when indexing fails
+  */
+ public int run(String[] args) throws Exception {
+   if (args.length < 1) {
+     System.err
+         .println("Usage: IndexingJob (<batchId> | -all | -reindex) [-crawlId <id>]");
+     return -1;
+   }
+
+   if (args.length == 3 && "-crawlId".equals(args[1])) {
+     getConf().set(Nutch.CRAWL_ID_KEY, args[2]);
+   }
+   try {
+     index(args[0]);
+     return 0;
+   } catch (final Exception e) {
+     // Tag the failure with this class; the message used to name the
+     // SolrIndexerJob this tool replaces.
+     LOG.error("IndexingJob: " + StringUtils.stringifyException(e));
+     return -1;
+   }
+ }
+
+ /** Runs the tool through {@link ToolRunner} and exits with its result. */
+ public static void main(String[] args) throws Exception {
+   final int res = ToolRunner.run(NutchConfiguration.create(),
+       new IndexingJob(), args);
+   System.exit(res);
+ }
+}
Modified: nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java?rev=1558349&r1=1558348&r2=1558349&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java Wed Jan 15 12:01:45 2014
@@ -45,7 +45,7 @@ import org.apache.nutch.util.NutchConfig
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
-import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
+import org.apache.solr.client.solrj.impl.HttpSolrServer;
import org.apache.solr.client.solrj.request.UpdateRequest;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
@@ -241,7 +241,7 @@ implements Tool {
throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
int numSplits = context.getNumReduceTasks();
- SolrServer solr = SolrUtils.getCommonsHttpSolrServer(conf);
+ SolrServer solr = SolrUtils.getHttpSolrServer(conf);
final SolrQuery solrQuery = new SolrQuery(SOLR_GET_ALL_QUERY);
solrQuery.setFields(SolrConstants.ID_FIELD);
@@ -271,7 +271,7 @@ implements Tool {
public RecordReader<Text, SolrRecord> createRecordReader(InputSplit split,
TaskAttemptContext context) throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
- SolrServer solr = SolrUtils.getCommonsHttpSolrServer(conf);
+ SolrServer solr = SolrUtils.getHttpSolrServer(conf);
SolrInputSplit solrSplit = (SolrInputSplit) split;
final int numDocs = (int) solrSplit.getLength();
@@ -315,11 +315,7 @@ implements Tool {
@Override
public void setup(Context job) throws IOException {
Configuration conf = job.getConfiguration();
- try {
- solr = SolrUtils.getCommonsHttpSolrServer(conf);
- } catch (MalformedURLException e) {
- throw new IOException(e);
- }
+ solr = SolrUtils.getHttpSolrServer(conf);
}
Modified: nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrUtils.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrUtils.java?rev=1558349&r1=1558348&r2=1558349&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrUtils.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrUtils.java Wed Jan 15 12:01:45 2014
@@ -1,23 +1,25 @@
package org.apache.nutch.indexer.solr;
-import org.apache.commons.httpclient.HttpClient;
-import org.apache.commons.httpclient.auth.AuthScope;
-import org.apache.commons.httpclient.UsernamePasswordCredentials;
-import org.apache.commons.httpclient.params.HttpClientParams;
+import org.apache.http.impl.client.DefaultHttpClient;
+import org.apache.http.auth.AuthScope;
+import org.apache.http.auth.UsernamePasswordCredentials;
+import org.apache.http.client.params.HttpClientParams;
+import org.apache.http.params.HttpParams;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
-import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
+import org.apache.nutch.indexer.solr.SolrConstants;
+import org.apache.solr.client.solrj.impl.HttpSolrServer;
import java.net.MalformedURLException;
public class SolrUtils {
- public static Logger LOG = LoggerFactory.getLogger(SolrIndexerJob.class);
+ public static Logger LOG = LoggerFactory.getLogger(SolrUtils.class);
- public static CommonsHttpSolrServer getCommonsHttpSolrServer(Configuration job) throws MalformedURLException {
- HttpClient client=new HttpClient();
+ public static HttpSolrServer getHttpSolrServer(Configuration job) throws MalformedURLException {
+ DefaultHttpClient client = new DefaultHttpClient();
// Check for username/password
if (job.getBoolean(SolrConstants.USE_AUTH, false)) {
@@ -27,15 +29,16 @@ public class SolrUtils {
AuthScope scope = new AuthScope(AuthScope.ANY_HOST, AuthScope.ANY_PORT, AuthScope.ANY_REALM, AuthScope.ANY_SCHEME);
- client.getState().setCredentials(scope, new UsernamePasswordCredentials(username, job.get(SolrConstants.PASSWORD)));
+ client.getCredentialsProvider().setCredentials(scope,
+ new UsernamePasswordCredentials(username, job.get(SolrConstants.PASSWORD)));
- HttpClientParams params = client.getParams();
- params.setAuthenticationPreemptive(true);
+ HttpParams params = client.getParams();
+ HttpClientParams.setAuthenticating(params, true);
client.setParams(params);
}
- return new CommonsHttpSolrServer(job.get(SolrConstants.SERVER_URL), client);
+ return new HttpSolrServer(job.get(SolrConstants.SERVER_URL), client);
}
public static String stripNonCharCodepoints(String input) {
Modified: nutch/branches/2.x/src/plugin/build.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/build.xml?rev=1558349&r1=1558348&r2=1558349&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/build.xml (original)
+++ nutch/branches/2.x/src/plugin/build.xml Wed Jan 15 12:01:45 2014
@@ -30,6 +30,7 @@
<ant dir="index-anchor" target="deploy"/>
<ant dir="index-basic" target="deploy"/>
<ant dir="index-more" target="deploy"/>
+ <ant dir="indexer-solr" target="deploy"/>
<ant dir="language-identifier" target="deploy"/>
<ant dir="lib-http" target="deploy"/>
<ant dir="lib-nekohtml" target="deploy"/>
@@ -110,6 +111,7 @@
<ant dir="index-anchor" target="clean"/>
<ant dir="index-basic" target="clean"/>
<ant dir="index-more" target="clean"/>
+ <ant dir="indexer-solr" target="clean"/>
<ant dir="language-identifier" target="clean"/>
<ant dir="lib-http" target="clean"/>
<ant dir="lib-nekohtml" target="clean"/>
Added: nutch/branches/2.x/src/plugin/indexer-solr/build.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/indexer-solr/build.xml?rev=1558349&view=auto
==============================================================================
--- nutch/branches/2.x/src/plugin/indexer-solr/build.xml (added)
+++ nutch/branches/2.x/src/plugin/indexer-solr/build.xml Wed Jan 15 12:01:45 2014
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="indexer-solr" default="jar-core">
+
+ <!-- No targets are defined in this file; the default "jar-core" target and
+ every other target must be supplied by the shared plugin build file
+ imported below. -->
+ <import file="../build-plugin.xml" />
+
+</project>
Added: nutch/branches/2.x/src/plugin/indexer-solr/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/indexer-solr/ivy.xml?rev=1558349&view=auto
==============================================================================
--- nutch/branches/2.x/src/plugin/indexer-solr/ivy.xml (added)
+++ nutch/branches/2.x/src/plugin/indexer-solr/ivy.xml Wed Jan 15 12:01:45 2014
@@ -0,0 +1,43 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <!-- The module name is taken from the Ant project name property. -->
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <!-- Configurations are not declared here; they are pulled in from the
+ shared ivy-configurations.xml file. -->
+ <configurations>
+ <include file="../../..//ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <!-- The only declared dependency is the SolrJ client. The revision appears
+ in the jar file name listed in this plugin's plugin.xml
+ (solr-solrj-4.6.0.jar), so both presumably need to change together;
+ TODO confirm. -->
+ <dependencies>
+ <dependency org="org.apache.solr" name="solr-solrj" rev="4.6.0"
+ conf="*->default"/>
+ </dependencies>
+
+</ivy-module>
Added: nutch/branches/2.x/src/plugin/indexer-solr/plugin.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/indexer-solr/plugin.xml?rev=1558349&view=auto
==============================================================================
--- nutch/branches/2.x/src/plugin/indexer-solr/plugin.xml (added)
+++ nutch/branches/2.x/src/plugin/indexer-solr/plugin.xml Wed Jan 15 12:01:45 2014
@@ -0,0 +1,54 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin id="indexer-solr" name="SOLRIndexWriter" version="1.0.0"
+ provider-name="nutch.apache.org">
+
+ <!-- Jars that make up this plugin. Only the plugin's own jar exports its
+ classes; the remaining entries are listed without an export element. -->
+ <runtime>
+ <library name="indexer-solr.jar">
+ <export name="*" />
+ </library>
+
+ <!-- NOTE(review): the commented-out entries below are presumably jars that
+ are already available from elsewhere on the classpath; confirm before
+ re-enabling or deleting them. -->
+ <library name="activation-1.1.jar"/>
+ <!--library name="commons-codec-1.4.jar"/-->
+ <library name="httpclient-4.2.5.jar"/>
+ <!--library name="commons-io-1.4.jar"/-->
+ <library name="commons-logging-1.1.1.jar"/>
+ <library name="geronimo-stax-api_1.0_spec-1.0.1.jar"/>
+ <library name="jline-0.9.1.jar"/>
+ <library name="log4j-1.2.15.jar"/>
+ <!--library name="lucene-core-4.4.0.jar"/-->
+ <library name="mail-1.4.1.jar"/>
+ <!--library name="slf4j-api-1.6.1.jar"/-->
+ <library name="solr-solrj-4.6.0.jar"/>
+ <library name="stax-api-1.0.1.jar"/>
+ <library name="wstx-asl-3.2.7.jar"/>
+ <!--library name="zookeeper-3.3.1.jar"/-->
+ </runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints" />
+ </requires>
+
+ <!-- Registers SolrIndexWriter as an implementation of the
+ org.apache.nutch.indexer.IndexWriter extension point. -->
+ <extension id="org.apache.nutch.indexwriter.solr"
+ name="SOLR Index Writer"
+ point="org.apache.nutch.indexer.IndexWriter">
+ <implementation id="SOLRIndexWriter"
+ class="org.apache.nutch.indexwriter.solr.SolrIndexWriter" />
+ </extension>
+
+</plugin>
Added: nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java?rev=1558349&view=auto
==============================================================================
--- nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java (added)
+++ nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java Wed Jan 15 12:01:45 2014
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexwriter.solr;
+
+/**
+ * Configuration property names and Solr field names used by the Solr index
+ * writer plugin. Fields of an interface are implicitly
+ * {@code public static final}, so the modifiers are not repeated.
+ */
+public interface SolrConstants {
+
+  /** Common prefix of every configuration property defined here. */
+  String SOLR_PREFIX = "solr.";
+
+  /** Property holding the URL of the Solr instance. */
+  String SERVER_URL = SOLR_PREFIX + "server.url";
+
+  /** Property holding the number of documents buffered before sending. */
+  String COMMIT_SIZE = SOLR_PREFIX + "commit.size";
+
+  /** Property named "solr.commit.index"; its consumer is not in this file. */
+  String COMMIT_INDEX = SOLR_PREFIX + "commit.index";
+
+  /** Property holding the name of the field mapping file. */
+  String MAPPING_FILE = SOLR_PREFIX + "mapping.file";
+
+  /** Boolean property that switches authentication on. */
+  String USE_AUTH = SOLR_PREFIX + "auth";
+
+  /** Property holding the user name for authentication. */
+  String USERNAME = SOLR_PREFIX + "auth.username";
+
+  /** Property holding the password for authentication. */
+  String PASSWORD = SOLR_PREFIX + "auth.password";
+
+  // Field names (not prefixed with SOLR_PREFIX).
+
+  String ID_FIELD = "id";
+
+  String URL_FIELD = "url";
+
+  String BOOST_FIELD = "boost";
+
+  String TIMESTAMP_FIELD = "tstamp";
+
+  String DIGEST_FIELD = "digest";
+
+}
Added: nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java?rev=1558349&view=auto
==============================================================================
--- nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java (added)
+++ nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java Wed Jan 15 12:01:45 2014
@@ -0,0 +1,179 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexwriter.solr;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map.Entry;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.indexer.IndexWriter;
+import org.apache.solr.client.solrj.SolrServer;
+import org.apache.solr.client.solrj.SolrServerException;
+import org.apache.solr.client.solrj.impl.HttpSolrServer;
+import org.apache.solr.common.SolrInputDocument;
+
+/**
+ * {@link IndexWriter} implementation that sends Nutch documents to a Solr
+ * server through SolrJ.
+ * <p>
+ * Documents handed to {@link #write(NutchDocument)} are buffered and sent in
+ * batches of {@link SolrConstants#COMMIT_SIZE} documents; {@link #close()}
+ * sends whatever is still buffered and {@link #commit()} issues the Solr
+ * commit.
+ */
+public class SolrIndexWriter implements IndexWriter {
+
+  public static final Logger LOG = LoggerFactory
+      .getLogger(SolrIndexWriter.class);
+
+  private HttpSolrServer solr;
+  private SolrMappingReader solrMapping;
+
+  private Configuration config;
+
+  /** Documents waiting to be sent to Solr with the next batch. */
+  private final List<SolrInputDocument> inputDocs = new ArrayList<SolrInputDocument>();
+
+  private int batchSize;
+  private int numDeletes = 0;
+
+  // NOTE(review): nothing in this class ever sets this flag to true, so
+  // delete(String) is currently a no-op. Confirm whether it should be read
+  // from the configuration in open().
+  private boolean delete = false;
+
+  /**
+   * Number of documents passed to {@link #write(NutchDocument)}. The counter
+   * is static and therefore shared by all instances of this class.
+   */
+  protected static long documentCount = 0;
+
+  /**
+   * Creates the Solr client and reads the batch size (default 1000) and the
+   * field mapping from the configuration.
+   *
+   * @param conf configuration holding the Solr settings
+   * @throws IOException propagated from the client or mapping setup
+   */
+  @Override
+  public void open(Configuration conf) throws IOException {
+    solr = SolrUtils.getHttpSolrServer(conf);
+    batchSize = conf.getInt(SolrConstants.COMMIT_SIZE, 1000);
+    solrMapping = SolrMappingReader.getInstance(conf);
+  }
+
+  /**
+   * Converts the document into a {@link SolrInputDocument}, buffers it and
+   * sends the buffer to Solr once it holds at least {@code batchSize}
+   * documents.
+   *
+   * @param doc document to index
+   * @throws IOException if sending a full batch to Solr fails
+   */
+  @Override
+  public void write(NutchDocument doc) throws IOException {
+    final SolrInputDocument inputDoc = new SolrInputDocument();
+    for (final Entry<String, List<String>> e : doc) {
+      final String key = e.getKey();
+      for (final String val : e.getValue()) {
+
+        Object val2 = val;
+        // Only these two fields are passed through the code point filter.
+        if (key.equals("content") || key.equals("title")) {
+          val2 = SolrUtils.stripNonCharCodepoints(val);
+        }
+
+        inputDoc.addField(solrMapping.mapKey(key), val2);
+        final String sCopy = solrMapping.mapCopyKey(key);
+        // Compare by value: a reference comparison would add the field a
+        // second time whenever the mapper returns an equal but distinct
+        // String instance for the same field name.
+        if (!key.equals(sCopy)) {
+          inputDoc.addField(sCopy, val2);
+        }
+      }
+    }
+    inputDoc.setDocumentBoost(doc.getScore());
+    inputDocs.add(inputDoc);
+    documentCount++;
+    if (inputDocs.size() >= batchSize) {
+      sendBufferedDocs();
+    }
+  }
+
+  /**
+   * Sends any documents that are still buffered and logs the number of
+   * deletes performed, if any. Does not commit; see {@link #commit()}.
+   *
+   * @throws IOException if sending the remaining documents fails
+   */
+  @Override
+  public void close() throws IOException {
+    if (!inputDocs.isEmpty()) {
+      sendBufferedDocs();
+    }
+    // Reported independently of the pending adds, so the delete count is not
+    // lost when the final batch also contained documents.
+    if (numDeletes > 0) {
+      LOG.info("Deleted {} documents", numDeletes);
+    }
+  }
+
+  /** Sends the buffered documents to Solr and empties the buffer on success. */
+  private void sendBufferedDocs() throws IOException {
+    try {
+      LOG.info("Adding {} documents", inputDocs.size());
+      solr.add(inputDocs);
+    } catch (final SolrServerException e) {
+      throw makeIOException(e);
+    }
+    inputDocs.clear();
+  }
+
+  @Override
+  public Configuration getConf() {
+    return config;
+  }
+
+  /**
+   * Stores the configuration and fails immediately when the Solr server URL
+   * is not set.
+   *
+   * @param conf configuration to use
+   * @throws RuntimeException if {@link SolrConstants#SERVER_URL} is missing
+   */
+  @Override
+  public void setConf(Configuration conf) {
+    config = conf;
+    String serverURL = conf.get(SolrConstants.SERVER_URL);
+    if (serverURL == null) {
+      String message = "Missing SOLR URL. Should be set via -D "
+          + SolrConstants.SERVER_URL;
+      message += "\n" + describe();
+      LOG.error(message);
+      throw new RuntimeException(message);
+    }
+  }
+
+  /**
+   * Deletes the document with the given id from Solr, but only when the
+   * {@code delete} flag of this writer is set.
+   *
+   * @param key id of the document to delete
+   * @throws IOException if the delete request fails
+   */
+  @Override
+  public void delete(String key) throws IOException {
+    if (delete) {
+      try {
+        solr.deleteById(key);
+        numDeletes++;
+      } catch (final SolrServerException e) {
+        throw makeIOException(e);
+      }
+    }
+  }
+
+  /** Updates are handled exactly like writes. */
+  @Override
+  public void update(NutchDocument doc) throws IOException {
+    write(doc);
+  }
+
+  /**
+   * Issues a commit on the Solr server and logs the running document count.
+   *
+   * @throws IOException if the commit fails
+   */
+  @Override
+  public void commit() throws IOException {
+    try {
+      solr.commit();
+      LOG.info("Total " + documentCount + (documentCount > 1 ? " documents are " : " document is ") + "added.");
+    } catch (SolrServerException e) {
+      throw makeIOException(e);
+    }
+  }
+
+  /**
+   * Wraps a Solr failure in an {@link IOException}, keeping it as the cause
+   * and deriving the message from it.
+   *
+   * @param e exception reported by SolrJ
+   * @return an IOException whose cause is {@code e}
+   */
+  public static IOException makeIOException(SolrServerException e) {
+    return new IOException(e);
+  }
+
+  /**
+   * Describes the configuration properties understood by this writer, one
+   * property per line.
+   *
+   * @return human-readable option summary
+   */
+  @Override
+  public String describe() {
+    StringBuilder sb = new StringBuilder("SOLRIndexWriter\n");
+    sb.append("\t").append(SolrConstants.SERVER_URL)
+        .append(" : URL of the SOLR instance (mandatory)\n");
+    sb.append("\t").append(SolrConstants.COMMIT_SIZE)
+        .append(" : buffer size when sending to SOLR (default 1000)\n");
+    sb.append("\t")
+        .append(SolrConstants.MAPPING_FILE)
+        .append(
+            " : name of the mapping file for fields (default solrindex-mapping.xml)\n");
+    sb.append("\t").append(SolrConstants.USE_AUTH)
+        .append(" : use authentication (default false)\n");
+    sb.append("\t").append(SolrConstants.USERNAME)
+        .append(" : username for authentication\n");
+    sb.append("\t").append(SolrConstants.PASSWORD)
+        .append(" : password for authentication\n");
+    return sb.toString();
+  }
+
+}