You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2014/01/15 13:01:47 UTC

svn commit: r1558349 [1/2] - in /nutch/branches/2.x: ./ conf/ ivy/ src/bin/ src/java/org/apache/nutch/api/impl/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/indexer/elastic/ src/java/org/apache/nutch/indexer/solr/ src/plugin/ src/plugin...

Author: lewismc
Date: Wed Jan 15 12:01:45 2014
New Revision: 1558349

URL: http://svn.apache.org/r1558349
Log:
NUTCH-1568 port pluggable indexing architecture to 2.x

Added:
    nutch/branches/2.x/src/java/org/apache/nutch/indexer/CleaningJob.java
    nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexWriter.java
    nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexWriters.java
    nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingJob.java
    nutch/branches/2.x/src/plugin/indexer-solr/
    nutch/branches/2.x/src/plugin/indexer-solr/build.xml
    nutch/branches/2.x/src/plugin/indexer-solr/ivy.xml
    nutch/branches/2.x/src/plugin/indexer-solr/plugin.xml
    nutch/branches/2.x/src/plugin/indexer-solr/src/
    nutch/branches/2.x/src/plugin/indexer-solr/src/java/
    nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/
    nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/
    nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/
    nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/
    nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/
    nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java
    nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java
    nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrMappingReader.java
    nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java
Removed:
    nutch/branches/2.x/conf/schema-solr4.xml
    nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexCleanerJob.java
    nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexerJob.java
    nutch/branches/2.x/src/java/org/apache/nutch/indexer/NutchIndexWriter.java
    nutch/branches/2.x/src/java/org/apache/nutch/indexer/NutchIndexWriterFactory.java
    nutch/branches/2.x/src/java/org/apache/nutch/indexer/elastic/ElasticConstants.java
    nutch/branches/2.x/src/java/org/apache/nutch/indexer/elastic/ElasticIndexerJob.java
    nutch/branches/2.x/src/java/org/apache/nutch/indexer/elastic/ElasticWriter.java
    nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrClean.java
    nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrIndexerJob.java
    nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrMappingReader.java
    nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrWriter.java
Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/build.xml
    nutch/branches/2.x/conf/log4j.properties
    nutch/branches/2.x/conf/nutch-default.xml
    nutch/branches/2.x/conf/schema.xml
    nutch/branches/2.x/default.properties
    nutch/branches/2.x/ivy/ivy.xml
    nutch/branches/2.x/pom.xml
    nutch/branches/2.x/src/bin/nutch
    nutch/branches/2.x/src/java/org/apache/nutch/api/impl/RAMJobManager.java
    nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexerOutputFormat.java
    nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java
    nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrUtils.java
    nutch/branches/2.x/src/plugin/build.xml
    nutch/branches/2.x/src/plugin/nutch-extensionpoints/plugin.xml

Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1558349&r1=1558348&r2=1558349&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Wed Jan 15 12:01:45 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development
 
+* NUTCH-1568 port pluggable indexing architecture to 2.x (Talat UYARER via lewismc)
+
 * NUTCH-1672 Inlinks are added twice in DbUpdateReducer (Tien Nguyen Manh via lewismc)
 
 * NUTCH-1667 Updatedb always ignore batchId (Tien Nguyen Manh via lewismc)

Modified: nutch/branches/2.x/build.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/build.xml?rev=1558349&r1=1558348&r2=1558349&view=diff
==============================================================================
--- nutch/branches/2.x/build.xml (original)
+++ nutch/branches/2.x/build.xml Wed Jan 15 12:01:45 2014
@@ -154,6 +154,7 @@
       <packageset dir="${plugins.dir}/index-anchor/src/java"/>
       <packageset dir="${plugins.dir}/index-basic/src/java"/>
       <packageset dir="${plugins.dir}/index-more/src/java"/>
+      <packageset dir="${plugins.dir}/indexer-solr/src/java"/>
       <packageset dir="${plugins.dir}/language-identifier/src/java"/>
       <packageset dir="${plugins.dir}/lib-http/src/java"/>
       <packageset dir="${plugins.dir}/lib-regex-filter/src/java"/>
@@ -201,6 +202,7 @@
    <group title="Scoring Plugins" packages="${plugins.scoring}" />
    <group title="Parse Plugins" packages="${plugins.parse}" />
    <group title="Indexing Filter Plugins" packages="${plugins.index}" />
+   <group title="Indexer Plugins" packages="${plugins.indexer}"/>
    <group title="Misc. Plugins" packages="${plugins.misc}" />
   </javadoc>
   <jar jarfile="${maven-javadoc-jar}">
@@ -566,6 +568,7 @@
       <packageset dir="${plugins.dir}/index-anchor/src/java"/>
       <packageset dir="${plugins.dir}/index-basic/src/java"/>
       <packageset dir="${plugins.dir}/index-more/src/java"/>
+      <packageset dir="${plugins.dir}/indexer-solr/src/java"/>
       <packageset dir="${plugins.dir}/language-identifier/src/java"/>
       <packageset dir="${plugins.dir}/lib-http/src/java"/>
       <packageset dir="${plugins.dir}/lib-regex-filter/src/java"/>
@@ -613,6 +616,7 @@
    <group title="Scoring Plugins" packages="${plugins.scoring}" />
    <group title="Parse Plugins" packages="${plugins.parse}" />
    <group title="Indexing Filter Plugins" packages="${plugins.index}" />
+   <group title="Indexer Plugins" packages="${plugins.indexer}"/>
    <group title="Misc. Plugins" packages="${plugins.misc}" />
   </javadoc>
   <!-- Copy the plugin.dtd file to the plugin doc-files dir -->
@@ -928,7 +932,8 @@
         <!-- feed is currently disabled 
         <source path="${basedir}/src/plugin/feed/src/java/" />
         <source path="${basedir}/src/plugin/feed/src/test/" /> -->
-        <source path="${basedir}/src/plugin/index-anchor/src/java/" />
+        <source path="${basedir}/src/plugin/indexer-solr/src/java/" />
+      	<source path="${basedir}/src/plugin/index-anchor/src/java/" />
         <source path="${basedir}/src/plugin/index-anchor/src/test/" />
         <source path="${basedir}/src/plugin/index-basic/src/java/" />
         <source path="${basedir}/src/plugin/index-basic/src/test/" />

Modified: nutch/branches/2.x/conf/log4j.properties
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/log4j.properties?rev=1558349&r1=1558348&r2=1558349&view=diff
==============================================================================
--- nutch/branches/2.x/conf/log4j.properties (original)
+++ nutch/branches/2.x/conf/log4j.properties Wed Jan 15 12:01:45 2014
@@ -32,12 +32,9 @@ log4j.logger.org.apache.nutch.crawl.DbUp
 log4j.logger.org.apache.nutch.host.HostDbUpdateJob=INFO,cmdstdout
 log4j.logger.org.apache.nutch.fetcher.FetcherJob=INFO,cmdstdout
 log4j.logger.org.apache.nutch.parse.ParserJob=INFO,cmdstdout
-log4j.logger.org.apache.nutch.indexer.IndexerJob=INFO,cmdstdout
-log4j.logger.org.apache.nutch.indexer.solr.SolrIndexerJob=INFO,cmdstdout
-log4j.logger.org.apache.nutch.indexer.solr.SolrWriter=INFO,cmdstdout
+log4j.logger.org.apache.nutch.indexer.IndexingJob=INFO,cmdstdout
 log4j.logger.org.apache.nutch.indexer.DeleteDuplicates=INFO,cmdstdout
-log4j.logger.org.apache.nutch.indexer.solr.SolrClean=INFO,cmdstdout
-log4j.logger.org.apache.nutch.indexer.IndexCleanerJob=INFO,cmdstdout
+log4j.logger.org.apache.nutch.indexer.CleaningJob=INFO,cmdstdout
 log4j.logger.org.apache.nutch.crawl.WebTableReader=INFO,cmdstdout
 log4j.logger.org.apache.nutch.host.HostDbReader=INFO,cmdstdout
 log4j.logger.org.apache.nutch.parse.ParserChecker=INFO,cmdstdout

Modified: nutch/branches/2.x/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/nutch-default.xml?rev=1558349&r1=1558348&r2=1558349&view=diff
==============================================================================
--- nutch/branches/2.x/conf/nutch-default.xml (original)
+++ nutch/branches/2.x/conf/nutch-default.xml Wed Jan 15 12:01:45 2014
@@ -859,7 +859,7 @@
 
 <property>
   <name>plugin.includes</name>
- <value>protocol-http|urlfilter-regex|parse-(html|tika)|index-(basic|anchor)|urlnormalizer-(pass|regex|basic)|scoring-opic</value>
+ <value>protocol-http|urlfilter-regex|parse-(html|tika)|index-(basic|anchor)|indexer-solr|urlnormalizer-(pass|regex|basic)|scoring-opic</value>
  <description>Regular expression naming plugin directory names to
   include.  Any plugin not matching this expression is excluded.
   In any case you need at least include the nutch-extensionpoints plugin. By

Modified: nutch/branches/2.x/conf/schema.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/schema.xml?rev=1558349&r1=1558348&r2=1558349&view=diff
==============================================================================
--- nutch/branches/2.x/conf/schema.xml (original)
+++ nutch/branches/2.x/conf/schema.xml Wed Jan 15 12:01:45 2014
@@ -1,124 +1,373 @@
 <?xml version="1.0" encoding="UTF-8" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!--
+    Description: This document contains Solr 4.x schema definition to
+    be used with Solr integration currently built into Nutch.
+    This schema is not minimal, there are some useful field type definitions left,
+    and the set of fields and their flags (indexed/stored/term vectors) can be
+    further optimized depending on needs.  See
+    http://svn.apache.org/viewvc/lucene/dev/trunk/solr/example/solr/conf/schema.xml?view=markup
+    for more info.
+-->
+
+<schema name="nutch" version="1.5">
+
+  <types>
+
+    <!-- The StrField type is not analyzed, but indexed/stored verbatim. -->
+    <fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
+
+
     <!--
-        Licensed to the Apache Software Foundation (ASF) under one or
-        more contributor license agreements. See the NOTICE file
-        distributed with this work for additional information regarding
-        copyright ownership. The ASF licenses this file to You under the
-        Apache License, Version 2.0 (the "License"); you may not use
-        this file except in compliance with the License. You may obtain
-        a copy of the License at
-        http://www.apache.org/licenses/LICENSE-2.0 Unless required by
-        applicable law or agreed to in writing, software distributed
-        under the License is distributed on an "AS IS" BASIS, WITHOUT
-        WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-        See the License for the specific language governing permissions
-        and limitations under the License.
+      Default numeric field types. For faster range queries, consider the tint/tfloat/tlong/tdouble types.
     -->
+    <fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+    <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+    <fieldType name="long" class="solr.TrieLongField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+    <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+
     <!--
-        Description: This document contains Solr 3.1 schema definition to
-        be used with Solr integration currently build into Nutch. See
-        https://issues.apache.org/jira/browse/NUTCH-442
-        https://issues.apache.org/jira/browse/NUTCH-699
-        https://issues.apache.org/jira/browse/NUTCH-994
-        https://issues.apache.org/jira/browse/NUTCH-997
-        https://issues.apache.org/jira/browse/NUTCH-1058
-        https://issues.apache.org/jira/browse/NUTCH-1394
-        and
-        http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/
-        example/solr/conf/schema.xml?view=markup
-        for more info.
+     Numeric field types that index each value at various levels of precision
+     to accelerate range queries when the number of values between the range
+     endpoints is large. See the javadoc for NumericRangeQuery for internal
+     implementation details.
+
+     Smaller precisionStep values (specified in bits) will lead to more tokens
+     indexed per value, slightly larger index size, and faster range queries.
+     A precisionStep of 0 disables indexing at different precision levels.
     -->
-<schema name="nutch" version="1.5">
-    <types>
-        <fieldType name="string" class="solr.StrField" sortMissingLast="true"
-            omitNorms="true"/> 
-        <fieldType name="long" class="solr.TrieLongField" precisionStep="0"
-            omitNorms="true" positionIncrementGap="0"/>
-        <fieldType name="float" class="solr.TrieFloatField" precisionStep="0"
-            omitNorms="true" positionIncrementGap="0"/>
-        <fieldType name="date" class="solr.TrieDateField" precisionStep="0"
-            omitNorms="true" positionIncrementGap="0"/>
-
-        <fieldType name="text" class="solr.TextField"
-            positionIncrementGap="100">
-            <analyzer>
-                <tokenizer class="solr.WhitespaceTokenizerFactory"/>
-                <filter class="solr.StopFilterFactory"
-                    ignoreCase="true" words="stopwords.txt"/>
-                <filter class="solr.WordDelimiterFilterFactory"
-                    generateWordParts="1" generateNumberParts="1"
-                    catenateWords="1" catenateNumbers="1" catenateAll="0"
-                    splitOnCaseChange="1"/>
-                <filter class="solr.LowerCaseFilterFactory"/>
-                <filter class="solr.EnglishPorterFilterFactory"
-                    protected="protwords.txt"/>
-                <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
-            </analyzer>
-        </fieldType>
-        <fieldType name="url" class="solr.TextField"
-            positionIncrementGap="100">
-            <analyzer>
-                <tokenizer class="solr.StandardTokenizerFactory"/>
-                <filter class="solr.LowerCaseFilterFactory"/>
-                <filter class="solr.WordDelimiterFilterFactory"
-                    generateWordParts="1" generateNumberParts="1"/>
-            </analyzer>
-        </fieldType>
-    </types>
-    <fields>
-        <field name="id" type="string" stored="true" indexed="true"/>
-
-        <!-- core fields -->
-        <field name="batchId" type="string" stored="true" indexed="false"/>
-        <field name="digest" type="string" stored="true" indexed="false"/>
-        <field name="boost" type="float" stored="true" indexed="false"/>
-
-        <!-- fields for index-basic plugin -->
-        <field name="host" type="url" stored="false" indexed="true"/>
-        <field name="url" type="url" stored="true" indexed="true"
-            required="true"/>
-        <field name="content" type="text" stored="false" indexed="true"/>
-        <field name="title" type="text" stored="true" indexed="true"/>
-        <field name="cache" type="string" stored="true" indexed="false"/>
-        <field name="tstamp" type="date" stored="true" indexed="false"/>
-
-        <!-- fields for index-anchor plugin -->
-        <field name="anchor" type="string" stored="true" indexed="true"
-            multiValued="true"/>
-
-        <!-- fields for index-more plugin -->
-        <field name="type" type="string" stored="true" indexed="true"
-            multiValued="true"/>
-        <field name="contentLength" type="long" stored="true"
-            indexed="false"/>
-        <field name="lastModified" type="date" stored="true"
-            indexed="false"/>
-        <field name="date" type="date" stored="true" indexed="true"/>
-
-        <!-- fields for languageidentifier plugin -->
-        <field name="lang" type="string" stored="true" indexed="true"/>
-
-        <!-- fields for subcollection plugin -->
-        <field name="subcollection" type="string" stored="true"
-            indexed="true" multiValued="true"/>
-
-        <!-- fields for feed plugin (tag is also used by microformats-reltag)-->
-        <field name="author" type="string" stored="true" indexed="true"/>
-        <field name="tag" type="string" stored="true" indexed="true" multiValued="true"/>
-        <field name="feed" type="string" stored="true" indexed="true"/>
-        <field name="publishedDate" type="date" stored="true"
-            indexed="true"/>
-        <field name="updatedDate" type="date" stored="true"
-            indexed="true"/>
-
-        <!-- fields for creativecommons plugin -->
-        <field name="cc" type="string" stored="true" indexed="true"
-            multiValued="true"/>
-            
-        <!-- fields for tld plugin -->    
-        <field name="tld" type="string" stored="false" indexed="false"/>
-    </fields>
-    <uniqueKey>id</uniqueKey>
-    <defaultSearchField>content</defaultSearchField>
-    <solrQueryParser defaultOperator="OR"/>
+    <fieldType name="tint" class="solr.TrieIntField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+    <fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+    <fieldType name="tlong" class="solr.TrieLongField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+    <fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+
+    <!-- The format for this date field is of the form 1995-12-31T23:59:59Z, and
+         is a more restricted form of the canonical representation of dateTime
+         http://www.w3.org/TR/xmlschema-2/#dateTime    
+         The trailing "Z" designates UTC time and is mandatory.
+         Optional fractional seconds are allowed: 1995-12-31T23:59:59.999Z
+         All other components are mandatory.
+
+         Expressions can also be used to denote calculations that should be
+         performed relative to "NOW" to determine the value, e.g.
+
+               NOW/HOUR
+                  ... Round to the start of the current hour
+               NOW-1DAY
+                  ... Exactly 1 day prior to now
+               NOW/DAY+6MONTHS+3DAYS
+                  ... 6 months and 3 days in the future from the start of
+                      the current day
+                      
+         Consult the DateField javadocs for more information.
+
+         Note: For faster range queries, consider the tdate type
+      -->
+    <fieldType name="date" class="solr.TrieDateField" omitNorms="true" precisionStep="0" positionIncrementGap="0"/>
+
+    <!-- A Trie based date field for faster date range queries and date faceting. -->
+    <fieldType name="tdate" class="solr.TrieDateField" omitNorms="true" precisionStep="6" positionIncrementGap="0"/>
+
+
+    <!-- solr.TextField allows the specification of custom text analyzers
+         specified as a tokenizer and a list of token filters. Different
+         analyzers may be specified for indexing and querying.
+
+         The optional positionIncrementGap puts space between multiple fields of
+         this type on the same document, with the purpose of preventing false phrase
+         matching across fields.
+
+         For more info on customizing your analyzer chain, please see
+         http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters
+     -->
+
+    <!-- A general text field that has reasonable, generic
+         cross-language defaults: it tokenizes with StandardTokenizer,
+	 removes stop words from case-insensitive "stopwords.txt"
+	 (empty by default), and down cases.  At query time only, it
+	 also applies synonyms. -->
+    <fieldType name="text_general" class="solr.TextField" positionIncrementGap="100">
+      <analyzer type="index">
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
+        <!-- in this example, we will only use synonyms at query time
+        <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
+        -->
+        <filter class="solr.LowerCaseFilterFactory"/>
+      </analyzer>
+      <analyzer type="query">
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
+        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+      </analyzer>
+    </fieldType>
+
+    <!-- A text field with defaults appropriate for English: it
+         tokenizes with StandardTokenizer, removes English stop words
+         (stopwords.txt), down cases, protects words from protwords.txt, and
+         finally applies Porter's stemming.  The query time analyzer
+         also applies synonyms from synonyms.txt. -->
+    <fieldType name="text_en" class="solr.TextField" positionIncrementGap="100">
+      <analyzer type="index">
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <!-- in this example, we will only use synonyms at query time
+        <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
+        -->
+        <!-- Case insensitive stop word removal.
+          add enablePositionIncrements=true in both the index and query
+          analyzers to leave a 'gap' for more accurate phrase queries.
+        -->
+        <filter class="solr.StopFilterFactory"
+                ignoreCase="true"
+                words="stopwords.txt"
+                enablePositionIncrements="true"
+                />
+        <filter class="solr.LowerCaseFilterFactory"/>
+	<filter class="solr.EnglishPossessiveFilterFactory"/>
+        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+	<!-- Optionally you may want to use this less aggressive stemmer instead of PorterStemFilterFactory:
+        <filter class="solr.EnglishMinimalStemFilterFactory"/>
+	-->
+        <filter class="solr.PorterStemFilterFactory"/>
+      </analyzer>
+      <analyzer type="query">
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+        <filter class="solr.StopFilterFactory"
+                ignoreCase="true"
+                words="stopwords.txt"
+                enablePositionIncrements="true"
+                />
+        <filter class="solr.LowerCaseFilterFactory"/>
+	<filter class="solr.EnglishPossessiveFilterFactory"/>
+        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+	<!-- Optionally you may want to use this less aggressive stemmer instead of PorterStemFilterFactory:
+        <filter class="solr.EnglishMinimalStemFilterFactory"/>
+	-->
+        <filter class="solr.PorterStemFilterFactory"/>
+      </analyzer>
+    </fieldType>
+
+    <!-- A text field with defaults appropriate for English, plus
+	 aggressive word-splitting and autophrase features enabled.
+	 This field is just like text_en, except it adds
+	 WordDelimiterFilter to enable splitting and matching of
+	 words on case-change, alpha numeric boundaries, and
+	 non-alphanumeric chars.  This means certain compound word
+	 cases will work, for example query "wi fi" will match
+	 document "WiFi" or "wi-fi".  However, other cases will still
+	 not match, for example if the query is "wifi" and the
+	 document is "wi fi" or if the query is "wi-fi" and the
+	 document is "wifi".
+        -->
+    <fieldType name="text_en_splitting" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
+      <analyzer type="index">
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <!-- in this example, we will only use synonyms at query time
+        <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
+        -->
+        <!-- Case insensitive stop word removal.
+          add enablePositionIncrements=true in both the index and query
+          analyzers to leave a 'gap' for more accurate phrase queries.
+        -->
+        <filter class="solr.StopFilterFactory"
+                ignoreCase="true"
+                words="stopwords.txt"
+                enablePositionIncrements="true"
+                />
+        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+        <filter class="solr.PorterStemFilterFactory"/>
+      </analyzer>
+      <analyzer type="query">
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+        <filter class="solr.StopFilterFactory"
+                ignoreCase="true"
+                words="stopwords.txt"
+                enablePositionIncrements="true"
+                />
+        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+        <filter class="solr.PorterStemFilterFactory"/>
+      </analyzer>
+    </fieldType>
+
+    <!-- Less flexible matching, but fewer false matches.  Probably not ideal for product names,
+         but may be good for SKUs.  Can insert dashes in the wrong place and still match. -->
+    <fieldType name="text_en_splitting_tight" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
+      <analyzer>
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
+        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+        <filter class="solr.EnglishMinimalStemFilterFactory"/>
+        <!-- this filter can remove any duplicate tokens that appear at the same position - sometimes
+             possible with WordDelimiterFilter in conjunction with stemming. -->
+        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
+      </analyzer>
+    </fieldType>
+
+    <!-- Just like text_general except it reverses the characters of
+	 each token, to enable more efficient leading wildcard queries. -->
+    <fieldType name="text_general_rev" class="solr.TextField" positionIncrementGap="100">
+      <analyzer type="index">
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.ReversedWildcardFilterFactory" withOriginal="true"
+           maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/>
+      </analyzer>
+      <analyzer type="query">
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
+        <filter class="solr.LowerCaseFilterFactory"/>
+      </analyzer>
+    </fieldType>
+
+    <fieldtype name="phonetic" stored="false" indexed="true" class="solr.TextField" >
+      <analyzer>
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/>
+      </analyzer>
+    </fieldtype>
+
+    <fieldtype name="payloads" stored="false" indexed="true" class="solr.TextField" >
+      <analyzer>
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <!--
+        The DelimitedPayloadTokenFilter can put payloads on tokens... for example,
+        a token of "foo|1.4"  would be indexed as "foo" with a payload of 1.4f
+        Attributes of the DelimitedPayloadTokenFilterFactory : 
+         "delimiter" - a one character delimiter. Default is | (pipe)
+	 "encoder" - how to encode the following value into a payload
+	    float -> org.apache.lucene.analysis.payloads.FloatEncoder,
+	    integer -> o.a.l.a.p.IntegerEncoder
+	    identity -> o.a.l.a.p.IdentityEncoder
+            Fully Qualified class name implementing PayloadEncoder, Encoder must have a no arg constructor.
+         -->
+        <filter class="solr.DelimitedPayloadTokenFilterFactory" encoder="float"/>
+      </analyzer>
+    </fieldtype>
+
+    <!-- lowercases the entire field value, keeping it as a single token.  -->
+    <fieldType name="lowercase" class="solr.TextField" positionIncrementGap="100">
+      <analyzer>
+        <tokenizer class="solr.KeywordTokenizerFactory"/>
+        <filter class="solr.LowerCaseFilterFactory" />
+      </analyzer>
+    </fieldType>
+
+    <fieldType name="url" class="solr.TextField" positionIncrementGap="100">
+      <analyzer>
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+           <filter class="solr.LowerCaseFilterFactory"/>
+           <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1"/>
+      </analyzer>
+    </fieldType>
+
+
+    <fieldType name="text_path" class="solr.TextField" positionIncrementGap="100">
+      <analyzer>
+        <tokenizer class="solr.PathHierarchyTokenizerFactory"/>
+      </analyzer>
+    </fieldType>
+
+    <!-- since fields of this type are by default not stored or indexed,
+         any data added to them will be ignored outright.  --> 
+    <fieldtype name="ignored" stored="false" indexed="false" multiValued="true" class="solr.StrField" />
+
+ </types>
+
+ <fields>
+    <!-- This field is used internally by Solr, for example by features 
+    like partial update functionality and update log. It is NOT required
+    if updateLog is turned off in your updateHandler, however it is advised
+    to include it, as the performance gain from omitting it is minimal. -->
+    <field name="_version_" type="long" indexed="true" stored="true"/>
+    
+    <field name="id" type="string" stored="true" indexed="true"/>
+
+    <!-- core fields -->
+    <field name="batchId" type="string" stored="true" indexed="false"/>
+    <field name="digest" type="string" stored="true" indexed="false"/>
+    <field name="boost" type="float" stored="true" indexed="false"/>
+
+    <!-- fields for index-basic plugin -->
+    <field name="host" type="url" stored="false" indexed="true"/>
+    <field name="url" type="url" stored="true" indexed="true" required="true"/>
+    <field name="orig" type="url" stored="true" indexed="true" />
+    <!-- stored=true for highlighting, use term vectors  and positions for fast highlighting -->
+    <field name="content" type="text_general" stored="true" indexed="true"/>
+    <field name="title" type="text_general" stored="true" indexed="true"/>
+    <field name="cache" type="string" stored="true" indexed="false"/>
+    <field name="tstamp" type="date" stored="true" indexed="false"/>
+
+    <!-- catch-all field -->
+    <field name="text" type="text_general" stored="false" indexed="true" multiValued="true"/>
+
+    <!-- fields for index-anchor plugin -->
+    <field name="anchor" type="text_general" stored="true" indexed="true"
+        multiValued="true"/>
+
+    <!-- fields for index-more plugin -->
+    <field name="type" type="string" stored="true" indexed="true" multiValued="true"/>
+    <field name="contentLength" type="string" stored="true" indexed="false"/>
+    <field name="lastModified" type="date" stored="true" indexed="false"/>
+    <field name="date" type="tdate" stored="true" indexed="true"/>
+
+    <!-- fields for languageidentifier plugin -->
+    <field name="lang" type="string" stored="true" indexed="true"/>
+
+    <!-- fields for subcollection plugin -->
+    <field name="subcollection" type="string" stored="true" indexed="true" multiValued="true"/>
+
+    <!-- fields for feed plugin (tag is also used by microformats-reltag)-->
+    <field name="author" type="string" stored="true" indexed="true"/>
+    <field name="tag" type="string" stored="true" indexed="true" multiValued="true"/>
+    <field name="feed" type="string" stored="true" indexed="true"/>
+    <field name="publishedDate" type="date" stored="true" indexed="true"/>
+    <field name="updatedDate" type="date" stored="true" indexed="true"/>
+
+    <!-- fields for creativecommons plugin -->
+    <field name="cc" type="string" stored="true" indexed="true" multiValued="true"/>
+
+    <!-- fields for tld plugin -->    
+    <field name="tld" type="string" stored="false" indexed="false"/>
+ </fields>
+ <uniqueKey>id</uniqueKey>
+ <defaultSearchField>text</defaultSearchField>
+ <solrQueryParser defaultOperator="OR"/>
+
+  <!-- copyField commands copy one field to another at the time a document
+        is added to the index.  It's used either to index the same field differently,
+        or to add multiple fields to the same field for easier/faster searching.  -->
+
+ <copyField source="content" dest="text"/>
+ <copyField source="url" dest="text"/>
+ <copyField source="title" dest="text"/>
+ <copyField source="anchor" dest="text"/>
+ <copyField source="author" dest="text"/>
+
 </schema>

Modified: nutch/branches/2.x/default.properties
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/default.properties?rev=1558349&r1=1558348&r2=1558349&view=diff
==============================================================================
--- nutch/branches/2.x/default.properties (original)
+++ nutch/branches/2.x/default.properties Wed Jan 15 12:01:45 2014
@@ -142,6 +142,11 @@ plugins.index=\
    org.apache.nutch.indexer.subcollection*:\
    org.apache.nutch.indexer.tld*
 
+# Indexing Backend Plugins
+#
+plugins.indexer=\
+   org.apache.nutch.indexwriter.solr*
+
 #
 # Misc. Plugins
 #

Modified: nutch/branches/2.x/ivy/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/ivy/ivy.xml?rev=1558349&r1=1558348&r2=1558349&view=diff
==============================================================================
--- nutch/branches/2.x/ivy/ivy.xml (original)
+++ nutch/branches/2.x/ivy/ivy.xml Wed Jan 15 12:01:45 2014
@@ -35,7 +35,7 @@
     <dependency org="org.elasticsearch" name="elasticsearch" rev="0.19.4" 
                 conf="*->default"/>
   
-    <dependency org="org.apache.solr" name="solr-solrj" rev="3.4.0"
+    <dependency org="org.apache.solr" name="solr-solrj" rev="4.6.0"
       conf="*->default" />
     <dependency org="org.slf4j" name="slf4j-log4j12" rev="1.6.1"
       conf="*->master" />
@@ -44,8 +44,8 @@
       conf="*->default" />
     <dependency org="commons-collections" name="commons-collections"
       rev="3.1" conf="*->default" />
-    <dependency org="commons-httpclient" name="commons-httpclient"
-      rev="3.1" conf="*->master" />
+    <dependency org="org.apache.httpcomponents" name="httpclient"
+      rev="4.2.5" conf="*->master" />
     <dependency org="commons-codec" name="commons-codec" rev="1.3"
       conf="*->default" />
 
@@ -116,7 +116,7 @@
     <dependency org="mysql" name="mysql-connector-java" rev="5.1.18" conf="*->default"/> 
     -->
     <!-- Uncomment this to use HBase as Gora backend. -->
-    <!-- 
+    <!--
     <dependency org="org.apache.gora" name="gora-hbase" rev="0.3" conf="*->default" />
     -->
     <!-- Uncomment this to use Accumulo as Gora backend. -->
@@ -124,9 +124,9 @@
     <dependency org="org.apache.gora" name="gora-accumulo" rev="0.3" conf="*->default" />
     -->
     <!-- Uncomment this to use Cassandra as Gora backend. -->
-    <!-- 
+     
     <dependency org="org.apache.gora" name="gora-cassandra" rev="0.3" conf="*->default" />
-    -->
+    
 
     <!--global exclusion -->
     <exclude module="ant" />

Modified: nutch/branches/2.x/pom.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/pom.xml?rev=1558349&r1=1558348&r2=1558349&view=diff
==============================================================================
--- nutch/branches/2.x/pom.xml (original)
+++ nutch/branches/2.x/pom.xml Wed Jan 15 12:01:45 2014
@@ -117,7 +117,7 @@
                 <dependency>
                         <groupId>org.apache.solr</groupId>
                         <artifactId>solr-solrj</artifactId>
-                        <version>3.4.0</version>
+                        <version>4.6.0</version>
                         <optional>true</optional>
                 </dependency>
                 <dependency>
@@ -139,9 +139,9 @@
                         <optional>true</optional>
                 </dependency>
                 <dependency>
-                        <groupId>commons-httpclient</groupId>
-                        <artifactId>commons-httpclient</artifactId>
-                        <version>3.1</version>
+                        <groupId>org.apache.httpcomponents</groupId>
+                        <artifactId>httpclient</artifactId>
+                        <version>4.2.5</version>
                         <optional>true</optional>
                 </dependency>
                 <dependency>

Modified: nutch/branches/2.x/src/bin/nutch
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/bin/nutch?rev=1558349&r1=1558348&r2=1558349&view=diff
==============================================================================
--- nutch/branches/2.x/src/bin/nutch (original)
+++ nutch/branches/2.x/src/bin/nutch Wed Jan 15 12:01:45 2014
@@ -56,10 +56,12 @@ if [ $# = 0 ]; then
   echo " updatehostdb   update host table after parsing"
   echo " readdb 	read/dump records from page database"
   echo " readhostdb     display entries from the hostDB"
-  echo " elasticindex   run the elasticsearch indexer"
-  echo " solrindex 	run the solr indexer on parsed batches"
+  echo " index          run the plugin-based indexer on parsed batches"
+  echo " elasticindex   run the elasticsearch indexer - DEPRECATED use the index command instead"
+  echo " solrindex 	run the solr indexer on parsed batches - DEPRECATED use the index command instead"
   echo " solrdedup 	remove duplicates from solr"
-  echo " solrclean      configurable extension to remove various documents from solr"
+  echo " solrclean      remove HTTP 301 and 404 documents from solr - DEPRECATED use the clean command instead"
+  echo " clean          remove HTTP 301 and 404 documents and duplicates from indexing backends configured via plugins"
   echo " parsechecker   check the parser for a given url"
   echo " indexchecker   check the indexing filters for a given url"
   echo " plugin 	load a plugin and run one of its classes main()"
@@ -207,11 +209,17 @@ CLASS=org.apache.nutch.host.HostDbReader
 elif [ "$COMMAND" = "elasticindex" ] ; then
 CLASS=org.apache.nutch.indexer.elastic.ElasticIndexerJob
 elif [ "$COMMAND" = "solrindex" ] ; then
-CLASS=org.apache.nutch.indexer.solr.SolrIndexerJob
+CLASS="org.apache.nutch.indexer.IndexingJob -D solr.server.url=$1"
+shift
+elif [ "$COMMAND" = "index" ] ; then
+CLASS=org.apache.nutch.indexer.IndexingJob
 elif [ "$COMMAND" = "solrdedup" ] ; then
 CLASS=org.apache.nutch.indexer.solr.SolrDeleteDuplicates
 elif [ "$COMMAND" = "solrclean" ] ; then
-CLASS=org.apache.nutch.indexer.solr.SolrClean
+  CLASS="org.apache.nutch.indexer.CleaningJob -D solr.server.url=$2 $1"
+  shift; shift
+elif [ "$COMMAND" = "clean" ] ; then
+  CLASS=org.apache.nutch.indexer.CleaningJob
 elif [ "$COMMAND" = "parsechecker" ] ; then
   CLASS=org.apache.nutch.parse.ParserChecker
 elif [ "$COMMAND" = "indexchecker" ] ; then

Modified: nutch/branches/2.x/src/java/org/apache/nutch/api/impl/RAMJobManager.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/api/impl/RAMJobManager.java?rev=1558349&r1=1558348&r2=1558349&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/api/impl/RAMJobManager.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/api/impl/RAMJobManager.java Wed Jan 15 12:01:45 2014
@@ -38,7 +38,7 @@ import org.apache.nutch.crawl.GeneratorJ
 import org.apache.nutch.crawl.InjectorJob;
 import org.apache.nutch.crawl.WebTableReader;
 import org.apache.nutch.fetcher.FetcherJob;
-import org.apache.nutch.indexer.solr.SolrIndexerJob;
+import org.apache.nutch.indexer.IndexingJob;
 import org.apache.nutch.metadata.Nutch;
 import org.apache.nutch.parse.ParserJob;
 import org.apache.nutch.util.NutchTool;
@@ -88,7 +88,7 @@ public class RAMJobManager implements Jo
   static {
     typeToClass.put(JobType.FETCH, FetcherJob.class);
     typeToClass.put(JobType.GENERATE, GeneratorJob.class);
-    typeToClass.put(JobType.INDEX, SolrIndexerJob.class);
+    typeToClass.put(JobType.INDEX, IndexingJob.class);
     typeToClass.put(JobType.INJECT, InjectorJob.class);
     typeToClass.put(JobType.PARSE, ParserJob.class);
     typeToClass.put(JobType.UPDATEDB, DbUpdaterJob.class);

Added: nutch/branches/2.x/src/java/org/apache/nutch/indexer/CleaningJob.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/CleaningJob.java?rev=1558349&view=auto
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/indexer/CleaningJob.java (added)
+++ nutch/branches/2.x/src/java/org/apache/nutch/indexer/CleaningJob.java Wed Jan 15 12:01:45 2014
@@ -0,0 +1,185 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Map;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.gora.mapreduce.GoraMapper;
+import org.apache.gora.mapreduce.StringComparator;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.RawComparator;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.hadoop.util.Tool;
+import org.apache.nutch.crawl.CrawlStatus;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.storage.StorageUtils;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.NutchTool;
+import org.apache.nutch.util.ToolUtil;
+
+/**
+ * Map/reduce job that deletes documents from every {@link IndexWriter}
+ * activated through {@link IndexWriters}.
+ * <p>
+ * The mapper selects pages whose status is {@link CrawlStatus#STATUS_GONE} or
+ * that {@link IndexCleaningFilters} asks to remove; the reducer issues one
+ * delete per selected key and, when requested, commits the deletions.
+ */
+public class CleaningJob extends NutchTool implements Tool {
+
+  /** Configuration key / argument name carrying the "commit after delete" flag. */
+  public static final String ARG_COMMIT = "commit";
+  public static final Logger LOG = LoggerFactory
+      .getLogger(CleaningJob.class);
+  private Configuration conf;
+
+  /** WebPage fields this job always reads, independent of the cleaning filters. */
+  private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
+
+  static {
+    FIELDS.add(WebPage.Field.STATUS);
+  }
+
+  @Override
+  public Configuration getConf() {
+    return conf;
+  }
+
+  @Override
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  /**
+   * Collects the WebPage fields the mapper needs: {@link #FIELDS} plus the
+   * fields reported by the configured {@link IndexCleaningFilters}.
+   *
+   * @param job job whose configuration is used to instantiate the filters
+   * @return a fresh, mutable collection of fields to load
+   */
+  public Collection<WebPage.Field> getFields(Job job) {
+    Configuration conf = job.getConfiguration();
+    Collection<WebPage.Field> columns = new HashSet<WebPage.Field>(FIELDS);
+    IndexCleaningFilters filters = new IndexCleaningFilters(conf);
+    columns.addAll(filters.getFields());
+    return columns;
+  }
+
+  /** Emits every page that has to be removed from the index. */
+  public static class CleanMapper extends
+      GoraMapper<String, WebPage, String, WebPage> {
+
+    private IndexCleaningFilters filters;
+
+    @Override
+    protected void setup(Context context) throws IOException {
+      Configuration conf = context.getConfiguration();
+      filters = new IndexCleaningFilters(conf);
+    }
+
+    /**
+     * Writes the page to the output when its status is
+     * {@link CrawlStatus#STATUS_GONE} or a cleaning filter requests removal.
+     * A filter failure is logged and the page is skipped.
+     */
+    @Override
+    public void map(String key, WebPage page, Context context)
+        throws IOException, InterruptedException {
+      try {
+        if (page.getStatus() == CrawlStatus.STATUS_GONE
+            || filters.remove(key, page)) {
+          context.write(key, page);
+        }
+      } catch (IndexingException e) {
+        LOG.warn("Error indexing " + key + ": " + e);
+      }
+    }
+  }
+
+  /** Sends one delete per key to all index writers. */
+  public static class CleanReducer extends
+      Reducer<String, WebPage, NullWritable, NullWritable> {
+    private int numDeletes = 0;
+    private static final int NUM_MAX_DELETE_REQUEST = 1000;
+    /** True when deletions must be committed at the end of the task. */
+    private boolean commit;
+    IndexWriters writers = null;
+
+    /**
+     * Opens the index writers and reads the commit flag (default
+     * {@code false}) from the job configuration.
+     */
+    @Override
+    public void setup(Context job) throws IOException {
+      Configuration conf = job.getConfiguration();
+      writers = new IndexWriters(conf);
+      try {
+        writers.open(conf);
+      } catch (IOException e) {
+        throw new RuntimeException(e);
+      }
+      commit = conf.getBoolean(ARG_COMMIT, false);
+    }
+
+    /** Deletes {@code key} from all writers and counts the deletion. */
+    @Override
+    public void reduce(String key, Iterable<WebPage> values, Context context)
+        throws IOException {
+      writers.delete(key);
+      numDeletes++;
+      context.getCounter("SolrClean", "DELETED").increment(1);
+    }
+
+    /**
+     * Closes the writers and commits when at least one delete was issued
+     * and committing was requested.
+     */
+    @Override
+    public void cleanup(Context context) throws IOException {
+      // NOTE(review): close() is called before commit(), as in the original;
+      // confirm that every IndexWriter implementation supports this order.
+      writers.close();
+      // Commit only when asked to. The previous "!commit" test was inverted:
+      // it committed for -noCommit and skipped the commit by default.
+      if (numDeletes > 0 && commit) {
+        writers.commit();
+      }
+      LOG.info("CleaningJob: deleted a total of " + numDeletes + " documents");
+    }
+  }
+
+
+  /**
+   * Configures and runs the cleaning job.
+   *
+   * @param args must contain a {@link Boolean} under {@link #ARG_COMMIT};
+   *          a missing value fails on unboxing with a NullPointerException
+   * @return the job status map filled in by {@link ToolUtil#recordJobStatus}
+   */
+  @Override
+  public Map<String, Object> run(Map<String, Object> args) throws Exception {
+    getConf().setBoolean(ARG_COMMIT, (Boolean) args.get(ARG_COMMIT));
+    currentJob = new NutchJob(getConf(), "CleaningJob");
+    currentJob.getConfiguration().setClass(
+        "mapred.output.key.comparator.class", StringComparator.class,
+        RawComparator.class);
+
+    Collection<WebPage.Field> fields = getFields(currentJob);
+    StorageUtils.initMapperJob(currentJob, fields, String.class, WebPage.class,
+        CleanMapper.class);
+    currentJob.setReducerClass(CleanReducer.class);
+    currentJob.setOutputFormatClass(NullOutputFormat.class);
+    currentJob.waitForCompletion(true);
+    ToolUtil.recordJobStatus(null, currentJob, results);
+    return results;
+  }
+
+  /**
+   * Runs the job with the given commit behaviour.
+   *
+   * @param commit whether the reducers commit their deletions
+   * @return always 0
+   */
+  public int delete(boolean commit) throws Exception {
+    LOG.info("CleaningJob: starting");
+    run(ToolUtil.toArgMap(ARG_COMMIT, commit));
+    LOG.info("CleaningJob: done");
+    return 0;
+  }
+
+  /**
+   * Command line entry point.
+   * <p>
+   * {@code -noCommit} and {@code -crawlId <id>} are accepted in any position
+   * and in any combination; unrecognised arguments are ignored. As before,
+   * an empty argument list prints the usage message and returns 1.
+   */
+  @Override
+  public int run(String[] args) throws Exception {
+    if (args.length < 1) {
+      System.err.println("Usage: CleaningJob [-crawlId <id>] [-noCommit]");
+      return 1;
+    }
+
+    boolean commit = true;
+    for (int i = 0; i < args.length; i++) {
+      if ("-noCommit".equals(args[i])) {
+        commit = false;
+      } else if ("-crawlId".equals(args[i]) && i + 1 < args.length) {
+        // consume the value that follows the option
+        getConf().set(Nutch.CRAWL_ID_KEY, args[++i]);
+      }
+    }
+
+    return delete(commit);
+  }
+
+  public static void main(String[] args) throws Exception {
+    int result = ToolRunner.run(NutchConfiguration.create(), new CleaningJob(),
+        args);
+    System.exit(result);
+  }
+
+}

Added: nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexWriter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexWriter.java?rev=1558349&view=auto
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexWriter.java (added)
+++ nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexWriter.java Wed Jan 15 12:01:45 2014
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configurable;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.plugin.Pluggable;
+
+/**
+ * Extension point for indexing backends. Implementations are instantiated as
+ * plugins and driven collectively through {@link IndexWriters}.
+ */
+public interface IndexWriter extends Configurable, Pluggable {
+
+  /** The name of the extension point. */
+  String X_POINT_ID = IndexWriter.class.getName();
+
+  /**
+   * Prepares the writer for use.
+   *
+   * @param job configuration handed over by the caller
+   * @throws IOException if the writer cannot be opened
+   */
+  void open(Configuration job) throws IOException;
+
+  /**
+   * Hands a document to the backend.
+   *
+   * @param doc the document to write
+   * @throws IOException if the document cannot be written
+   */
+  void write(NutchDocument doc) throws IOException;
+
+  /**
+   * Removes the document identified by {@code key} from the backend.
+   *
+   * @param key identifier of the document to delete
+   * @throws IOException if the delete fails
+   */
+  void delete(String key) throws IOException;
+
+  /**
+   * Hands a document to the backend as an update; how this differs from
+   * {@link #write(NutchDocument)} is left to the implementation.
+   *
+   * @param doc the document to update
+   * @throws IOException if the update fails
+   */
+  void update(NutchDocument doc) throws IOException;
+
+  /**
+   * Commits pending changes.
+   *
+   * @throws IOException if the commit fails
+   */
+  void commit() throws IOException;
+
+  /**
+   * Closes the writer.
+   *
+   * @throws IOException if closing fails
+   */
+  void close() throws IOException;
+
+  /**
+   * Returns a String describing the IndexWriter instance and the specific
+   * parameters it can take.
+   */
+  String describe();
+}

Added: nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexWriters.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexWriters.java?rev=1558349&view=auto
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexWriters.java (added)
+++ nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexWriters.java Wed Jan 15 12:01:45 2014
@@ -0,0 +1,147 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer;
+
+import java.io.IOException;
+import java.util.HashMap;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.ExtensionPoint;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.plugin.PluginRuntimeException;
+import org.apache.nutch.util.ObjectCache;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/** Creates and caches {@link IndexWriter} implementing plugins. */
+public class IndexWriters {
+
+  public final static Logger LOG = LoggerFactory
+      .getLogger(IndexWriters.class);
+
+  private IndexWriter[] indexWriters;
+
+  public IndexWriters(Configuration conf) {
+    ObjectCache objectCache = ObjectCache.get(conf);
+    synchronized (objectCache) {
+      this.indexWriters = (IndexWriter[]) objectCache
+          .getObject(IndexWriter.class.getName());
+      if (this.indexWriters == null) {
+        try {
+          ExtensionPoint point = PluginRepository.get(conf)
+              .getExtensionPoint(IndexWriter.X_POINT_ID);
+          if (point == null)
+            throw new RuntimeException(IndexWriter.X_POINT_ID
+                + " not found.");
+          Extension[] extensions = point.getExtensions();
+          HashMap<String, IndexWriter> indexerMap = new HashMap<String, IndexWriter>();
+          for (int i = 0; i < extensions.length; i++) {
+            Extension extension = extensions[i];
+            IndexWriter writer = (IndexWriter) extension
+                .getExtensionInstance();
+            LOG.info("Adding " + writer.getClass().getName());
+            if (!indexerMap.containsKey(writer.getClass().getName())) {
+              indexerMap.put(writer.getClass().getName(), writer);
+            }
+          }
+          objectCache.setObject(IndexWriter.class.getName(), indexerMap
+              .values().toArray(new IndexWriter[0]));
+        } catch (PluginRuntimeException e) {
+          throw new RuntimeException(e);
+        }
+        this.indexWriters = (IndexWriter[]) objectCache
+            .getObject(IndexWriter.class.getName());
+      }
+    }
+  }
+
+  public void open(Configuration conf) throws IOException {
+    for (int i = 0; i < this.indexWriters.length; i++) {
+      try {
+        this.indexWriters[i].open(conf);
+      } catch (IOException ioe) {
+        throw ioe;
+      }
+    }
+  }
+
+  public void write(NutchDocument doc) throws IOException {
+    for (int i = 0; i < this.indexWriters.length; i++) {
+      try {
+        this.indexWriters[i].write(doc);
+      } catch (IOException ioe) {
+        throw ioe;
+      }
+    }
+  }
+
+  public void update(NutchDocument doc) throws IOException {
+    for (int i = 0; i < this.indexWriters.length; i++) {
+      try {
+        this.indexWriters[i].update(doc);
+      } catch (IOException ioe) {
+        throw ioe;
+      }
+    }
+  }
+
+  public void delete(String key) throws IOException {
+    for (int i = 0; i < this.indexWriters.length; i++) {
+      try {
+        this.indexWriters[i].delete(key);
+      } catch (IOException ioe) {
+        throw ioe;
+      }
+    }
+  }
+
+  public void close() throws IOException {
+    for (int i = 0; i < this.indexWriters.length; i++) {
+      try {
+        this.indexWriters[i].close();
+      } catch (IOException ioe) {
+        throw ioe;
+      }
+    }
+  }
+
+  public void commit() throws IOException {
+    for (int i = 0; i < this.indexWriters.length; i++) {
+      try {
+        this.indexWriters[i].commit();
+      } catch (IOException ioe) {
+        throw ioe;
+      }
+    }
+  }
+
+  // lists the active IndexWriters and their configuration
+  public String describe() throws IOException {
+    StringBuffer buffer = new StringBuffer();
+    if (this.indexWriters.length == 0)
+      buffer.append("No IndexWriters activated - check your configuration\n");
+    else
+      buffer.append("Active IndexWriters :\n");
+    for (int i = 0; i < this.indexWriters.length; i++) {
+      buffer.append(this.indexWriters[i].describe()).append("\n");
+    }
+    return buffer.toString();
+  }
+
+}
\ No newline at end of file

Modified: nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexerOutputFormat.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexerOutputFormat.java?rev=1558349&r1=1558348&r2=1558349&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexerOutputFormat.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexerOutputFormat.java Wed Jan 15 12:01:45 2014
@@ -30,29 +30,29 @@ public class IndexerOutputFormat extends
   public RecordWriter<String, NutchDocument> getRecordWriter(
       TaskAttemptContext job) throws IOException, InterruptedException {
 
-    final NutchIndexWriter[] writers =
-      NutchIndexWriterFactory.getNutchIndexWriters(job.getConfiguration());
-
-    for (final NutchIndexWriter writer : writers) {
-      writer.open(job);
-    }
+    //final IndexWriter[] writers =
+    //  NutchIndexWriterFactory.getNutchIndexWriters(job.getConfiguration());
 
+    final IndexWriters writers = new IndexWriters(job.getConfiguration());
+    
+//    for (final IndexWriter writer : writers) {
+//      writer.open(job);
+//    }
+    writers.open(job.getConfiguration());
+    
     return new RecordWriter<String, NutchDocument>() {
 
       @Override
       public void write(String key, NutchDocument doc) throws IOException {
-        for (final NutchIndexWriter writer : writers) {
-          writer.write(doc);
-        }
+        // TODO: Check Write Status for delete or write.  
+        writers.write(doc);
       }
 
       @Override
       public void close(TaskAttemptContext context) throws IOException,
       InterruptedException {
-        for (final NutchIndexWriter writer : writers) {
-          writer.close();
+          writers.close();
         }
-      }
     };
   }
 

Added: nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingJob.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingJob.java?rev=1558349&view=auto
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingJob.java (added)
+++ nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingJob.java Wed Jan 15 12:01:45 2014
@@ -0,0 +1,200 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Map;
+
+import org.apache.avro.util.Utf8;
+import org.apache.gora.mapreduce.GoraMapper;
+import org.apache.gora.mapreduce.StringComparator;
+import org.apache.gora.store.DataStore;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.RawComparator;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.crawl.GeneratorJob;
+import org.apache.nutch.indexer.solr.SolrConstants;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.parse.ParseStatusCodes;
+import org.apache.nutch.parse.ParseStatusUtils;
+import org.apache.nutch.scoring.ScoringFilters;
+import org.apache.nutch.storage.Mark;
+import org.apache.nutch.storage.ParseStatus;
+import org.apache.nutch.storage.StorageUtils;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.NutchTool;
+import org.apache.nutch.util.TableUtil;
+import org.apache.nutch.util.ToolUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class IndexingJob extends NutchTool implements Tool {
+
+  public static Logger LOG = LoggerFactory.getLogger(IndexingJob.class);
+
+  private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
+
+  private static final Utf8 REINDEX = new Utf8("-reindex");
+
+  static {
+    FIELDS.add(WebPage.Field.SIGNATURE);
+    FIELDS.add(WebPage.Field.PARSE_STATUS);
+    FIELDS.add(WebPage.Field.SCORE);
+    FIELDS.add(WebPage.Field.MARKERS);
+  }
+
+  public static class IndexerMapper extends
+      GoraMapper<String, WebPage, String, NutchDocument> {
+    public IndexUtil indexUtil;
+    public DataStore<String, WebPage> store;
+
+    protected Utf8 batchId;
+
+    @Override
+    public void setup(Context context) throws IOException {
+      Configuration conf = context.getConfiguration();
+      batchId = new Utf8(
+          conf.get(GeneratorJob.BATCH_ID, Nutch.ALL_BATCH_ID_STR));
+      indexUtil = new IndexUtil(conf);
+      try {
+        store = StorageUtils.createWebStore(conf, String.class, WebPage.class);
+      } catch (ClassNotFoundException e) {
+        throw new IOException(e);
+      }
+    }
+
+    protected void cleanup(Context context) throws IOException,
+        InterruptedException {
+      store.close();
+    };
+
+    @Override
+    public void map(String key, WebPage page, Context context)
+        throws IOException, InterruptedException {
+      ParseStatus pstatus = page.getParseStatus();
+      if (pstatus == null || !ParseStatusUtils.isSuccess(pstatus)
+          || pstatus.getMinorCode() == ParseStatusCodes.SUCCESS_REDIRECT) {
+        return; // filter urls not parsed
+      }
+
+      Utf8 mark = Mark.UPDATEDB_MARK.checkMark(page);
+      if (!batchId.equals(REINDEX)) {
+        if (!NutchJob.shouldProcess(mark, batchId)) {
+          if (LOG.isDebugEnabled()) {
+            LOG.debug("Skipping " + TableUtil.unreverseUrl(key)
+                + "; different batch id (" + mark + ")");
+          }
+          return;
+        }
+      }
+
+      NutchDocument doc = indexUtil.index(key, page);
+      if (doc == null) {
+        return;
+      }
+      if (mark != null) {
+        Mark.INDEX_MARK.putMark(page, Mark.UPDATEDB_MARK.checkMark(page));
+        store.put(key, page);
+      }
+      context.write(key, doc);
+      context.getCounter("IndexerJob", "DocumentCount").increment(1);
+    }
+  }
+
+  private static Collection<WebPage.Field> getFields(Job job) {
+    Configuration conf = job.getConfiguration();
+    Collection<WebPage.Field> columns = new HashSet<WebPage.Field>(FIELDS);
+    IndexingFilters filters = new IndexingFilters(conf);
+    columns.addAll(filters.getFields());
+    ScoringFilters scoringFilters = new ScoringFilters(conf);
+    columns.addAll(scoringFilters.getFields());
+    return columns;
+  }
+
+  @Override
+  public Map<String, Object> run(Map<String, Object> args) throws Exception {
+    String batchId = (String) args.get(Nutch.ARG_BATCH);
+
+    Configuration conf = getConf();
+    conf.set(GeneratorJob.BATCH_ID, batchId);
+
+    Job job = new NutchJob(conf, "Indexer");
+    // TODO: Figure out why this needs to be here
+    job.getConfiguration().setClass("mapred.output.key.comparator.class",
+        StringComparator.class, RawComparator.class);
+
+    Collection<WebPage.Field> fields = getFields(job);
+    StorageUtils.initMapperJob(job, fields, String.class, NutchDocument.class,
+        IndexerMapper.class);
+    job.setNumReduceTasks(0);
+    job.setOutputFormatClass(IndexerOutputFormat.class);
+
+    job.waitForCompletion(true);
+    ToolUtil.recordJobStatus(null, job, results);
+    return results;
+  }
+
+  public void index(String batchId) throws Exception {
+    LOG.info("IndexingJob: starting");
+
+    run(ToolUtil.toArgMap(Nutch.ARG_BATCH, batchId));
+    // NOW PASSED ON THE COMMAND LINE AS A HADOOP PARAM
+    // do the commits once and for all the reducers in one go
+    // getConf().set(SolrConstants.SERVER_URL,solrUrl);
+
+    IndexWriters writers = new IndexWriters(getConf());
+    LOG.info(writers.describe());
+    
+    writers.open(getConf());
+    if (getConf().getBoolean(SolrConstants.COMMIT_INDEX, true)) {
+      writers.commit();
+    }
+    LOG.info("IndexingJob: done.");
+  }
+
+  public int run(String[] args) throws Exception {
+    if (args.length < 1) {
+      System.err
+          .println("Usage: IndexingJob (<batchId> | -all | -reindex) [-crawlId <id>]");
+      return -1;
+    }
+
+    if (args.length == 3 && "-crawlId".equals(args[1])) {
+      getConf().set(Nutch.CRAWL_ID_KEY, args[2]);
+    }
+    try {
+      index(args[0]);
+      return 0;
+    } catch (final Exception e) {
+      LOG.error("SolrIndexerJob: " + StringUtils.stringifyException(e));
+      return -1;
+    }
+  }
+
+  public static void main(String[] args) throws Exception {
+    final int res = ToolRunner.run(NutchConfiguration.create(),
+        new IndexingJob(), args);
+    System.exit(res);
+  }
+}

Modified: nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java?rev=1558349&r1=1558348&r2=1558349&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java Wed Jan 15 12:01:45 2014
@@ -45,7 +45,7 @@ import org.apache.nutch.util.NutchConfig
 import org.apache.solr.client.solrj.SolrQuery;
 import org.apache.solr.client.solrj.SolrServer;
 import org.apache.solr.client.solrj.SolrServerException;
-import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
+import org.apache.solr.client.solrj.impl.HttpSolrServer;
 import org.apache.solr.client.solrj.request.UpdateRequest;
 import org.apache.solr.client.solrj.response.QueryResponse;
 import org.apache.solr.common.SolrDocument;
@@ -241,7 +241,7 @@ implements Tool {
     throws IOException, InterruptedException {
       Configuration conf = context.getConfiguration();
       int numSplits = context.getNumReduceTasks();
-      SolrServer solr = SolrUtils.getCommonsHttpSolrServer(conf);
+      SolrServer solr = SolrUtils.getHttpSolrServer(conf);
 
       final SolrQuery solrQuery = new SolrQuery(SOLR_GET_ALL_QUERY);
       solrQuery.setFields(SolrConstants.ID_FIELD);
@@ -271,7 +271,7 @@ implements Tool {
     public RecordReader<Text, SolrRecord> createRecordReader(InputSplit split,
         TaskAttemptContext context) throws IOException, InterruptedException {
       Configuration conf = context.getConfiguration();
-      SolrServer solr = SolrUtils.getCommonsHttpSolrServer(conf);
+      SolrServer solr = SolrUtils.getHttpSolrServer(conf);
       SolrInputSplit solrSplit = (SolrInputSplit) split;
       final int numDocs = (int) solrSplit.getLength();
       
@@ -315,11 +315,7 @@ implements Tool {
   @Override
   public void setup(Context job) throws IOException {
     Configuration conf = job.getConfiguration();
-    try {
-      solr = SolrUtils.getCommonsHttpSolrServer(conf);
-    } catch (MalformedURLException e) {
-      throw new IOException(e);
-    }
+    solr = SolrUtils.getHttpSolrServer(conf);
   }
 
 

Modified: nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrUtils.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrUtils.java?rev=1558349&r1=1558348&r2=1558349&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrUtils.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrUtils.java Wed Jan 15 12:01:45 2014
@@ -1,23 +1,25 @@
 package org.apache.nutch.indexer.solr;
 
 
-import org.apache.commons.httpclient.HttpClient;
-import org.apache.commons.httpclient.auth.AuthScope;
-import org.apache.commons.httpclient.UsernamePasswordCredentials;
-import org.apache.commons.httpclient.params.HttpClientParams;
+import org.apache.http.impl.client.DefaultHttpClient;
+import org.apache.http.auth.AuthScope;
+import org.apache.http.auth.UsernamePasswordCredentials;
+import org.apache.http.client.params.HttpClientParams;
+import org.apache.http.params.HttpParams;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.apache.hadoop.conf.Configuration;
-import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
+import org.apache.nutch.indexer.solr.SolrConstants;
+import org.apache.solr.client.solrj.impl.HttpSolrServer;
 
 import java.net.MalformedURLException;
 
 public class SolrUtils {
 
-  public static Logger LOG = LoggerFactory.getLogger(SolrIndexerJob.class);
+  public static Logger LOG = LoggerFactory.getLogger(SolrUtils.class);
 
-  public static CommonsHttpSolrServer getCommonsHttpSolrServer(Configuration job) throws MalformedURLException {
-    HttpClient client=new HttpClient();
+  public static HttpSolrServer getHttpSolrServer(Configuration job) throws MalformedURLException {
+    DefaultHttpClient client = new DefaultHttpClient();
 
     // Check for username/password
     if (job.getBoolean(SolrConstants.USE_AUTH, false)) {
@@ -27,15 +29,16 @@ public class SolrUtils {
 
       AuthScope scope = new AuthScope(AuthScope.ANY_HOST, AuthScope.ANY_PORT, AuthScope.ANY_REALM, AuthScope.ANY_SCHEME);
 
-      client.getState().setCredentials(scope, new UsernamePasswordCredentials(username, job.get(SolrConstants.PASSWORD)));
+      client.getCredentialsProvider().setCredentials(scope, 
+          new UsernamePasswordCredentials(username, job.get(SolrConstants.PASSWORD)));
 
-      HttpClientParams params = client.getParams();
-      params.setAuthenticationPreemptive(true);
+      HttpParams params = client.getParams();
+      HttpClientParams.setAuthenticating(params, true);
 
       client.setParams(params);
     }
 
-    return new CommonsHttpSolrServer(job.get(SolrConstants.SERVER_URL), client);
+    return new HttpSolrServer(job.get(SolrConstants.SERVER_URL), client);
   }
 
   public static String stripNonCharCodepoints(String input) {

Modified: nutch/branches/2.x/src/plugin/build.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/build.xml?rev=1558349&r1=1558348&r2=1558349&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/build.xml (original)
+++ nutch/branches/2.x/src/plugin/build.xml Wed Jan 15 12:01:45 2014
@@ -30,6 +30,7 @@
      <ant dir="index-anchor" target="deploy"/>
      <ant dir="index-basic" target="deploy"/>
      <ant dir="index-more" target="deploy"/>
+     <ant dir="indexer-solr" target="deploy"/>
      <ant dir="language-identifier" target="deploy"/>
      <ant dir="lib-http" target="deploy"/>
      <ant dir="lib-nekohtml" target="deploy"/>
@@ -110,6 +111,7 @@
     <ant dir="index-anchor" target="clean"/>
     <ant dir="index-basic" target="clean"/>
     <ant dir="index-more" target="clean"/>
+    <ant dir="indexer-solr" target="clean"/>
     <ant dir="language-identifier" target="clean"/>
     <ant dir="lib-http" target="clean"/>
     <ant dir="lib-nekohtml" target="clean"/>

Added: nutch/branches/2.x/src/plugin/indexer-solr/build.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/indexer-solr/build.xml?rev=1558349&view=auto
==============================================================================
--- nutch/branches/2.x/src/plugin/indexer-solr/build.xml (added)
+++ nutch/branches/2.x/src/plugin/indexer-solr/build.xml Wed Jan 15 12:01:45 2014
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!-- Ant build file of the indexer-solr plugin. It defines no targets of its
+     own; everything, presumably including the default "jar-core" target,
+     comes from the imported ../build-plugin.xml. -->
+<project name="indexer-solr" default="jar-core">
+
+  <import file="../build-plugin.xml" />
+
+</project>

Added: nutch/branches/2.x/src/plugin/indexer-solr/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/indexer-solr/ivy.xml?rev=1558349&view=auto
==============================================================================
--- nutch/branches/2.x/src/plugin/indexer-solr/ivy.xml (added)
+++ nutch/branches/2.x/src/plugin/indexer-solr/ivy.xml Wed Jan 15 12:01:45 2014
@@ -0,0 +1,43 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<!-- Ivy descriptor of the indexer-solr plugin: declares the module, reuses
+     the configurations from ivy/ivy-configurations.xml and lists the
+     plugin's single direct dependency. -->
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+   <!-- SolrJ client, revision 4.6.0; plugin.xml of this plugin names the
+        matching solr-solrj-4.6.0.jar, so keep the two in sync. -->
+   <dependency org="org.apache.solr" name="solr-solrj" rev="4.6.0"
+		conf="*->default"/>
+  </dependencies>
+  
+</ivy-module>

Added: nutch/branches/2.x/src/plugin/indexer-solr/plugin.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/indexer-solr/plugin.xml?rev=1558349&view=auto
==============================================================================
--- nutch/branches/2.x/src/plugin/indexer-solr/plugin.xml (added)
+++ nutch/branches/2.x/src/plugin/indexer-solr/plugin.xml Wed Jan 15 12:01:45 2014
@@ -0,0 +1,54 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<!-- Plugin descriptor for the Solr index writer: lists the jars of the
+     plugin and registers SolrIndexWriter at the
+     org.apache.nutch.indexer.IndexWriter extension point. -->
+<plugin id="indexer-solr" name="SOLRIndexWriter" version="1.0.0"
+  provider-name="nutch.apache.org">
+
+  <runtime>
+    <!-- The plugin's own jar; all of its names are exported. -->
+    <library name="indexer-solr.jar">
+      <export name="*" />
+    </library>
+
+     <!-- Third-party jars listed for the plugin. Entries that are commented
+          out are not declared here (NOTE(review): presumably supplied by the
+          core classpath; confirm). -->
+     <library name="activation-1.1.jar"/>
+     <!--library name="commons-codec-1.4.jar"/-->
+     <library name="httpclient-4.2.5.jar"/>
+     <!--library name="commons-io-1.4.jar"/-->
+     <library name="commons-logging-1.1.1.jar"/>
+     <library name="geronimo-stax-api_1.0_spec-1.0.1.jar"/>
+     <library name="jline-0.9.1.jar"/>
+     <library name="log4j-1.2.15.jar"/>
+     <!--library name="lucene-core-4.4.0.jar"/-->
+     <library name="mail-1.4.1.jar"/>
+     <!--library name="slf4j-api-1.6.1.jar"/-->
+     <library name="solr-solrj-4.6.0.jar"/>
+     <library name="stax-api-1.0.1.jar"/>
+     <library name="wstx-asl-3.2.7.jar"/>
+     <!--library name="zookeeper-3.3.1.jar"/-->
+  </runtime>
+
+  <requires>
+    <import plugin="nutch-extensionpoints" />
+  </requires>
+
+  <!-- Binds the implementation class to the IndexWriter extension point. -->
+  <extension id="org.apache.nutch.indexwriter.solr"
+    name="SOLR Index Writer"
+    point="org.apache.nutch.indexer.IndexWriter">
+    <implementation id="SOLRIndexWriter"
+      class="org.apache.nutch.indexwriter.solr.SolrIndexWriter" />
+  </extension>
+
+</plugin>

Added: nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java?rev=1558349&view=auto
==============================================================================
--- nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java (added)
+++ nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java Wed Jan 15 12:01:45 2014
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexwriter.solr;
+
+/**
+ * String constants shared by the Solr index writer plugin: configuration
+ * property names (all starting with {@link #SOLR_PREFIX}) followed by a set
+ * of field names.
+ * <p>
+ * Interface fields are implicitly {@code public static final}, so the
+ * modifiers are not repeated on each declaration.
+ */
+public interface SolrConstants {
+
+  /** Prefix common to every configuration property declared here. */
+  String SOLR_PREFIX = "solr.";
+
+  // ---- configuration property names ----
+
+  /** Property holding the URL of the Solr server. */
+  String SERVER_URL = SOLR_PREFIX + "server.url";
+
+  /** Property holding the number of documents buffered per request. */
+  String COMMIT_SIZE = SOLR_PREFIX + "commit.size";
+
+  /** Property name {@code solr.commit.index}. */
+  String COMMIT_INDEX = SOLR_PREFIX + "commit.index";
+
+  /** Property holding the name of the field mapping file. */
+  String MAPPING_FILE = SOLR_PREFIX + "mapping.file";
+
+  /** Property switching authentication on or off. */
+  String USE_AUTH = SOLR_PREFIX + "auth";
+
+  /** Property holding the user name used for authentication. */
+  String USERNAME = SOLR_PREFIX + "auth.username";
+
+  /** Property holding the password used for authentication. */
+  String PASSWORD = SOLR_PREFIX + "auth.password";
+
+  // ---- field names ----
+
+  String ID_FIELD = "id";
+
+  String URL_FIELD = "url";
+
+  String BOOST_FIELD = "boost";
+
+  String TIMESTAMP_FIELD = "tstamp";
+
+  String DIGEST_FIELD = "digest";
+
+}

Added: nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java?rev=1558349&view=auto
==============================================================================
--- nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java (added)
+++ nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java Wed Jan 15 12:01:45 2014
@@ -0,0 +1,179 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexwriter.solr;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map.Entry;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.indexer.IndexWriter;
+import org.apache.solr.client.solrj.SolrServer;
+import org.apache.solr.client.solrj.SolrServerException;
+import org.apache.solr.client.solrj.impl.HttpSolrServer;
+import org.apache.solr.common.SolrInputDocument;
+
+/**
+ * {@link IndexWriter} implementation that sends {@link NutchDocument}s to a
+ * Solr server.
+ * <p>
+ * Documents are converted to {@link SolrInputDocument}s and buffered; the
+ * buffer is sent to Solr whenever it reaches the configured batch size and
+ * once more in {@link #close()}. Field names are translated through the
+ * {@link SolrMappingReader} obtained in {@link #open(Configuration)}.
+ * <p>
+ * The class keeps unsynchronized mutable state (the buffer and counters), so
+ * an instance is not safe for concurrent use.
+ */
+public class SolrIndexWriter implements IndexWriter {
+
+  public static final Logger LOG = LoggerFactory
+      .getLogger(SolrIndexWriter.class);
+
+  private HttpSolrServer solr;
+  private SolrMappingReader solrMapping;
+
+  private Configuration config;
+
+  /** Documents converted by {@link #write(NutchDocument)} but not yet sent. */
+  private final List<SolrInputDocument> inputDocs = new ArrayList<SolrInputDocument>();
+
+  /** Buffer size that triggers sending the buffered documents. */
+  private int batchSize;
+  /** Number of delete requests issued by {@link #delete(String)}. */
+  private int numDeletes = 0;
+  // NOTE(review): nothing in this class ever sets this flag to true, so
+  // delete(String) currently never contacts Solr. Confirm whether it is
+  // meant to be read from the configuration in open().
+  private boolean delete = false;
+
+  /**
+   * Number of documents passed to {@link #write(NutchDocument)}. Static, so
+   * the count is shared by all instances in the JVM and is never reset here.
+   */
+  protected static long documentCount = 0;
+
+  /**
+   * Obtains the Solr client via {@code SolrUtils.getHttpSolrServer}, reads
+   * the batch size ({@link SolrConstants#COMMIT_SIZE}, default 1000) and
+   * loads the field mapping.
+   *
+   * @param conf
+   *          configuration to read the settings from
+   */
+  @Override
+  public void open(Configuration conf) throws IOException {
+    solr = SolrUtils.getHttpSolrServer(conf);
+    batchSize = conf.getInt(SolrConstants.COMMIT_SIZE, 1000);
+    solrMapping = SolrMappingReader.getInstance(conf);
+  }
+
+  /**
+   * Converts the document, appends it to the buffer and sends the buffer to
+   * Solr once it holds at least {@code batchSize} documents.
+   * <p>
+   * Values of the {@code content} and {@code title} fields are passed through
+   * {@code SolrUtils.stripNonCharCodepoints} first. Every value is added
+   * under the mapped field name and, when the copy mapping yields a different
+   * name, under that name as well.
+   *
+   * @param doc
+   *          document to index
+   * @throws IOException
+   *           if sending a full buffer to Solr fails
+   */
+  @Override
+  public void write(NutchDocument doc) throws IOException {
+    final SolrInputDocument inputDoc = new SolrInputDocument();
+    for (final Entry<String, List<String>> e : doc) {
+      final String key = e.getKey();
+      final boolean strip = "content".equals(key) || "title".equals(key);
+      for (final String val : e.getValue()) {
+        final Object val2 = strip ? SolrUtils.stripNonCharCodepoints(val)
+            : val;
+
+        inputDoc.addField(solrMapping.mapKey(key), val2);
+        final String sCopy = solrMapping.mapCopyKey(key);
+        // Compare by value: reference comparison only works by accident when
+        // the mapper happens to hand back the very same String instance.
+        if (!key.equals(sCopy)) {
+          inputDoc.addField(sCopy, val2);
+        }
+      }
+    }
+    inputDoc.setDocumentBoost(doc.getScore());
+    inputDocs.add(inputDoc);
+    documentCount++;
+    if (inputDocs.size() >= batchSize) {
+      flush();
+    }
+  }
+
+  /**
+   * Sends any documents still buffered and logs the number of deletes issued.
+   * Does not commit; see {@link #commit()}.
+   *
+   * @throws IOException
+   *           if sending the remaining documents fails
+   */
+  @Override
+  public void close() throws IOException {
+    if (!inputDocs.isEmpty()) {
+      flush();
+    }
+    // Reported independently of whether documents were pending.
+    if (numDeletes > 0) {
+      LOG.info("Deleted {} documents", numDeletes);
+    }
+  }
+
+  @Override
+  public Configuration getConf() {
+    return config;
+  }
+
+  /**
+   * Stores the configuration and verifies that the Solr URL is set.
+   *
+   * @param conf
+   *          configuration to use
+   * @throws RuntimeException
+   *           if {@link SolrConstants#SERVER_URL} is not configured; the
+   *           message includes the output of {@link #describe()}
+   */
+  @Override
+  public void setConf(Configuration conf) {
+    config = conf;
+    String serverURL = conf.get(SolrConstants.SERVER_URL);
+    if (serverURL == null) {
+      String message = "Missing SOLR URL. Should be set via -D "
+          + SolrConstants.SERVER_URL;
+      message += "\n" + describe();
+      LOG.error(message);
+      throw new RuntimeException(message);
+    }
+  }
+
+  /**
+   * Deletes the document with the given id from Solr, but only while the
+   * internal {@code delete} flag is set; otherwise the call does nothing.
+   *
+   * @param key
+   *          id of the document to delete
+   * @throws IOException
+   *           if Solr rejects the request
+   */
+  @Override
+  public void delete(String key) throws IOException {
+    if (!delete) {
+      return;
+    }
+    try {
+      solr.deleteById(key);
+      numDeletes++;
+    } catch (final SolrServerException e) {
+      throw makeIOException(e);
+    }
+  }
+
+  /** Handled exactly like {@link #write(NutchDocument)}. */
+  @Override
+  public void update(NutchDocument doc) throws IOException {
+    write(doc);
+  }
+
+  /**
+   * Issues a commit on the Solr server and logs the running document count.
+   *
+   * @throws IOException
+   *           if the commit fails
+   */
+  @Override
+  public void commit() throws IOException {
+    try {
+      solr.commit();
+    } catch (final SolrServerException e) {
+      throw makeIOException(e);
+    }
+    LOG.info("Total " + documentCount
+        + (documentCount == 1 ? " document is " : " documents are ")
+        + "added.");
+  }
+
+  /**
+   * Wraps a {@link SolrServerException} in an {@link IOException} that keeps
+   * the original exception as its cause.
+   *
+   * @param e
+   *          exception to wrap
+   * @return the wrapping exception
+   */
+  public static IOException makeIOException(SolrServerException e) {
+    return new IOException(e);
+  }
+
+  /**
+   * Returns a human-readable list of the configuration properties this
+   * writer understands, one per line.
+   */
+  @Override
+  public String describe() {
+    StringBuilder sb = new StringBuilder("SOLRIndexWriter\n");
+    sb.append("\t").append(SolrConstants.SERVER_URL)
+        .append(" : URL of the SOLR instance (mandatory)\n");
+    sb.append("\t").append(SolrConstants.COMMIT_SIZE)
+        .append(" : buffer size when sending to SOLR (default 1000)\n");
+    sb.append("\t")
+        .append(SolrConstants.MAPPING_FILE)
+        .append(
+            " : name of the mapping file for fields (default solrindex-mapping.xml)\n");
+    sb.append("\t").append(SolrConstants.USE_AUTH)
+        .append(" : use authentication (default false)\n");
+    sb.append("\t").append(SolrConstants.USERNAME)
+        .append(" : username for authentication\n");
+    sb.append("\t").append(SolrConstants.PASSWORD)
+        .append(" : password for authentication\n");
+    return sb.toString();
+  }
+
+  /**
+   * Sends the buffered documents to Solr and empties the buffer. The buffer
+   * is left untouched when the request fails.
+   */
+  private void flush() throws IOException {
+    LOG.info("Adding {} documents", inputDocs.size());
+    try {
+      solr.add(inputDocs);
+    } catch (final SolrServerException e) {
+      throw makeIOException(e);
+    }
+    inputDocs.clear();
+  }
+
+}