You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2014/11/01 17:42:32 UTC
svn commit: r1635988 - in /nutch/branches/2.x: ./ conf/ ivy/ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/parse/ src/java/org/apache/nutch/storage/

Author: lewismc
Date: Sat Nov  1 16:42:32 2014
New Revision: 1635988

URL: http://svn.apache.org/r1635988
Log:
Upgrade to Gora 0.5

Added:
    nutch/branches/2.x/conf/gora-mongodb-mapping.xml
    nutch/branches/2.x/conf/gora-solr-host-schema.xml
    nutch/branches/2.x/conf/gora-solr-mapping.xml
    nutch/branches/2.x/conf/gora-solr-webpage-schema.xml
Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/conf/gora.properties
    nutch/branches/2.x/ivy/ivy.xml
    nutch/branches/2.x/src/java/org/apache/nutch/crawl/WebTableReader.java
    nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java
    nutch/branches/2.x/src/java/org/apache/nutch/storage/Host.java
    nutch/branches/2.x/src/java/org/apache/nutch/storage/ParseStatus.java
    nutch/branches/2.x/src/java/org/apache/nutch/storage/ProtocolStatus.java
    nutch/branches/2.x/src/java/org/apache/nutch/storage/WebPage.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1635988&r1=1635987&r2=1635988&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Sat Nov  1 16:42:32 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development 2.3-SNAPSHOT
 
+* NUTCH-1843 Upgrade to Gora 0.5 (talat, lewismc, Kiril Menshikov, drazzib)
+
 * NUTCH-1883 bin/crawl: use function to run bin/nutch and check exit value (snagel)
 
 * NUTCH-1882 ant eclipse target to add output path to src/test (snagel)

Added: nutch/branches/2.x/conf/gora-mongodb-mapping.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/gora-mongodb-mapping.xml?rev=1635988&view=auto
==============================================================================
--- nutch/branches/2.x/conf/gora-mongodb-mapping.xml (added)
+++ nutch/branches/2.x/conf/gora-mongodb-mapping.xml Sat Nov  1 16:42:32 2014
@@ -0,0 +1,61 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<!--
+  More information on gora-mongodb configuration and mappings can be found
+  at http://gora.apache.org/current/gora-mongodb.html
+-->
+<gora-otd>
+
+    <class name="org.apache.nutch.storage.WebPage" keyClass="java.lang.String" document="webpage">
+        <!-- fetch fields -->
+        <field name="baseUrl" docfield="baseUrl" type="string"/>
+        <field name="status" docfield="status" type="int32"/>
+        <field name="prevFetchTime" docfield="prevFetchTime" type="int64"/>
+        <field name="fetchTime" docfield="fetchTime" type="int64"/>
+        <field name="fetchInterval" docfield="fetchInterval" type="int64"/>
+        <field name="retriesSinceFetch" docfield="retriesSinceFetch" type="int32"/>
+        <field name="reprUrl" docfield="reprUrl" type="string"/>
+        <field name="content" docfield="content" type="binary"/>
+        <field name="contentType" docfield="contentType" type="string"/>
+        <field name="modifiedTime" docfield="modifiedTime" type="int64"/>
+        <field name="prevModifiedTime" docfield="prevModifiedTime" type="int64"/>
+        <field name="batchId" docfield="batchId" type="string"/>
+        
+        <!-- parse fields -->
+        <field name="title" docfield="title" type="string"/>
+        <field name="text" docfield="text" type="binary"/>
+        <field name="signature" docfield="signature" type="string"/>
+        <field name="prevSignature" docfield="prevSignature" type="string"/>
+        <!-- score fields -->
+        <field name="score" docfield="score" type="int32"/>
+        <field name="headers" docfield="headers" type="document"/>
+        <field name="inlinks" docfield="inlinks" type="document"/>
+        <field name="outlinks" docfield="outlinks" type="document"/>
+        <field name="metadata" docfield="metadata" type="document"/>
+        <field name="markers" docfield="markers" type="document"/>
+        <field name="parseStatus" docfield="parseStatus" type="document"/>
+        <field name="protocolStatus" docfield="protocolStatus" type="document"/>
+    </class>
+    
+    <class name="org.apache.nutch.storage.Host" keyClass="java.lang.String" document="host">
+        <field name="metadata" docfield="metadata" type="document"/>
+        <field name="inlinks" docfield="inlinks" type="document"/>
+        <field name="outlinks" docfield="outlinks" type="document"/>
+    </class>
+
+</gora-otd>

Added: nutch/branches/2.x/conf/gora-solr-host-schema.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/gora-solr-host-schema.xml?rev=1635988&view=auto
==============================================================================
--- nutch/branches/2.x/conf/gora-solr-host-schema.xml (added)
+++ nutch/branches/2.x/conf/gora-solr-host-schema.xml Sat Nov  1 16:42:32 2014
@@ -0,0 +1,331 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!--
+    Description: 
+    This document contains Solr 4.x schema definition to enable Gora-Solr 
+    integration currently built into Nutch 2.3 onwards.
+    This schema is not minimal; there are some useful field type definitions left,
+    and the set of fields and their flags (indexed/stored/term vectors) can be
+    further optimized depending on needs.  See
+    http://svn.apache.org/viewvc/lucene/dev/trunk/solr/example/solr/conf/schema.xml?view=markup
+    for more Solr specific info.
+    See 
+    http://gora.apache.org/current/gora-solr.html 
+    for gora-solr specific documentation.
+-->
+
+<schema name="nutch_gora_host_solr" version="1.5">
+
+  <types>
+
+    <!-- The StrField type is not analyzed, but indexed/stored verbatim. -->
+    <fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
+
+
+    <!--
+      Default numeric field types. For faster range queries, consider the tint/tfloat/tlong/tdouble types.
+    -->
+    <fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+    <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+    <fieldType name="long" class="solr.TrieLongField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+    <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+
+    <!--
+     Numeric field types that index each value at various levels of precision
+     to accelerate range queries when the number of values between the range
+     endpoints is large. See the javadoc for NumericRangeQuery for internal
+     implementation details.
+
+     Smaller precisionStep values (specified in bits) will lead to more tokens
+     indexed per value, slightly larger index size, and faster range queries.
+     A precisionStep of 0 disables indexing at different precision levels.
+    -->
+    <fieldType name="tint" class="solr.TrieIntField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+    <fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+    <fieldType name="tlong" class="solr.TrieLongField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+    <fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+
+    <!-- The format for this date field is of the form 1995-12-31T23:59:59Z, and
+         is a more restricted form of the canonical representation of dateTime
+         http://www.w3.org/TR/xmlschema-2/#dateTime    
+         The trailing "Z" designates UTC time and is mandatory.
+         Optional fractional seconds are allowed: 1995-12-31T23:59:59.999Z
+         All other components are mandatory.
+
+         Expressions can also be used to denote calculations that should be
+         performed relative to "NOW" to determine the value, e.g.
+
+               NOW/HOUR
+                  ... Round to the start of the current hour
+               NOW-1DAY
+                  ... Exactly 1 day prior to now
+               NOW/DAY+6MONTHS+3DAYS
+                  ... 6 months and 3 days in the future from the start of
+                      the current day
+                      
+         Consult the DateField javadocs for more information.
+
+         Note: For faster range queries, consider the tdate type
+      -->
+    <fieldType name="date" class="solr.TrieDateField" omitNorms="true" precisionStep="0" positionIncrementGap="0"/>
+
+    <!-- A Trie based date field for faster date range queries and date faceting. -->
+    <fieldType name="tdate" class="solr.TrieDateField" omitNorms="true" precisionStep="6" positionIncrementGap="0"/>
+
+
+    <!-- solr.TextField allows the specification of custom text analyzers
+         specified as a tokenizer and a list of token filters. Different
+         analyzers may be specified for indexing and querying.
+
+         The optional positionIncrementGap puts space between multiple fields of
+         this type on the same document, with the purpose of preventing false phrase
+         matching across fields.
+
+         For more info on customizing your analyzer chain, please see
+         http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters
+     -->
+
+    <!-- A general text field that has reasonable, generic
+         cross-language defaults: it tokenizes with StandardTokenizer,
+     removes stop words from case-insensitive "stopwords.txt"
+     (empty by default), and down cases.  At query time only, it
+     also applies synonyms. -->
+    <fieldType name="text_general" class="solr.TextField" positionIncrementGap="100">
+      <analyzer type="index">
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
+        <!-- in this example, we will only use synonyms at query time
+        <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
+        -->
+        <filter class="solr.LowerCaseFilterFactory"/>
+      </analyzer>
+      <analyzer type="query">
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
+        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+      </analyzer>
+    </fieldType>
+
+    <!-- A text field with defaults appropriate for English: it
+         tokenizes with StandardTokenizer, removes English stop words
+         (stopwords.txt), down cases, protects words from protwords.txt, and
+         finally applies Porter's stemming.  The query time analyzer
+         also applies synonyms from synonyms.txt. -->
+    <fieldType name="text_en" class="solr.TextField" positionIncrementGap="100">
+      <analyzer type="index">
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <!-- in this example, we will only use synonyms at query time
+        <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
+        -->
+        <!-- Case insensitive stop word removal.
+          add enablePositionIncrements=true in both the index and query
+          analyzers to leave a 'gap' for more accurate phrase queries.
+        -->
+        <filter class="solr.StopFilterFactory"
+                ignoreCase="true"
+                words="stopwords.txt"
+                enablePositionIncrements="true"
+                />
+        <filter class="solr.LowerCaseFilterFactory"/>
+    <filter class="solr.EnglishPossessiveFilterFactory"/>
+        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+    <!-- Optionally you may want to use this less aggressive stemmer instead of PorterStemFilterFactory:
+        <filter class="solr.EnglishMinimalStemFilterFactory"/>
+    -->
+        <filter class="solr.PorterStemFilterFactory"/>
+      </analyzer>
+      <analyzer type="query">
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+        <filter class="solr.StopFilterFactory"
+                ignoreCase="true"
+                words="stopwords.txt"
+                enablePositionIncrements="true"
+                />
+        <filter class="solr.LowerCaseFilterFactory"/>
+    <filter class="solr.EnglishPossessiveFilterFactory"/>
+        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+    <!-- Optionally you may want to use this less aggressive stemmer instead of PorterStemFilterFactory:
+        <filter class="solr.EnglishMinimalStemFilterFactory"/>
+    -->
+        <filter class="solr.PorterStemFilterFactory"/>
+      </analyzer>
+    </fieldType>
+
+    <!-- A text field with defaults appropriate for English, plus
+     aggressive word-splitting and autophrase features enabled.
+     This field is just like text_en, except it adds
+     WordDelimiterFilter to enable splitting and matching of
+     words on case-change, alpha numeric boundaries, and
+     non-alphanumeric chars.  This means certain compound word
+     cases will work, for example query "wi fi" will match
+     document "WiFi" or "wi-fi".  However, other cases will still
+     not match, for example if the query is "wifi" and the
+     document is "wi fi" or if the query is "wi-fi" and the
+     document is "wifi".
+        -->
+    <fieldType name="text_en_splitting" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
+      <analyzer type="index">
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <!-- in this example, we will only use synonyms at query time
+        <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
+        -->
+        <!-- Case insensitive stop word removal.
+          add enablePositionIncrements=true in both the index and query
+          analyzers to leave a 'gap' for more accurate phrase queries.
+        -->
+        <filter class="solr.StopFilterFactory"
+                ignoreCase="true"
+                words="stopwords.txt"
+                enablePositionIncrements="true"
+                />
+        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+        <filter class="solr.PorterStemFilterFactory"/>
+      </analyzer>
+      <analyzer type="query">
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+        <filter class="solr.StopFilterFactory"
+                ignoreCase="true"
+                words="stopwords.txt"
+                enablePositionIncrements="true"
+                />
+        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+        <filter class="solr.PorterStemFilterFactory"/>
+      </analyzer>
+    </fieldType>
+
+    <!-- Less flexible matching, but fewer false matches.  Probably not ideal for product names,
+         but may be good for SKUs.  Can insert dashes in the wrong place and still match. -->
+    <fieldType name="text_en_splitting_tight" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
+      <analyzer>
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
+        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+        <filter class="solr.EnglishMinimalStemFilterFactory"/>
+        <!-- this filter can remove any duplicate tokens that appear at the same position - sometimes
+             possible with WordDelimiterFilter in conjunction with stemming. -->
+        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
+      </analyzer>
+    </fieldType>
+
+    <!-- Just like text_general except it reverses the characters of
+     each token, to enable more efficient leading wildcard queries. -->
+    <fieldType name="text_general_rev" class="solr.TextField" positionIncrementGap="100">
+      <analyzer type="index">
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.ReversedWildcardFilterFactory" withOriginal="true"
+           maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/>
+      </analyzer>
+      <analyzer type="query">
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
+        <filter class="solr.LowerCaseFilterFactory"/>
+      </analyzer>
+    </fieldType>
+
+    <fieldtype name="phonetic" stored="false" indexed="true" class="solr.TextField" >
+      <analyzer>
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/>
+      </analyzer>
+    </fieldtype>
+
+    <fieldtype name="payloads" stored="false" indexed="true" class="solr.TextField" >
+      <analyzer>
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <!--
+        The DelimitedPayloadTokenFilter can put payloads on tokens... for example,
+        a token of "foo|1.4"  would be indexed as "foo" with a payload of 1.4f
+        Attributes of the DelimitedPayloadTokenFilterFactory : 
+         "delimiter" - a one character delimiter. Default is | (pipe)
+     "encoder" - how to encode the following value into a payload
+        float -> org.apache.lucene.analysis.payloads.FloatEncoder,
+        integer -> o.a.l.a.p.IntegerEncoder
+        identity -> o.a.l.a.p.IdentityEncoder
+            Fully Qualified class name implementing PayloadEncoder, Encoder must have a no arg constructor.
+         -->
+        <filter class="solr.DelimitedPayloadTokenFilterFactory" encoder="float"/>
+      </analyzer>
+    </fieldtype>
+
+    <!-- lowercases the entire field value, keeping it as a single token.  -->
+    <fieldType name="lowercase" class="solr.TextField" positionIncrementGap="100">
+      <analyzer>
+        <tokenizer class="solr.KeywordTokenizerFactory"/>
+        <filter class="solr.LowerCaseFilterFactory" />
+      </analyzer>
+    </fieldType>
+
+    <fieldType name="url" class="solr.TextField" positionIncrementGap="100">
+      <analyzer>
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+           <filter class="solr.LowerCaseFilterFactory"/>
+           <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1"/>
+      </analyzer>
+    </fieldType>
+
+
+    <fieldType name="text_path" class="solr.TextField" positionIncrementGap="100">
+      <analyzer>
+        <tokenizer class="solr.PathHierarchyTokenizerFactory"/>
+      </analyzer>
+    </fieldType>
+
+    <!-- since fields of this type are by default not stored or indexed,
+         any data added to them will be ignored outright.  --> 
+    <fieldtype name="ignored" stored="false" indexed="false" multiValued="true" class="solr.StrField" />
+
+ </types>
+
+ <fields>
+    <!-- This field is used internally by Solr, for example by features 
+    like partial update functionality and update log. It is NOT required
+    if updateLog is turned off in your updateHandler; however, it is advised
+    to include it anyway, as the performance gain from omitting it is minimal. -->
+    <field name="_version_" type="long" indexed="true" stored="true"/>
+    
+    <field name="metadata" type="string" stored="true" indexed="true" required="true"/>
+
+    <!-- fetch fields -->
+    <field name="inlinks" type="string" stored="true" indexed="false"/>
+    <field name="outlinks" type="string" stored="true" indexed="false"/>
+
+ </fields>
+ <uniqueKey>metadata</uniqueKey>
+ <defaultSearchField>text</defaultSearchField>
+ <solrQueryParser defaultOperator="OR"/>
+
+  <!-- copyField commands copy one field to another at the time a document
+        is added to the index.  It's used either to index the same field differently,
+        or to add multiple fields to the same field for easier/faster searching.  -->
+
+ <!--  copyField source="" dest=""/-->
+
+</schema>

Added: nutch/branches/2.x/conf/gora-solr-mapping.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/gora-solr-mapping.xml?rev=1635988&view=auto
==============================================================================
--- nutch/branches/2.x/conf/gora-solr-mapping.xml (added)
+++ nutch/branches/2.x/conf/gora-solr-mapping.xml Sat Nov  1 16:42:32 2014
@@ -0,0 +1,61 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+<!--
+  More information on gora-solr configuration and mappings can be found
+  at http://gora.apache.org/current/gora-solr.html
+-->
+<gora-otd>
+
+    <class name="org.apache.nutch.storage.WebPage" keyClass="java.lang.String" table="webpage">
+        <!-- fetch fields -->
+        <primarykey column="baseUrl"/>
+        <field name="status" column="status"/>
+        <field name="prevFetchTime" column="prevFetchTime" />
+        <field name="fetchTime" column="fetchTime" />
+        <field name="fetchInterval" column="fetchInterval"/>
+        <field name="retriesSinceFetch" column="retriesSinceFetch"/>
+        <field name="reprUrl" column="reprUrl"/>
+        <field name="content" column="content"/>
+        <field name="contentType" column="contentType"/>
+        <field name="modifiedTime" column="modifiedTime"/>
+        <field name="prevModifiedTime" column="prevModifiedTime" />
+        <field name="batchId" column="batchId" />
+        
+        <!-- parse fields -->
+        <field name="title" column="title" />
+        <field name="text" column="text" />
+        <field name="signature" column="signature" />
+        <field name="prevSignature" column="prevSignature"/>
+        <!-- score fields -->
+        <field name="score" column="score"/>
+        <field name="headers" column="headers"/>
+        <field name="inlinks" column="inlinks" />
+        <field name="outlinks" column="outlinks"/>
+        <field name="metadata" column="metadata" />
+        <field name="markers" column="markers" />
+        <field name="parseStatus" column="parseStatus"/>
+        <field name="protocolStatus" column="protocolStatus" />
+    </class>
+    
+    <class name="org.apache.nutch.storage.Host" keyClass="java.lang.String" table="host">
+        <field name="metadata" column="metadata" />
+        <field name="inlinks" column="inlinks" />
+        <field name="outlinks" column="outlinks" />
+    </class>
+</gora-otd>
+

Added: nutch/branches/2.x/conf/gora-solr-webpage-schema.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/gora-solr-webpage-schema.xml?rev=1635988&view=auto
==============================================================================
--- nutch/branches/2.x/conf/gora-solr-webpage-schema.xml (added)
+++ nutch/branches/2.x/conf/gora-solr-webpage-schema.xml Sat Nov  1 16:42:32 2014
@@ -0,0 +1,359 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!--
+    Description: 
+    This document contains Solr 4.x schema definition to enable Gora-Solr 
+    integration currently built into Nutch 2.3 onwards.
+    This schema is not minimal: some useful field type definitions are left in,
+    and the set of fields and their flags (indexed/stored/term vectors) can be
+    further optimized depending on needs.  See
+    http://svn.apache.org/viewvc/lucene/dev/trunk/solr/example/solr/conf/schema.xml?view=markup
+    for more Solr specific info.
+    See 
+    http://gora.apache.org/current/gora-solr.html 
+    for gora-solr specific documentation.
+-->
+
+<schema name="nutch_gora_webpage_solr" version="1.5">
+
+  <types>
+
+    <!-- The StrField type is not analyzed, but indexed/stored verbatim. -->
+    <fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
+
+
+    <!--
+      Default numeric field types. For faster range queries, consider the tint/tfloat/tlong/tdouble types.
+    -->
+    <fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+    <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+    <fieldType name="long" class="solr.TrieLongField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+    <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+
+    <!--
+     Numeric field types that index each value at various levels of precision
+     to accelerate range queries when the number of values between the range
+     endpoints is large. See the javadoc for NumericRangeQuery for internal
+     implementation details.
+
+     Smaller precisionStep values (specified in bits) will lead to more tokens
+     indexed per value, slightly larger index size, and faster range queries.
+     A precisionStep of 0 disables indexing at different precision levels.
+    -->
+    <fieldType name="tint" class="solr.TrieIntField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+    <fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+    <fieldType name="tlong" class="solr.TrieLongField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+    <fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+
+    <!-- The format for this date field is of the form 1995-12-31T23:59:59Z, and
+         is a more restricted form of the canonical representation of dateTime
+         http://www.w3.org/TR/xmlschema-2/#dateTime    
+         The trailing "Z" designates UTC time and is mandatory.
+         Optional fractional seconds are allowed: 1995-12-31T23:59:59.999Z
+         All other components are mandatory.
+
+         Expressions can also be used to denote calculations that should be
+         performed relative to "NOW" to determine the value, ie...
+
+               NOW/HOUR
+                  ... Round to the start of the current hour
+               NOW-1DAY
+                  ... Exactly 1 day prior to now
+               NOW/DAY+6MONTHS+3DAYS
+                  ... 6 months and 3 days in the future from the start of
+                      the current day
+                      
+         Consult the DateField javadocs for more information.
+
+         Note: For faster range queries, consider the tdate type
+      -->
+    <fieldType name="date" class="solr.TrieDateField" omitNorms="true" precisionStep="0" positionIncrementGap="0"/>
+
+    <!-- A Trie based date field for faster date range queries and date faceting. -->
+    <fieldType name="tdate" class="solr.TrieDateField" omitNorms="true" precisionStep="6" positionIncrementGap="0"/>
+
+
+    <!-- solr.TextField allows the specification of custom text analyzers
+         specified as a tokenizer and a list of token filters. Different
+         analyzers may be specified for indexing and querying.
+
+         The optional positionIncrementGap puts space between multiple fields of
+         this type on the same document, with the purpose of preventing false phrase
+         matching across fields.
+
+         For more info on customizing your analyzer chain, please see
+         http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters
+     -->
+
+    <!-- A general text field that has reasonable, generic
+         cross-language defaults: it tokenizes with StandardTokenizer,
+     removes stop words from case-insensitive "stopwords.txt"
+     (empty by default), and down cases.  At query time only, it
+     also applies synonyms. -->
+    <fieldType name="text_general" class="solr.TextField" positionIncrementGap="100">
+      <analyzer type="index">
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
+        <!-- in this example, we will only use synonyms at query time
+        <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
+        -->
+        <filter class="solr.LowerCaseFilterFactory"/>
+      </analyzer>
+      <analyzer type="query">
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
+        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+      </analyzer>
+    </fieldType>
+
+    <!-- A text field with defaults appropriate for English: it
+         tokenizes with StandardTokenizer, removes English stop words
+         (stopwords.txt), down cases, protects words from protwords.txt, and
+         finally applies Porter's stemming.  The query time analyzer
+         also applies synonyms from synonyms.txt. -->
+    <fieldType name="text_en" class="solr.TextField" positionIncrementGap="100">
+      <analyzer type="index">
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <!-- in this example, we will only use synonyms at query time
+        <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
+        -->
+        <!-- Case insensitive stop word removal.
+          add enablePositionIncrements=true in both the index and query
+          analyzers to leave a 'gap' for more accurate phrase queries.
+        -->
+        <filter class="solr.StopFilterFactory"
+                ignoreCase="true"
+                words="stopwords.txt"
+                enablePositionIncrements="true"
+                />
+        <filter class="solr.LowerCaseFilterFactory"/>
+    <filter class="solr.EnglishPossessiveFilterFactory"/>
+        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+    <!-- Optionally you may want to use this less aggressive stemmer instead of PorterStemFilterFactory:
+        <filter class="solr.EnglishMinimalStemFilterFactory"/>
+    -->
+        <filter class="solr.PorterStemFilterFactory"/>
+      </analyzer>
+      <analyzer type="query">
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+        <filter class="solr.StopFilterFactory"
+                ignoreCase="true"
+                words="stopwords.txt"
+                enablePositionIncrements="true"
+                />
+        <filter class="solr.LowerCaseFilterFactory"/>
+    <filter class="solr.EnglishPossessiveFilterFactory"/>
+        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+    <!-- Optionally you may want to use this less aggressive stemmer instead of PorterStemFilterFactory:
+        <filter class="solr.EnglishMinimalStemFilterFactory"/>
+    -->
+        <filter class="solr.PorterStemFilterFactory"/>
+      </analyzer>
+    </fieldType>
+
+    <!-- A text field with defaults appropriate for English, plus
+     aggressive word-splitting and autophrase features enabled.
+     This field is just like text_en, except it adds
+     WordDelimiterFilter to enable splitting and matching of
+     words on case-change, alpha numeric boundaries, and
+     non-alphanumeric chars.  This means certain compound word
+     cases will work, for example query "wi fi" will match
+     document "WiFi" or "wi-fi".  However, other cases will still
+     not match, for example if the query is "wifi" and the
+     document is "wi fi" or if the query is "wi-fi" and the
+     document is "wifi".
+        -->
+    <fieldType name="text_en_splitting" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
+      <analyzer type="index">
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <!-- in this example, we will only use synonyms at query time
+        <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
+        -->
+        <!-- Case insensitive stop word removal.
+          add enablePositionIncrements=true in both the index and query
+          analyzers to leave a 'gap' for more accurate phrase queries.
+        -->
+        <filter class="solr.StopFilterFactory"
+                ignoreCase="true"
+                words="stopwords.txt"
+                enablePositionIncrements="true"
+                />
+        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+        <filter class="solr.PorterStemFilterFactory"/>
+      </analyzer>
+      <analyzer type="query">
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+        <filter class="solr.StopFilterFactory"
+                ignoreCase="true"
+                words="stopwords.txt"
+                enablePositionIncrements="true"
+                />
+        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+        <filter class="solr.PorterStemFilterFactory"/>
+      </analyzer>
+    </fieldType>
+
+    <!-- Less flexible matching, but fewer false matches.  Probably not ideal for product names,
+         but may be good for SKUs.  Can insert dashes in the wrong place and still match. -->
+    <fieldType name="text_en_splitting_tight" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
+      <analyzer>
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
+        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+        <filter class="solr.EnglishMinimalStemFilterFactory"/>
+        <!-- this filter can remove any duplicate tokens that appear at the same position - sometimes
+             possible with WordDelimiterFilter in conjunction with stemming. -->
+        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
+      </analyzer>
+    </fieldType>
+
+    <!-- Just like text_general except it reverses the characters of
+     each token, to enable more efficient leading wildcard queries. -->
+    <fieldType name="text_general_rev" class="solr.TextField" positionIncrementGap="100">
+      <analyzer type="index">
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.ReversedWildcardFilterFactory" withOriginal="true"
+           maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/>
+      </analyzer>
+      <analyzer type="query">
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
+        <filter class="solr.LowerCaseFilterFactory"/>
+      </analyzer>
+    </fieldType>
+
+    <fieldtype name="phonetic" stored="false" indexed="true" class="solr.TextField" >
+      <analyzer>
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/>
+      </analyzer>
+    </fieldtype>
+
+    <fieldtype name="payloads" stored="false" indexed="true" class="solr.TextField" >
+      <analyzer>
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <!--
+        The DelimitedPayloadTokenFilter can put payloads on tokens... for example,
+        a token of "foo|1.4"  would be indexed as "foo" with a payload of 1.4f
+        Attributes of the DelimitedPayloadTokenFilterFactory : 
+         "delimiter" - a one character delimiter. Default is | (pipe)
+     "encoder" - how to encode the following value into a payload
+        float -> org.apache.lucene.analysis.payloads.FloatEncoder,
+        integer -> o.a.l.a.p.IntegerEncoder
+        identity -> o.a.l.a.p.IdentityEncoder
+            Fully Qualified class name implementing PayloadEncoder, Encoder must have a no arg constructor.
+         -->
+        <filter class="solr.DelimitedPayloadTokenFilterFactory" encoder="float"/>
+      </analyzer>
+    </fieldtype>
+
+    <!-- lowercases the entire field value, keeping it as a single token.  -->
+    <fieldType name="lowercase" class="solr.TextField" positionIncrementGap="100">
+      <analyzer>
+        <tokenizer class="solr.KeywordTokenizerFactory"/>
+        <filter class="solr.LowerCaseFilterFactory" />
+      </analyzer>
+    </fieldType>
+
+    <fieldType name="url" class="solr.TextField" positionIncrementGap="100">
+      <analyzer>
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+           <filter class="solr.LowerCaseFilterFactory"/>
+           <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1"/>
+      </analyzer>
+    </fieldType>
+
+
+    <fieldType name="text_path" class="solr.TextField" positionIncrementGap="100">
+      <analyzer>
+        <tokenizer class="solr.PathHierarchyTokenizerFactory"/>
+      </analyzer>
+    </fieldType>
+
+    <!-- since fields of this type are by default not stored or indexed,
+         any data added to them will be ignored outright.  --> 
+    <fieldtype name="ignored" stored="false" indexed="false" multiValued="true" class="solr.StrField" />
+
+ </types>
+
+ <fields>
+    <!-- This field is used internally by Solr, for example by features 
+    like partial update functionality and update log. It is NOT required
+    if updateLog is turned off in your updateHandler; keeping it is still
+    advised, since removing it yields only a minimal performance gain. -->
+    <field name="_version_" type="long" indexed="true" stored="true"/>
+    
+    <field name="baseUrl" type="string" stored="true" indexed="true" required="true"/>
+
+    <!-- fetch fields (each field name may be declared only once) -->
+    <field name="status" type="string" stored="true" indexed="false"/>
+    <field name="prevFetchTime" type="string" stored="true" indexed="false"/>
+    <field name="fetchTime" type="float" stored="true" indexed="false"/>
+    <field name="fetchInterval" type="url" stored="false" indexed="true"/>
+    <field name="retriesSinceFetch" type="url" stored="true" indexed="true"/>
+    <field name="contentType" type="url" stored="true" indexed="true" />
+    <field name="reprUrl" type="url" stored="true" indexed="true" />
+    <field name="signature" type="url" stored="true" indexed="true" />
+    <field name="prevSignature" type="url" stored="true" indexed="true" />
+    <field name="modifiedTime" type="url" stored="true" indexed="true" />
+    <field name="prevModifiedTime" type="url" stored="true" indexed="true" />
+    <field name="batchId" type="url" stored="true" indexed="true" />
+
+    <!-- stored=true for highlighting, use term vectors  and positions for fast highlighting -->
+    <field name="content" type="text_general" stored="true" indexed="true"/>
+    <field name="title" type="text_general" stored="true" indexed="true" multiValued="true"/>
+    <field name="text" type="text_general" stored="false" indexed="true" multiValued="true"/>
+
+     <!-- score fields -->
+     <field name="score" type="url" stored="true" indexed="true" />
+     <field name="headers" type="url" stored="true" indexed="true" />
+     <field name="inlinks" type="url" stored="true" indexed="true" />
+     <field name="outlinks" type="url" stored="true" indexed="true" />
+     <field name="metadata" type="url" stored="true" indexed="true" />
+     <field name="markers" type="url" stored="true" indexed="true" />
+     <field name="parseStatus" type="url" stored="true" indexed="true" />
+     <field name="protocolStatus" type="url" stored="true" indexed="true" />
+
+ </fields>
+ <uniqueKey>baseUrl</uniqueKey>
+ <defaultSearchField>text</defaultSearchField>
+ <solrQueryParser defaultOperator="OR"/>
+
+  <!-- copyField commands copy one field to another at the time a document
+        is added to the index.  Here content, baseUrl and title are funnelled into
+        the index-only, multiValued catch-all "text" field used as defaultSearchField.  -->
+
+ <copyField source="content" dest="text"/>
+ <copyField source="baseUrl" dest="text"/>
+ <copyField source="title" dest="text"/>
+
+
+</schema>

Modified: nutch/branches/2.x/conf/gora.properties
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/gora.properties?rev=1635988&r1=1635987&r2=1635988&view=diff
==============================================================================
--- nutch/branches/2.x/conf/gora.properties (original)
+++ nutch/branches/2.x/conf/gora.properties Sat Nov  1 16:42:32 2014
@@ -13,6 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# MORE DOCUMENTATION ON ALL OF THESE PROPERTIES CAN BE FOUND AT
+# http://gora.apache.org/current/
+
 #gora.datastore.default=org.apache.gora.mock.store.MockDataStore
 #gora.datastore.autocreateschema=true
 
@@ -83,3 +86,25 @@
 #gora.datastore.accumulo.user=root
 #gora.datastore.accumulo.password=secret
 
+############################
+# SolrStore properties     #
+############################
+#gora.datastore.default=org.apache.gora.solr.store.SolrStore
+#gora.solrstore.solr.url=http://localhost:9876/solr
+#gora.solrstore.solr.config=solrconfig.xml
+#gora.solrstore.solr.schema=gora-solr-schema.xml
+#gora.solrstore.solr.batchSize=100
+#gora.solrstore.solr.solrjserver=http
+#gora.solrstore.solr.commitWithin=1000
+#gora.solrstore.solr.resultsSize=100
+
+############################
+# MongoDBStore properties  #
+############################
+#gora.datastore.default=org.apache.gora.mongodb.store.MongoStore
+#gora.mongodb.override_hadoop_configuration=false
+#gora.mongodb.mapping.file=/gora-mongodb-mapping.xml
+#gora.mongodb.servers=localhost:27017
+#gora.mongodb.db=mytestdatabase
+#gora.mongodb.login=login
+#gora.mongodb.secret=secret

Modified: nutch/branches/2.x/ivy/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/ivy/ivy.xml?rev=1635988&r1=1635987&r2=1635988&view=diff
==============================================================================
--- nutch/branches/2.x/ivy/ivy.xml (original)
+++ nutch/branches/2.x/ivy/ivy.xml Sat Nov  1 16:42:32 2014
@@ -103,7 +103,7 @@
     <!-- N.B. To use Gora SNAPSHOT's merely replace the 'ref' value with the SNAPSHOT version 
     and add changing="true" alongside the dependency declaration. An example has been
     provided for the gora-core dependency as below -->
-    <dependency org="org.apache.gora" name="gora-core" rev="0.4" conf="*->default"/>
+    <dependency org="org.apache.gora" name="gora-core" rev="0.5" conf="*->default"/>
     
     <!-- Uncomment this to use SQL as Gora backend. It should be noted that the 
     gora-sql 0.1.1-incubating artifact is NOT compatable with gora-core 0.3. Users should 
@@ -116,35 +116,43 @@
     <dependency org="mysql" name="mysql-connector-java" rev="5.1.18" conf="*->default"/> 
     -->
     <!-- Uncomment this to use HBase as Gora backend. -->
-    <!--
-    <dependency org="org.apache.gora" name="gora-hbase" rev="0.4" conf="*->default" />
+    <!--     
+    <dependency org="org.apache.gora" name="gora-hbase" rev="0.5" conf="*->default" /> 
     -->
     <!-- Uncomment this to use Accumulo as Gora backend. -->
     <!--
-    <dependency org="org.apache.gora" name="gora-accumulo" rev="0.4" conf="*->default" />
+    <dependency org="org.apache.gora" name="gora-accumulo" rev="0.5" conf="*->default" />
     -->
     <!-- Uncomment this to use Cassandra as Gora backend. -->
     <!-- 
-    <dependency org="org.apache.gora" name="gora-cassandra" rev="0.4" conf="*->default" />
+    <dependency org="org.apache.gora" name="gora-cassandra" rev="0.5" conf="*->default" />
     -->
-    
-	<!-- web app dependencies -->
+    <!-- Uncomment this to use MongoDB as Gora backend. -->
+    <!--
+    <dependency org="org.apache.gora" name="gora-mongodb" rev="0.5" conf="*->default" />
+    -->    
+    <!-- Uncomment this to use Solr as Gora backend. -->
+    <!--
+    <dependency org="org.apache.gora" name="gora-solr" rev="0.5" conf="*->default" />
+    -->
+
+    <!-- web app dependencies -->
 
-	<dependency org="org.apache.commons" name="commons-collections4" rev="4.0" conf="*->default" />
-	<dependency org="org.springframework" name="spring-core" rev="4.0.4.RELEASE" conf="*->default" />
-	<dependency org="org.springframework" name="spring-context" rev="4.0.4.RELEASE" conf="*->default" />
-	<dependency org="org.springframework" name="spring-web" rev="4.0.4.RELEASE" conf="*->default" />
+    <dependency org="org.apache.commons" name="commons-collections4" rev="4.0" conf="*->default" />
+    <dependency org="org.springframework" name="spring-core" rev="4.0.4.RELEASE" conf="*->default" />
+    <dependency org="org.springframework" name="spring-context" rev="4.0.4.RELEASE" conf="*->default" />
+    <dependency org="org.springframework" name="spring-web" rev="4.0.4.RELEASE" conf="*->default" />
 
-	<dependency org="com.sun.jersey" name="jersey-client" rev="1.8" conf="*->default" />
+    <dependency org="com.sun.jersey" name="jersey-client" rev="1.8" conf="*->default" />
 	
-	<dependency org="com.j256.ormlite" name="ormlite-jdbc" rev="4.48" conf="*->default" />
-	<dependency org="com.h2database" name="h2" rev="1.4.180" conf="*->default" />
-	<dependency org="org.eclipse.persistence" name="javax.persistence" rev="2.0.0" conf="*->default" />
+    <dependency org="com.j256.ormlite" name="ormlite-jdbc" rev="4.48" conf="*->default" />
+    <dependency org="com.h2database" name="h2" rev="1.4.180" conf="*->default" />
+    <dependency org="org.eclipse.persistence" name="javax.persistence" rev="2.0.0" conf="*->default" />
 	
-	<dependency org="org.apache.wicket" name="wicket-core" rev="6.16.0" conf="*->default" />
-	<dependency org="org.apache.wicket" name="wicket-spring" rev="6.16.0" conf="*->default" />
-	<dependency org="de.agilecoders.wicket" name="wicket-bootstrap-core" rev="0.9.2" conf="*->default" />
-	<dependency org="de.agilecoders.wicket" name="wicket-bootstrap-extensions" rev="0.9.2" conf="*->default" />
+    <dependency org="org.apache.wicket" name="wicket-core" rev="6.16.0" conf="*->default" />
+    <dependency org="org.apache.wicket" name="wicket-spring" rev="6.16.0" conf="*->default" />
+    <dependency org="de.agilecoders.wicket" name="wicket-bootstrap-core" rev="0.9.2" conf="*->default" />
+    <dependency org="de.agilecoders.wicket" name="wicket-bootstrap-extensions" rev="0.9.2" conf="*->default" />
 
     <!--global exclusion -->
     <exclude module="ant" />

Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/WebTableReader.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/WebTableReader.java?rev=1635988&r1=1635987&r2=1635988&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/WebTableReader.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/WebTableReader.java Sat Nov  1 16:42:32 2014
@@ -16,7 +16,6 @@
  ******************************************************************************/
 package org.apache.nutch.crawl;
 
-import org.apache.avro.util.Utf8;
 import org.apache.gora.mapreduce.GoraMapper;
 import org.apache.gora.query.Query;
 import org.apache.gora.query.Result;
@@ -371,7 +370,15 @@ public class WebTableReader extends Nutc
     sb.append("score:\t" + page.getScore()).append("\n");
 
     Map<CharSequence, CharSequence> markers = page.getMarkers();
-    sb.append("markers:\t" + markers).append("\n");
+    if (markers != null) {
+      Iterator<Entry<CharSequence, CharSequence>> iterator = markers.entrySet()
+          .iterator();
+      while (iterator.hasNext()) {
+        Entry<CharSequence, CharSequence> entry = iterator.next();
+        sb.append("marker " + entry.getKey().toString()).append(" : \t")
+            .append(entry.getValue()).append("\n");
+      }
+    }
     sb.append("reprUrl:\t" + page.getReprUrl()).append("\n");
     CharSequence batchId = page.getBatchId();
     if (batchId != null) {

Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java?rev=1635988&r1=1635987&r2=1635988&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java Sat Nov  1 16:42:32 2014
@@ -104,9 +104,9 @@ public class ParserChecker implements To
     ProtocolFactory factory = new ProtocolFactory(conf);
     Protocol protocol = factory.getProtocol(url);
     WebPage page = WebPage.newBuilder().build();
-    
+
     ProtocolOutput protocolOutput = protocol.getProtocolOutput(url, page);
-    
+
     if(!protocolOutput.getStatus().isSuccess()) {
       LOG.error("Fetch failed with protocol status: "
           + ProtocolStatusUtils.getName(protocolOutput.getStatus().getCode())
@@ -114,7 +114,7 @@ public class ParserChecker implements To
       return (-1);
     }
     Content content = protocolOutput.getContent();
-    
+
     if (content == null) {
       LOG.error("No content for " + url);
       return (-1);
@@ -145,10 +145,10 @@ public class ParserChecker implements To
       LOG.error("Problem with parse - check log");
       return (-1);
     }
-    
+
     // Calculate the signature
     byte[] signature = SignatureFactory.getSignature(getConf()).calculate(page);
-    
+
     if (LOG.isInfoEnabled()) {
       LOG.info("parsing: " + url);
       LOG.info("contentType: " + contentType);
@@ -167,7 +167,7 @@ public class ParserChecker implements To
       while (iterator.hasNext()) {
         Entry<CharSequence, ByteBuffer> entry = iterator.next();
         sb.append(entry.getKey().toString()).append(" : \t")
-            .append(Bytes.toString(entry.getValue())).append("\n");
+        .append(Bytes.toString(entry.getValue())).append("\n");
       }
       System.out.print(sb.toString());
     }
@@ -177,6 +177,21 @@ public class ParserChecker implements To
       sb.append("  outlink: ").append(l).append('\n');
     }
     System.out.print(sb.toString());
+    if (page.getHeaders() != null) {
+      LOG.info("---------\nHeaders\n---------\n");
+      Map<CharSequence, CharSequence> headers = page.getHeaders();
+      StringBuffer headersb = new StringBuffer();
+      // The outer check already guarantees a non-null headers map; printing
+      // the headers must not depend on the unrelated 'metadata' variable.
+      Iterator<Entry<CharSequence, CharSequence>> iterator = headers.entrySet()
+          .iterator();
+      while (iterator.hasNext()) {
+        Entry<CharSequence, CharSequence> entry = iterator.next();
+        headersb.append(entry.getKey().toString()).append(" : \t")
+            .append(entry.getValue()).append("\n");
+      }
+      System.out.print(headersb.toString());
+    }
     if (dumpText) {
       LOG.info("---------\nParseText\n---------\n");
       System.out.print(parse.getText());

Modified: nutch/branches/2.x/src/java/org/apache/nutch/storage/Host.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/storage/Host.java?rev=1635988&r1=1635987&r2=1635988&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/storage/Host.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/storage/Host.java Sat Nov  1 16:42:32 2014
@@ -1,3 +1,19 @@
+/*******************************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ ******************************************************************************/
 /**
  * Autogenerated by Avro
  * 
@@ -61,6 +77,14 @@ public class Host extends org.apache.gor
   "outlinks",
   "inlinks",
   };
+  
+  /**
+   * Reports how many field names are listed in {@code _ALL_FIELDS}.
+   * @return the length of the {@code _ALL_FIELDS} array
+   */
+  public int getFieldsCount() {
+    return _ALL_FIELDS.length;
+  }
 
   /** Bytes used to represent weather or not a field is dirty. */
   private java.nio.ByteBuffer __g__dirty = java.nio.ByteBuffer.wrap(new byte[1]);

Modified: nutch/branches/2.x/src/java/org/apache/nutch/storage/ParseStatus.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/storage/ParseStatus.java?rev=1635988&r1=1635987&r2=1635988&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/storage/ParseStatus.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/storage/ParseStatus.java Sat Nov  1 16:42:32 2014
@@ -1,3 +1,19 @@
+/*******************************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ ******************************************************************************/
 /**
  * Autogenerated by Avro
  * 
@@ -57,6 +73,14 @@ public class ParseStatus extends org.apa
   "minorCode",
   "args",
   };
+  
+  /**
+   * Returns the number of entries in the static {@code ParseStatus._ALL_FIELDS} array.
+   * @return the length of {@code ParseStatus._ALL_FIELDS}
+   */
+  public int getFieldsCount() {
+    return ParseStatus._ALL_FIELDS.length;
+  }
 
   /** Bytes used to represent weather or not a field is dirty. */
   private java.nio.ByteBuffer __g__dirty = java.nio.ByteBuffer.wrap(new byte[1]);

Modified: nutch/branches/2.x/src/java/org/apache/nutch/storage/ProtocolStatus.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/storage/ProtocolStatus.java?rev=1635988&r1=1635987&r2=1635988&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/storage/ProtocolStatus.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/storage/ProtocolStatus.java Sat Nov  1 16:42:32 2014
@@ -1,3 +1,19 @@
+/*******************************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ ******************************************************************************/
 /**
  * Autogenerated by Avro
  * 
@@ -61,6 +77,14 @@ public class ProtocolStatus extends org.
   "lastModified",
   };
 
+  /**
+   * Returns the number of entries in the static {@code ProtocolStatus._ALL_FIELDS} array.
+   * @return the length of {@code ProtocolStatus._ALL_FIELDS}
+   */
+  public int getFieldsCount() {
+    return ProtocolStatus._ALL_FIELDS.length;
+  }
+  
   /** Bytes used to represent weather or not a field is dirty. */
   private java.nio.ByteBuffer __g__dirty = java.nio.ByteBuffer.wrap(new byte[1]);
   private int code;

Modified: nutch/branches/2.x/src/java/org/apache/nutch/storage/WebPage.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/storage/WebPage.java?rev=1635988&r1=1635987&r2=1635988&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/storage/WebPage.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/storage/WebPage.java Sat Nov  1 16:42:32 2014
@@ -1,3 +1,19 @@
+/*******************************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ ******************************************************************************/
 /**
  * Autogenerated by Avro
  * 
@@ -100,6 +116,15 @@ public class WebPage extends org.apache.
   "batchId",
   };
 
+  /**
+   * Returns the number of entries in the static {@code WebPage._ALL_FIELDS} array.
+   * @return the length of {@code WebPage._ALL_FIELDS}
+   */
+  public int getFieldsCount() {
+    return WebPage._ALL_FIELDS.length;
+  }
+
+  
   /** Bytes used to represent weather or not a field is dirty. */
   private java.nio.ByteBuffer __g__dirty = java.nio.ByteBuffer.wrap(new byte[4]);
   private CharSequence baseUrl;