You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2014/11/01 17:42:32 UTC
svn commit: r1635988 - in /nutch/branches/2.x: ./ conf/ ivy/
src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/parse/
src/java/org/apache/nutch/storage/
Author: lewismc
Date: Sat Nov 1 16:42:32 2014
New Revision: 1635988
URL: http://svn.apache.org/r1635988
Log:
Upgrade to Gora 0.5
Added:
nutch/branches/2.x/conf/gora-mongodb-mapping.xml
nutch/branches/2.x/conf/gora-solr-host-schema.xml
nutch/branches/2.x/conf/gora-solr-mapping.xml
nutch/branches/2.x/conf/gora-solr-webpage-schema.xml
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/conf/gora.properties
nutch/branches/2.x/ivy/ivy.xml
nutch/branches/2.x/src/java/org/apache/nutch/crawl/WebTableReader.java
nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java
nutch/branches/2.x/src/java/org/apache/nutch/storage/Host.java
nutch/branches/2.x/src/java/org/apache/nutch/storage/ParseStatus.java
nutch/branches/2.x/src/java/org/apache/nutch/storage/ProtocolStatus.java
nutch/branches/2.x/src/java/org/apache/nutch/storage/WebPage.java
Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1635988&r1=1635987&r2=1635988&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Sat Nov 1 16:42:32 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Current Development 2.3-SNAPSHOT
+* NUTCH-1843 Upgrade to Gora 0.5 (talat, lewismc, Kiril Menshikov, drazzib)
+
* NUTCH-1883 bin/crawl: use function to run bin/nutch and check exit value (snagel)
* NUTCH-1882 ant eclipse target to add output path to src/test (snagel)
Added: nutch/branches/2.x/conf/gora-mongodb-mapping.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/gora-mongodb-mapping.xml?rev=1635988&view=auto
==============================================================================
--- nutch/branches/2.x/conf/gora-mongodb-mapping.xml (added)
+++ nutch/branches/2.x/conf/gora-mongodb-mapping.xml Sat Nov 1 16:42:32 2014
@@ -0,0 +1,61 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!--
+ More information on gora-mongodb configuration and mappings can be found
+ at http://gora.apache.org/current/gora-mongodb.html
+-->
+<gora-otd>
+
+ <class name="org.apache.nutch.storage.WebPage" keyClass="java.lang.String" document="webpage">
+ <!-- fetch fields -->
+ <field name="baseUrl" docfield="baseUrl" type="string"/>
+ <field name="status" docfield="status" type="int32"/>
+ <field name="prevFetchTime" docfield="prevFetchTime" type="int64"/>
+ <field name="fetchTime" docfield="fetchTime" type="int64"/>
+ <field name="fetchInterval" docfield="fetchInterval" type="int64"/>
+ <field name="retriesSinceFetch" docfield="retriesSinceFetch" type="int32"/>
+ <field name="reprUrl" docfield="reprUrl" type="string"/>
+ <field name="content" docfield="content" type="binary"/>
+ <field name="contentType" docfield="contentType" type="string"/>
+ <field name="modifiedTime" docfield="modifiedTime" type="int64"/>
+ <field name="prevModifiedTime" docfield="prevModifiedTime" type="int64"/>
+ <field name="batchId" docfield="batchId" type="string"/>
+
+ <!-- parse fields -->
+ <field name="title" docfield="title" type="string"/>
+ <field name="text" docfield="text" type="binary"/>
+ <field name="signature" docfield="signature" type="string"/>
+ <field name="prevSignature" docfield="prevSignature" type="string"/>
+ <!-- score fields -->
+ <field name="score" docfield="score" type="int32"/>
+ <field name="headers" docfield="headers" type="document"/>
+ <field name="inlinks" docfield="inlinks" type="document"/>
+ <field name="outlinks" docfield="outlinks" type="document"/>
+ <field name="metadata" docfield="metadata" type="document"/>
+ <field name="markers" docfield="markers" type="document"/>
+ <field name="parseStatus" docfield="parseStatus" type="document"/>
+ <field name="protocolStatus" docfield="protocolStatus" type="document"/>
+ </class>
+
+ <class name="org.apache.nutch.storage.Host" keyClass="java.lang.String" document="host">
+ <field name="metadata" docfield="metadata" type="document"/>
+ <field name="inlinks" docfield="inlinks" type="document"/>
+ <field name="outlinks" docfield="outlinks" type="document"/>
+ </class>
+
+</gora-otd>
Added: nutch/branches/2.x/conf/gora-solr-host-schema.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/gora-solr-host-schema.xml?rev=1635988&view=auto
==============================================================================
--- nutch/branches/2.x/conf/gora-solr-host-schema.xml (added)
+++ nutch/branches/2.x/conf/gora-solr-host-schema.xml Sat Nov 1 16:42:32 2014
@@ -0,0 +1,331 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!--
+ Description:
+ This document contains Solr 4.x schema definition to enable Gora-Solr
+ integration currently built into Nutch 2.3 onwards.
+ This schema is not minimal: some useful field type definitions have been left in,
+ and the set of fields and their flags (indexed/stored/term vectors) can be
+ further optimized depending on needs. See
+ http://svn.apache.org/viewvc/lucene/dev/trunk/solr/example/solr/conf/schema.xml?view=markup
+ for more Solr specific info.
+ See
+ http://gora.apache.org/current/gora-solr.html
+ for gora-solr specific documentation.
+-->
+
+<schema name="nutch_gora_host_solr" version="1.5">
+
+ <types>
+
+ <!-- The StrField type is not analyzed, but indexed/stored verbatim. -->
+ <fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
+
+
+ <!--
+ Default numeric field types. For faster range queries, consider the tint/tfloat/tlong/tdouble types.
+ -->
+ <fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="long" class="solr.TrieLongField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+
+ <!--
+ Numeric field types that index each value at various levels of precision
+ to accelerate range queries when the number of values between the range
+ endpoints is large. See the javadoc for NumericRangeQuery for internal
+ implementation details.
+
+ Smaller precisionStep values (specified in bits) will lead to more tokens
+ indexed per value, slightly larger index size, and faster range queries.
+ A precisionStep of 0 disables indexing at different precision levels.
+ -->
+ <fieldType name="tint" class="solr.TrieIntField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="tlong" class="solr.TrieLongField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+
+ <!-- The format for this date field is of the form 1995-12-31T23:59:59Z, and
+ is a more restricted form of the canonical representation of dateTime
+ http://www.w3.org/TR/xmlschema-2/#dateTime
+ The trailing "Z" designates UTC time and is mandatory.
+ Optional fractional seconds are allowed: 1995-12-31T23:59:59.999Z
+ All other components are mandatory.
+
+ Expressions can also be used to denote calculations that should be
+ performed relative to "NOW" to determine the value, e.g.:
+
+ NOW/HOUR
+ ... Round to the start of the current hour
+ NOW-1DAY
+ ... Exactly 1 day prior to now
+ NOW/DAY+6MONTHS+3DAYS
+ ... 6 months and 3 days in the future from the start of
+ the current day
+
+ Consult the DateField javadocs for more information.
+
+ Note: For faster range queries, consider the tdate type
+ -->
+ <fieldType name="date" class="solr.TrieDateField" omitNorms="true" precisionStep="0" positionIncrementGap="0"/>
+
+ <!-- A Trie based date field for faster date range queries and date faceting. -->
+ <fieldType name="tdate" class="solr.TrieDateField" omitNorms="true" precisionStep="6" positionIncrementGap="0"/>
+
+
+ <!-- solr.TextField allows the specification of custom text analyzers
+ specified as a tokenizer and a list of token filters. Different
+ analyzers may be specified for indexing and querying.
+
+ The optional positionIncrementGap puts space between multiple fields of
+ this type on the same document, with the purpose of preventing false phrase
+ matching across fields.
+
+ For more info on customizing your analyzer chain, please see
+ http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters
+ -->
+
+ <!-- A general text field that has reasonable, generic
+ cross-language defaults: it tokenizes with StandardTokenizer,
+ removes stop words from case-insensitive "stopwords.txt"
+ (empty by default), and down cases. At query time only, it
+ also applies synonyms. -->
+ <fieldType name="text_general" class="solr.TextField" positionIncrementGap="100">
+ <analyzer type="index">
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
+ <!-- in this example, we will only use synonyms at query time
+ <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
+ -->
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- A text field with defaults appropriate for English: it
+ tokenizes with StandardTokenizer, removes English stop words
+ (stopwords.txt), down cases, protects words from protwords.txt, and
+ finally applies Porter's stemming. The query time analyzer
+ also applies synonyms from synonyms.txt. -->
+ <fieldType name="text_en" class="solr.TextField" positionIncrementGap="100">
+ <analyzer type="index">
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <!-- in this example, we will only use synonyms at query time
+ <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
+ -->
+ <!-- Case insensitive stop word removal.
+ add enablePositionIncrements=true in both the index and query
+ analyzers to leave a 'gap' for more accurate phrase queries.
+ -->
+ <filter class="solr.StopFilterFactory"
+ ignoreCase="true"
+ words="stopwords.txt"
+ enablePositionIncrements="true"
+ />
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.EnglishPossessiveFilterFactory"/>
+ <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+ <!-- Optionally you may want to use this less aggressive stemmer instead of PorterStemFilterFactory:
+ <filter class="solr.EnglishMinimalStemFilterFactory"/>
+ -->
+ <filter class="solr.PorterStemFilterFactory"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+ <filter class="solr.StopFilterFactory"
+ ignoreCase="true"
+ words="stopwords.txt"
+ enablePositionIncrements="true"
+ />
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.EnglishPossessiveFilterFactory"/>
+ <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+ <!-- Optionally you may want to use this less aggressive stemmer instead of PorterStemFilterFactory:
+ <filter class="solr.EnglishMinimalStemFilterFactory"/>
+ -->
+ <filter class="solr.PorterStemFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- A text field with defaults appropriate for English, plus
+ aggressive word-splitting and autophrase features enabled.
+ This field is just like text_en, except it adds
+ WordDelimiterFilter to enable splitting and matching of
+ words on case-change, alpha numeric boundaries, and
+ non-alphanumeric chars. This means certain compound word
+ cases will work, for example query "wi fi" will match
+ document "WiFi" or "wi-fi". However, other cases will still
+ not match, for example if the query is "wifi" and the
+ document is "wi fi" or if the query is "wi-fi" and the
+ document is "wifi".
+ -->
+ <fieldType name="text_en_splitting" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
+ <analyzer type="index">
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <!-- in this example, we will only use synonyms at query time
+ <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
+ -->
+ <!-- Case insensitive stop word removal.
+ add enablePositionIncrements=true in both the index and query
+ analyzers to leave a 'gap' for more accurate phrase queries.
+ -->
+ <filter class="solr.StopFilterFactory"
+ ignoreCase="true"
+ words="stopwords.txt"
+ enablePositionIncrements="true"
+ />
+ <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+ <filter class="solr.PorterStemFilterFactory"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+ <filter class="solr.StopFilterFactory"
+ ignoreCase="true"
+ words="stopwords.txt"
+ enablePositionIncrements="true"
+ />
+ <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+ <filter class="solr.PorterStemFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- Less flexible matching, but fewer false matches. Probably not ideal for product names,
+ but may be good for SKUs. Can insert dashes in the wrong place and still match. -->
+ <fieldType name="text_en_splitting_tight" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
+ <analyzer>
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
+ <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+ <filter class="solr.EnglishMinimalStemFilterFactory"/>
+ <!-- this filter can remove any duplicate tokens that appear at the same position - sometimes
+ possible with WordDelimiterFilter in conjunction with stemming. -->
+ <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- Just like text_general except it reverses the characters of
+ each token, to enable more efficient leading wildcard queries. -->
+ <fieldType name="text_general_rev" class="solr.TextField" positionIncrementGap="100">
+ <analyzer type="index">
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.ReversedWildcardFilterFactory" withOriginal="true"
+ maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <fieldtype name="phonetic" stored="false" indexed="true" class="solr.TextField" >
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/>
+ </analyzer>
+ </fieldtype>
+
+ <fieldtype name="payloads" stored="false" indexed="true" class="solr.TextField" >
+ <analyzer>
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <!--
+ The DelimitedPayloadTokenFilter can put payloads on tokens... for example,
+ a token of "foo|1.4" would be indexed as "foo" with a payload of 1.4f
+ Attributes of the DelimitedPayloadTokenFilterFactory :
+ "delimiter" - a one character delimiter. Default is | (pipe)
+ "encoder" - how to encode the following value into a payload
+ float -> org.apache.lucene.analysis.payloads.FloatEncoder,
+ integer -> o.a.l.a.p.IntegerEncoder
+ identity -> o.a.l.a.p.IdentityEncoder
+ Fully Qualified class name implementing PayloadEncoder, Encoder must have a no arg constructor.
+ -->
+ <filter class="solr.DelimitedPayloadTokenFilterFactory" encoder="float"/>
+ </analyzer>
+ </fieldtype>
+
+ <!-- lowercases the entire field value, keeping it as a single token. -->
+ <fieldType name="lowercase" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.KeywordTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory" />
+ </analyzer>
+ </fieldType>
+
+ <fieldType name="url" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1"/>
+ </analyzer>
+ </fieldType>
+
+
+ <fieldType name="text_path" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.PathHierarchyTokenizerFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- since fields of this type are by default not stored or indexed,
+ any data added to them will be ignored outright. -->
+ <fieldtype name="ignored" stored="false" indexed="false" multiValued="true" class="solr.StrField" />
+
+ </types>
+
+ <fields>
+ <!-- This field is used internally by Solr, for example by features
+ like partial update functionality and update log. It is NOT required
+ if updateLog is turned off in your updateHandler; however, keeping it is
+ advised, as the performance gained by removing it is minimal. -->
+ <field name="_version_" type="long" indexed="true" stored="true"/>
+
+ <field name="metadata" type="string" stored="true" indexed="true" required="true"/>
+
+ <!-- fetch fields -->
+ <field name="inlinks" type="string" stored="true" indexed="false"/>
+ <field name="outlinks" type="string" stored="true" indexed="false"/>
+
+ </fields>
+ <uniqueKey>metadata</uniqueKey>
+ <defaultSearchField>text</defaultSearchField>
+ <solrQueryParser defaultOperator="OR"/>
+
+ <!-- copyField commands copy one field to another at the time a document
+ is added to the index. It's used either to index the same field differently,
+ or to add multiple fields to the same field for easier/faster searching. -->
+
+ <!-- copyField source="" dest=""/-->
+
+</schema>
Added: nutch/branches/2.x/conf/gora-solr-mapping.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/gora-solr-mapping.xml?rev=1635988&view=auto
==============================================================================
--- nutch/branches/2.x/conf/gora-solr-mapping.xml (added)
+++ nutch/branches/2.x/conf/gora-solr-mapping.xml Sat Nov 1 16:42:32 2014
@@ -0,0 +1,61 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!--
+ More information on gora-solr configuration and mappings can be found
+ at http://gora.apache.org/current/gora-solr.html
+-->
+<gora-otd>
+
+ <class name="org.apache.nutch.storage.WebPage" keyClass="java.lang.String" table="webpage">
+ <!-- fetch fields -->
+ <primarykey column="baseUrl"/>
+ <field name="status" column="status"/>
+ <field name="prevFetchTime" column="prevFetchTime" />
+ <field name="fetchTime" column="fetchTime" />
+ <field name="fetchInterval" column="fetchInterval"/>
+ <field name="retriesSinceFetch" column="retriesSinceFetch"/>
+ <field name="reprUrl" column="reprUrl"/>
+ <field name="content" column="content"/>
+ <field name="contentType" column="contentType"/>
+ <field name="modifiedTime" column="modifiedTime"/>
+ <field name="prevModifiedTime" column="prevModifiedTime" />
+ <field name="batchId" column="batchId" />
+
+ <!-- parse fields -->
+ <field name="title" column="title" />
+ <field name="text" column="text" />
+ <field name="signature" column="signature" />
+ <field name="prevSignature" column="prevSignature"/>
+ <!-- score fields -->
+ <field name="score" column="score"/>
+ <field name="headers" column="headers"/>
+ <field name="inlinks" column="inlinks" />
+ <field name="outlinks" column="outlinks"/>
+ <field name="metadata" column="metadata" />
+ <field name="markers" column="markers" />
+ <field name="parseStatus" column="parseStatus"/>
+ <field name="protocolStatus" column="protocolStatus" />
+ </class>
+
+ <class name="org.apache.nutch.storage.Host" keyClass="java.lang.String" table="host">
+ <field name="metadata" column="metadata" />
+ <field name="inlinks" column="inlinks" />
+ <field name="outlinks" column="outlinks" />
+ </class>
+</gora-otd>
+
Added: nutch/branches/2.x/conf/gora-solr-webpage-schema.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/gora-solr-webpage-schema.xml?rev=1635988&view=auto
==============================================================================
--- nutch/branches/2.x/conf/gora-solr-webpage-schema.xml (added)
+++ nutch/branches/2.x/conf/gora-solr-webpage-schema.xml Sat Nov 1 16:42:32 2014
@@ -0,0 +1,359 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!--
+ Description:
+ This document contains Solr 4.x schema definition to enable Gora-Solr
+ integration currently built into Nutch 2.3 onwards.
+ This schema is not minimal: some useful field type definitions have been left in,
+ and the set of fields and their flags (indexed/stored/term vectors) can be
+ further optimized depending on needs. See
+ http://svn.apache.org/viewvc/lucene/dev/trunk/solr/example/solr/conf/schema.xml?view=markup
+ for more Solr specific info.
+ See
+ http://gora.apache.org/current/gora-solr.html
+ for gora-solr specific documentation.
+-->
+
+<schema name="nutch_gora_webpage_solr" version="1.5">
+
+ <types>
+
+ <!-- The StrField type is not analyzed, but indexed/stored verbatim. -->
+ <fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
+
+
+ <!--
+ Default numeric field types. For faster range queries, consider the tint/tfloat/tlong/tdouble types.
+ -->
+ <fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="long" class="solr.TrieLongField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+
+ <!--
+ Numeric field types that index each value at various levels of precision
+ to accelerate range queries when the number of values between the range
+ endpoints is large. See the javadoc for NumericRangeQuery for internal
+ implementation details.
+
+ Smaller precisionStep values (specified in bits) will lead to more tokens
+ indexed per value, slightly larger index size, and faster range queries.
+ A precisionStep of 0 disables indexing at different precision levels.
+ -->
+ <fieldType name="tint" class="solr.TrieIntField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="tlong" class="solr.TrieLongField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+
+ <!-- The format for this date field is of the form 1995-12-31T23:59:59Z, and
+ is a more restricted form of the canonical representation of dateTime
+ http://www.w3.org/TR/xmlschema-2/#dateTime
+ The trailing "Z" designates UTC time and is mandatory.
+ Optional fractional seconds are allowed: 1995-12-31T23:59:59.999Z
+ All other components are mandatory.
+
+ Expressions can also be used to denote calculations that should be
+ performed relative to "NOW" to determine the value, e.g.:
+
+ NOW/HOUR
+ ... Round to the start of the current hour
+ NOW-1DAY
+ ... Exactly 1 day prior to now
+ NOW/DAY+6MONTHS+3DAYS
+ ... 6 months and 3 days in the future from the start of
+ the current day
+
+ Consult the DateField javadocs for more information.
+
+ Note: For faster range queries, consider the tdate type
+ -->
+ <fieldType name="date" class="solr.TrieDateField" omitNorms="true" precisionStep="0" positionIncrementGap="0"/>
+
+ <!-- A Trie based date field for faster date range queries and date faceting. -->
+ <fieldType name="tdate" class="solr.TrieDateField" omitNorms="true" precisionStep="6" positionIncrementGap="0"/>
+
+
+ <!-- solr.TextField allows the specification of custom text analyzers
+ specified as a tokenizer and a list of token filters. Different
+ analyzers may be specified for indexing and querying.
+
+ The optional positionIncrementGap puts space between multiple fields of
+ this type on the same document, with the purpose of preventing false phrase
+ matching across fields.
+
+ For more info on customizing your analyzer chain, please see
+ http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters
+ -->
+
+ <!-- A general text field that has reasonable, generic
+ cross-language defaults: it tokenizes with StandardTokenizer,
+ removes stop words from case-insensitive "stopwords.txt"
+ (empty by default), and down cases. At query time only, it
+ also applies synonyms. -->
+ <fieldType name="text_general" class="solr.TextField" positionIncrementGap="100">
+ <analyzer type="index">
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
+ <!-- in this example, we will only use synonyms at query time
+ <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
+ -->
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- A text field with defaults appropriate for English: it
+ tokenizes with StandardTokenizer, removes English stop words
+ (stopwords.txt), down cases, protects words from protwords.txt, and
+ finally applies Porter's stemming. The query time analyzer
+ also applies synonyms from synonyms.txt. -->
+ <fieldType name="text_en" class="solr.TextField" positionIncrementGap="100">
+ <analyzer type="index">
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <!-- in this example, we will only use synonyms at query time
+ <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
+ -->
+ <!-- Case insensitive stop word removal.
+ add enablePositionIncrements=true in both the index and query
+ analyzers to leave a 'gap' for more accurate phrase queries.
+ -->
+ <filter class="solr.StopFilterFactory"
+ ignoreCase="true"
+ words="stopwords.txt"
+ enablePositionIncrements="true"
+ />
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.EnglishPossessiveFilterFactory"/>
+ <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+ <!-- Optionally you may want to use this less aggressive stemmer instead of PorterStemFilterFactory:
+ <filter class="solr.EnglishMinimalStemFilterFactory"/>
+ -->
+ <filter class="solr.PorterStemFilterFactory"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+ <filter class="solr.StopFilterFactory"
+ ignoreCase="true"
+ words="stopwords.txt"
+ enablePositionIncrements="true"
+ />
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.EnglishPossessiveFilterFactory"/>
+ <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+ <!-- Optionally you may want to use this less aggressive stemmer instead of PorterStemFilterFactory:
+ <filter class="solr.EnglishMinimalStemFilterFactory"/>
+ -->
+ <filter class="solr.PorterStemFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- A text field with defaults appropriate for English, plus
+ aggressive word-splitting and autophrase features enabled.
+ This field is just like text_en, except it adds
+ WordDelimiterFilter to enable splitting and matching of
+ words on case-change, alpha numeric boundaries, and
+ non-alphanumeric chars. This means certain compound word
+ cases will work, for example query "wi fi" will match
+ document "WiFi" or "wi-fi". However, other cases will still
+ not match, for example if the query is "wifi" and the
+ document is "wi fi" or if the query is "wi-fi" and the
+ document is "wifi".
+ -->
+ <fieldType name="text_en_splitting" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
+ <analyzer type="index">
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <!-- in this example, we will only use synonyms at query time
+ <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
+ -->
+ <!-- Case insensitive stop word removal.
+ add enablePositionIncrements=true in both the index and query
+ analyzers to leave a 'gap' for more accurate phrase queries.
+ -->
+ <filter class="solr.StopFilterFactory"
+ ignoreCase="true"
+ words="stopwords.txt"
+ enablePositionIncrements="true"
+ />
+ <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+ <filter class="solr.PorterStemFilterFactory"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+ <filter class="solr.StopFilterFactory"
+ ignoreCase="true"
+ words="stopwords.txt"
+ enablePositionIncrements="true"
+ />
+ <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+ <filter class="solr.PorterStemFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- Less flexible matching, but fewer false matches. Probably not ideal for product names,
+ but may be good for SKUs. Can insert dashes in the wrong place and still match. -->
+ <fieldType name="text_en_splitting_tight" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
+ <analyzer>
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
+ <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+ <filter class="solr.EnglishMinimalStemFilterFactory"/>
+ <!-- this filter can remove any duplicate tokens that appear at the same position - sometimes
+ possible with WordDelimiterFilter in conjunction with stemming. -->
+ <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- Just like text_general except it reverses the characters of
+ each token, to enable more efficient leading wildcard queries. -->
+ <fieldType name="text_general_rev" class="solr.TextField" positionIncrementGap="100">
+ <analyzer type="index">
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.ReversedWildcardFilterFactory" withOriginal="true"
+ maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <fieldtype name="phonetic" stored="false" indexed="true" class="solr.TextField" >
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/>
+ </analyzer>
+ </fieldtype>
+
+ <fieldtype name="payloads" stored="false" indexed="true" class="solr.TextField" >
+ <analyzer>
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <!--
+ The DelimitedPayloadTokenFilter can put payloads on tokens... for example,
+ a token of "foo|1.4" would be indexed as "foo" with a payload of 1.4f
+ Attributes of the DelimitedPayloadTokenFilterFactory :
+ "delimiter" - a one character delimiter. Default is | (pipe)
+ "encoder" - how to encode the following value into a payload
+ float -> org.apache.lucene.analysis.payloads.FloatEncoder,
+ integer -> o.a.l.a.p.IntegerEncoder
+ identity -> o.a.l.a.p.IdentityEncoder
+ Fully Qualified class name implementing PayloadEncoder, Encoder must have a no arg constructor.
+ -->
+ <filter class="solr.DelimitedPayloadTokenFilterFactory" encoder="float"/>
+ </analyzer>
+ </fieldtype>
+
+ <!-- lowercases the entire field value, keeping it as a single token. -->
+ <fieldType name="lowercase" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.KeywordTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory" />
+ </analyzer>
+ </fieldType>
+
+ <fieldType name="url" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1"/>
+ </analyzer>
+ </fieldType>
+
+
+ <fieldType name="text_path" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.PathHierarchyTokenizerFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- since fields of this type are by default not stored or indexed,
+ any data added to them will be ignored outright. -->
+ <fieldtype name="ignored" stored="false" indexed="false" multiValued="true" class="solr.StrField" />
+
+ </types>
+
+ <fields>
+ <!-- This field is used internally by Solr, for example by features
+ like partial update functionality and update log. It is NOT required
+ if updateLog is turned off in your updateHandler, however it is advised
+ to include it as the performance impact is minimal. -->
+ <field name="_version_" type="long" indexed="true" stored="true"/>
+
+ <field name="baseUrl" type="string" stored="true" indexed="true" required="true"/>
+
+ <!-- fetch fields -->
+ <field name="status" type="string" stored="true" indexed="false"/>
+ <field name="prevFetchTime" type="string" stored="true" indexed="false"/>
+ <field name="fetchTime" type="float" stored="true" indexed="false"/>
+ <field name="fetchInterval" type="url" stored="false" indexed="true"/>
+ <field name="retriesSinceFetch" type="url" stored="true" indexed="true"/>
+ <field name="contentType" type="url" stored="true" indexed="true" />
+ <field name="reprUrl" type="url" stored="true" indexed="true" />
+ <field name="signature" type="url" stored="true" indexed="true" />
+ <field name="prevSignature" type="url" stored="true" indexed="true" />
+ <field name="modifiedTime" type="url" stored="true" indexed="true" />
+ <field name="prevModifiedTime" type="url" stored="true" indexed="true" />
+ <field name="batchId" type="url" stored="true" indexed="true" />
+ <field name="reprUrl" type="url" stored="true" indexed="true" />
+
+ <!-- stored=true for highlighting, use term vectors and positions for fast highlighting -->
+ <field name="content" type="text_general" stored="true" indexed="true"/>
+ <field name="title" type="text_general" stored="true" indexed="true" multiValued="true"/>
+
+ <!-- score fields -->
+ <field name="score" type="url" stored="true" indexed="true" />
+ <field name="headers" type="url" stored="true" indexed="true" />
+ <field name="inlinks" type="url" stored="true" indexed="true" />
+ <field name="outlinks" type="url" stored="true" indexed="true" />
+ <field name="metadata" type="url" stored="true" indexed="true" />
+ <field name="markers" type="url" stored="true" indexed="true" />
+ <field name="parseStatus" type="url" stored="true" indexed="true" />
+ <field name="protocolStatus" type="url" stored="true" indexed="true" />
+
+ </fields>
+ <uniqueKey>baseUrl</uniqueKey>
+ <defaultSearchField>text</defaultSearchField>
+ <solrQueryParser defaultOperator="OR"/>
+
+ <!-- copyField commands copy one field to another at the time a document
+ is added to the index. It's used either to index the same field differently,
+ or to add multiple fields to the same field for easier/faster searching. -->
+
+ <copyField source="content" dest="text"/>
+ <copyField source="url" dest="text"/>
+ <copyField source="title" dest="text"/>
+
+
+</schema>
Modified: nutch/branches/2.x/conf/gora.properties
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/gora.properties?rev=1635988&r1=1635987&r2=1635988&view=diff
==============================================================================
--- nutch/branches/2.x/conf/gora.properties (original)
+++ nutch/branches/2.x/conf/gora.properties Sat Nov 1 16:42:32 2014
@@ -13,6 +13,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+# MORE DOCUMENTATION ON ALL OF THESE PROPERTIES CAN BE FOUND AT
+# http://gora.apache.org/current/
+
#gora.datastore.default=org.apache.gora.mock.store.MockDataStore
#gora.datastore.autocreateschema=true
@@ -83,3 +86,25 @@
#gora.datastore.accumulo.user=root
#gora.datastore.accumulo.password=secret
+############################
+# SolrStore properties #
+############################
+#gora.datastore.default=org.apache.gora.solr.store.SolrStore
+#gora.solrstore.solr.url=http://localhost:9876/solr
+#gora.solrstore.solr.config=solrconfig.xml
+#gora.solrstore.solr.schema=gora-solr-schema.xml
+#gora.solrstore.solr.batchSize=100
+#gora.solrstore.solr.solrjserver=http
+#gora.solrstore.solr.commitWithin=1000
+#gora.solrstore.solr.resultsSize=100
+
+############################
+# MongoDBStore properties #
+############################
+#gora.datastore.default=org.apache.gora.mongodb.store.MongoStore
+#gora.mongodb.override_hadoop_configuration=false
+#gora.mongodb.mapping.file=/gora-mongodb-mapping.xml
+#gora.mongodb.servers=localhost:27017
+#gora.mongodb.db=mytestdatabase
+#gora.mongodb.login=login
+#gora.mongodb.secret=secret
Modified: nutch/branches/2.x/ivy/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/ivy/ivy.xml?rev=1635988&r1=1635987&r2=1635988&view=diff
==============================================================================
--- nutch/branches/2.x/ivy/ivy.xml (original)
+++ nutch/branches/2.x/ivy/ivy.xml Sat Nov 1 16:42:32 2014
@@ -103,7 +103,7 @@
<!-- N.B. To use Gora SNAPSHOT's merely replace the 'ref' value with the SNAPSHOT version
and add changing="true" alongside the dependency declaration. An example has been
provided for the gora-core dependency as below -->
- <dependency org="org.apache.gora" name="gora-core" rev="0.4" conf="*->default"/>
+ <dependency org="org.apache.gora" name="gora-core" rev="0.5" conf="*->default"/>
<!-- Uncomment this to use SQL as Gora backend. It should be noted that the
gora-sql 0.1.1-incubating artifact is NOT compatable with gora-core 0.3. Users should
@@ -116,35 +116,43 @@
<dependency org="mysql" name="mysql-connector-java" rev="5.1.18" conf="*->default"/>
-->
<!-- Uncomment this to use HBase as Gora backend. -->
- <!--
- <dependency org="org.apache.gora" name="gora-hbase" rev="0.4" conf="*->default" />
+ <!--
+ <dependency org="org.apache.gora" name="gora-hbase" rev="0.5" conf="*->default" />
-->
<!-- Uncomment this to use Accumulo as Gora backend. -->
<!--
- <dependency org="org.apache.gora" name="gora-accumulo" rev="0.4" conf="*->default" />
+ <dependency org="org.apache.gora" name="gora-accumulo" rev="0.5" conf="*->default" />
-->
<!-- Uncomment this to use Cassandra as Gora backend. -->
<!--
- <dependency org="org.apache.gora" name="gora-cassandra" rev="0.4" conf="*->default" />
+ <dependency org="org.apache.gora" name="gora-cassandra" rev="0.5" conf="*->default" />
-->
-
- <!-- web app dependencies -->
+ <!-- Uncomment this to use MongoDB as Gora backend. -->
+ <!--
+ <dependency org="org.apache.gora" name="gora-mongodb" rev="0.5" conf="*->default" />
+ -->
+ <!-- Uncomment this to use Solr as Gora backend. -->
+ <!--
+ <dependency org="org.apache.gora" name="gora-solr" rev="0.5" conf="*->default" />
+ -->
+
+ <!-- web app dependencies -->
- <dependency org="org.apache.commons" name="commons-collections4" rev="4.0" conf="*->default" />
- <dependency org="org.springframework" name="spring-core" rev="4.0.4.RELEASE" conf="*->default" />
- <dependency org="org.springframework" name="spring-context" rev="4.0.4.RELEASE" conf="*->default" />
- <dependency org="org.springframework" name="spring-web" rev="4.0.4.RELEASE" conf="*->default" />
+ <dependency org="org.apache.commons" name="commons-collections4" rev="4.0" conf="*->default" />
+ <dependency org="org.springframework" name="spring-core" rev="4.0.4.RELEASE" conf="*->default" />
+ <dependency org="org.springframework" name="spring-context" rev="4.0.4.RELEASE" conf="*->default" />
+ <dependency org="org.springframework" name="spring-web" rev="4.0.4.RELEASE" conf="*->default" />
- <dependency org="com.sun.jersey" name="jersey-client" rev="1.8" conf="*->default" />
+ <dependency org="com.sun.jersey" name="jersey-client" rev="1.8" conf="*->default" />
- <dependency org="com.j256.ormlite" name="ormlite-jdbc" rev="4.48" conf="*->default" />
- <dependency org="com.h2database" name="h2" rev="1.4.180" conf="*->default" />
- <dependency org="org.eclipse.persistence" name="javax.persistence" rev="2.0.0" conf="*->default" />
+ <dependency org="com.j256.ormlite" name="ormlite-jdbc" rev="4.48" conf="*->default" />
+ <dependency org="com.h2database" name="h2" rev="1.4.180" conf="*->default" />
+ <dependency org="org.eclipse.persistence" name="javax.persistence" rev="2.0.0" conf="*->default" />
- <dependency org="org.apache.wicket" name="wicket-core" rev="6.16.0" conf="*->default" />
- <dependency org="org.apache.wicket" name="wicket-spring" rev="6.16.0" conf="*->default" />
- <dependency org="de.agilecoders.wicket" name="wicket-bootstrap-core" rev="0.9.2" conf="*->default" />
- <dependency org="de.agilecoders.wicket" name="wicket-bootstrap-extensions" rev="0.9.2" conf="*->default" />
+ <dependency org="org.apache.wicket" name="wicket-core" rev="6.16.0" conf="*->default" />
+ <dependency org="org.apache.wicket" name="wicket-spring" rev="6.16.0" conf="*->default" />
+ <dependency org="de.agilecoders.wicket" name="wicket-bootstrap-core" rev="0.9.2" conf="*->default" />
+ <dependency org="de.agilecoders.wicket" name="wicket-bootstrap-extensions" rev="0.9.2" conf="*->default" />
<!--global exclusion -->
<exclude module="ant" />
Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/WebTableReader.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/WebTableReader.java?rev=1635988&r1=1635987&r2=1635988&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/WebTableReader.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/WebTableReader.java Sat Nov 1 16:42:32 2014
@@ -16,7 +16,6 @@
******************************************************************************/
package org.apache.nutch.crawl;
-import org.apache.avro.util.Utf8;
import org.apache.gora.mapreduce.GoraMapper;
import org.apache.gora.query.Query;
import org.apache.gora.query.Result;
@@ -371,7 +370,15 @@ public class WebTableReader extends Nutc
sb.append("score:\t" + page.getScore()).append("\n");
Map<CharSequence, CharSequence> markers = page.getMarkers();
- sb.append("markers:\t" + markers).append("\n");
+ if (markers != null) {
+ Iterator<Entry<CharSequence, CharSequence>> iterator = markers.entrySet()
+ .iterator();
+ while (iterator.hasNext()) {
+ Entry<CharSequence, CharSequence> entry = iterator.next();
+ sb.append("marker " + entry.getKey().toString()).append(" : \t")
+ .append(entry.getValue()).append("\n");
+ }
+ }
sb.append("reprUrl:\t" + page.getReprUrl()).append("\n");
CharSequence batchId = page.getBatchId();
if (batchId != null) {
Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java?rev=1635988&r1=1635987&r2=1635988&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java Sat Nov 1 16:42:32 2014
@@ -104,9 +104,9 @@ public class ParserChecker implements To
ProtocolFactory factory = new ProtocolFactory(conf);
Protocol protocol = factory.getProtocol(url);
WebPage page = WebPage.newBuilder().build();
-
+
ProtocolOutput protocolOutput = protocol.getProtocolOutput(url, page);
-
+
if(!protocolOutput.getStatus().isSuccess()) {
LOG.error("Fetch failed with protocol status: "
+ ProtocolStatusUtils.getName(protocolOutput.getStatus().getCode())
@@ -114,7 +114,7 @@ public class ParserChecker implements To
return (-1);
}
Content content = protocolOutput.getContent();
-
+
if (content == null) {
LOG.error("No content for " + url);
return (-1);
@@ -145,10 +145,10 @@ public class ParserChecker implements To
LOG.error("Problem with parse - check log");
return (-1);
}
-
+
// Calculate the signature
byte[] signature = SignatureFactory.getSignature(getConf()).calculate(page);
-
+
if (LOG.isInfoEnabled()) {
LOG.info("parsing: " + url);
LOG.info("contentType: " + contentType);
@@ -167,7 +167,7 @@ public class ParserChecker implements To
while (iterator.hasNext()) {
Entry<CharSequence, ByteBuffer> entry = iterator.next();
sb.append(entry.getKey().toString()).append(" : \t")
- .append(Bytes.toString(entry.getValue())).append("\n");
+ .append(Bytes.toString(entry.getValue())).append("\n");
}
System.out.print(sb.toString());
}
@@ -177,6 +177,21 @@ public class ParserChecker implements To
sb.append(" outlink: ").append(l).append('\n');
}
System.out.print(sb.toString());
+ if (page.getHeaders() != null) {
+ LOG.info("---------\nHeaders\n---------\n");
+ Map<CharSequence, CharSequence> headers = page.getHeaders();
+ StringBuffer headersb = new StringBuffer();
+ if (metadata != null) {
+ Iterator<Entry<CharSequence, CharSequence>> iterator = headers.entrySet()
+ .iterator();
+ while (iterator.hasNext()) {
+ Entry<CharSequence, CharSequence> entry = iterator.next();
+ headersb.append(entry.getKey().toString()).append(" : \t")
+ .append(entry.getValue()).append("\n");
+ }
+ System.out.print(headersb.toString());
+ }
+ }
if (dumpText) {
LOG.info("---------\nParseText\n---------\n");
System.out.print(parse.getText());
Modified: nutch/branches/2.x/src/java/org/apache/nutch/storage/Host.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/storage/Host.java?rev=1635988&r1=1635987&r2=1635988&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/storage/Host.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/storage/Host.java Sat Nov 1 16:42:32 2014
@@ -1,3 +1,19 @@
+/*******************************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ ******************************************************************************/
/**
* Autogenerated by Avro
*
@@ -61,6 +77,14 @@ public class Host extends org.apache.gor
"outlinks",
"inlinks",
};
+
+ /**
+ * Gets the total field count.
+ * @return int field count
+ */
+ public int getFieldsCount() {
+ return Host._ALL_FIELDS.length;
+ }
/** Bytes used to represent weather or not a field is dirty. */
private java.nio.ByteBuffer __g__dirty = java.nio.ByteBuffer.wrap(new byte[1]);
Modified: nutch/branches/2.x/src/java/org/apache/nutch/storage/ParseStatus.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/storage/ParseStatus.java?rev=1635988&r1=1635987&r2=1635988&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/storage/ParseStatus.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/storage/ParseStatus.java Sat Nov 1 16:42:32 2014
@@ -1,3 +1,19 @@
+/*******************************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ ******************************************************************************/
/**
* Autogenerated by Avro
*
@@ -57,6 +73,14 @@ public class ParseStatus extends org.apa
"minorCode",
"args",
};
+
+ /**
+ * Gets the total field count.
+ * @return int field count
+ */
+ public int getFieldsCount() {
+ return ParseStatus._ALL_FIELDS.length;
+ }
/** Bytes used to represent weather or not a field is dirty. */
private java.nio.ByteBuffer __g__dirty = java.nio.ByteBuffer.wrap(new byte[1]);
Modified: nutch/branches/2.x/src/java/org/apache/nutch/storage/ProtocolStatus.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/storage/ProtocolStatus.java?rev=1635988&r1=1635987&r2=1635988&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/storage/ProtocolStatus.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/storage/ProtocolStatus.java Sat Nov 1 16:42:32 2014
@@ -1,3 +1,19 @@
+/*******************************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ ******************************************************************************/
/**
* Autogenerated by Avro
*
@@ -61,6 +77,14 @@ public class ProtocolStatus extends org.
"lastModified",
};
+ /**
+ * Gets the total field count.
+ * @return int field count
+ */
+ public int getFieldsCount() {
+ return ProtocolStatus._ALL_FIELDS.length;
+ }
+
/** Bytes used to represent weather or not a field is dirty. */
private java.nio.ByteBuffer __g__dirty = java.nio.ByteBuffer.wrap(new byte[1]);
private int code;
Modified: nutch/branches/2.x/src/java/org/apache/nutch/storage/WebPage.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/storage/WebPage.java?rev=1635988&r1=1635987&r2=1635988&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/storage/WebPage.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/storage/WebPage.java Sat Nov 1 16:42:32 2014
@@ -1,3 +1,19 @@
+/*******************************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ ******************************************************************************/
/**
* Autogenerated by Avro
*
@@ -100,6 +116,15 @@ public class WebPage extends org.apache.
"batchId",
};
+ /**
+ * Gets the total field count.
+ * @return int field count
+ */
+ public int getFieldsCount() {
+ return WebPage._ALL_FIELDS.length;
+ }
+
+
/** Bytes used to represent weather or not a field is dirty. */
private java.nio.ByteBuffer __g__dirty = java.nio.ByteBuffer.wrap(new byte[4]);
private CharSequence baseUrl;