You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2014/11/11 08:33:07 UTC
svn commit: r1638040 - in /nutch/trunk: build.xml
conf/nutch-site.xml.template conf/regex-urlfilter.txt.template ivy/ivy.xml
src/plugin/parse-tika/ivy.xml
Author: lewismc
Date: Tue Nov 11 07:33:06 2014
New Revision: 1638040
URL: http://svn.apache.org/r1638040
Log:
Revert botched commit
Modified:
nutch/trunk/build.xml
nutch/trunk/conf/nutch-site.xml.template
nutch/trunk/conf/regex-urlfilter.txt.template
nutch/trunk/ivy/ivy.xml
nutch/trunk/src/plugin/parse-tika/ivy.xml
Modified: nutch/trunk/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=1638040&r1=1638039&r2=1638040&view=diff
==============================================================================
--- nutch/trunk/build.xml (original)
+++ nutch/trunk/build.xml Tue Nov 11 07:33:06 2014
@@ -912,7 +912,7 @@
</path>
<!-- target: ant-eclipse-download =================================== -->
- <target name="ant-eclipse-download" description="--> Downloads the ant-eclipse binary">
+ <target name="ant-eclipse-download" description="Downloads the ant-eclipse binary.">
<get src="http://downloads.sourceforge.net/project/ant-eclipse/ant-eclipse/1.0/ant-eclipse-1.0.bin.tar.bz2"
dest="${build.dir}/ant-eclipse-1.0.bin.tar.bz2" usetimestamp="false" />
@@ -929,7 +929,7 @@
<!-- target: eclipse ================================================ -->
<target name="eclipse"
depends="clean-eclipse,init,resolve-test,job,ant-eclipse-download"
- description="--> Create eclipse project files">
+ description="Create eclipse project files">
<pathconvert property="eclipse.project">
<path path="${basedir}"/>
Modified: nutch/trunk/conf/nutch-site.xml.template
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-site.xml.template?rev=1638040&r1=1638039&r2=1638040&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-site.xml.template (original)
+++ nutch/trunk/conf/nutch-site.xml.template Tue Nov 11 07:33:06 2014
@@ -1,132 +1,8 @@
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
<!-- Put site-specific property overrides in this file. -->
+
<configuration>
-<property>
-<name>http.agent.name</name>
-<value>My Nutch Spider</value>
-</property>
-<property>
-<name>plugin.includes</name>
-<value>protocol-http|protocol-httpclient|urlfilter-regex|parse-(html|tika|metatags|swf|zip)|subcollection|tld|urlmeta|creativecommons|language-identifier|microformats-reltag|feed|headings|index-(basic|anchor|metadata|more)|indexer-solr|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
-<description>Regular expression naming plugin directory names to
-include. Any plugin not matching this expression is excluded.
-In any case you need at least include the nutch-extensionpoints plugin. By
-default Nutch includes crawling just HTML and plain text via HTTP,
-and basic indexing and search plugins. In order to use HTTPS please enable
-protocol-httpclient, but be aware of possible intermittent problems with the
-underlying commons-httpclient library.
-</description>
-</property>
-<!-- Used only if plugin parse-metatags is enabled. -->
-<property>
-<name>metatags.names</name>
-<value>description,keywords</value>
-<description> Names of the metatags to extract, separated by ','.
-Use '*' to extract all metatags. Prefixes the names with 'metatag.'
-in the parse-metadata. For instance to index description and keywords,
-you need to activate the plugin index-metadata and set the value of the
-parameter 'index.parse.md' to 'metatag.description,metatag.keywords'.
-</description>
-</property>
-<property>
-<name>index.parse.md</name>
-<value>metatag.description,metatag.keywords,producer</value>
-<description>
-Comma-separated list of keys to be taken from the parse metadata to generate fields.
-Can be used e.g. for 'description' or 'keywords' provided that these values are generated
-by a parser (see parse-metatags plugin)
-</description>
-</property>
-<property>
-<name>index.content.md</name>
-<value>ETag,Server</value>
-<description>
-Comma-separated list of keys to be taken from the content metadata to generate fields.
-Can be used e.g. for 'description' or 'keywords' provided that these values are generated
-by a parser (see parse-metatags plugin)
-</description>
-</property>
-<!--<property>
- <name>index.db.md</name>
- <value>xmpTPg:NPages,xmp:CreatorTool</value>
- <description>
- Comma-separated list of keys to be taken from the parse metadata to generate fields.
- Can be used e.g. for 'description' or 'keywords' provided that these values are generated
- by a parser (see parse-metatags plugin)
- </description>
- </property>-->
-<property>
-<name>file.content.limit</name>
-<value>-1</value>
-<description>The length limit for downloaded content, in bytes.
-If this value is nonnegative (>=0), content longer than it will be truncated;
-otherwise, no truncation at all.
-</description>
-</property>
-<property>
-<name>http.timeout</name>
-<value>30000</value>
-<description>The default network timeout, in milliseconds.</description>
-</property>
-<property>
-<name>http.content.limit</name>
-<value>-1</value>
-<description>The length limit for downloaded content using the http://
-protocol, in bytes. If this value is nonnegative (>=0), content longer
-than it will be truncated; otherwise, no truncation at all. Do not
-confuse this setting with the file.content.limit setting.
-</description>
-</property>
-<!-- solr index properties -->
-<property>
-<name>solr.server.url</name>
-<value>http://localhost:8983/solr/collection1</value>
-<description>
-Defines the name of the file that will be used in the mapping of internal
-nutch field names to solr index fields as specified in the target Solr schema.
-</description>
-</property>
-<property>
-<name>solr.commit.size</name>
-<value>1000</value>
-<description>
-Defines the number of documents to send to Solr in a single update batch.
-Decrease when handling very large documents to prevent Nutch from running
-out of memory. NOTE: It does not explicitly trigger a server side commit.
-</description>
-</property>
-<property>
-<name>solr.mapping.file</name>
-<value>solrindex-mapping.xml</value>
-<description>
-Defines the name of the file that will be used in the mapping of internal
-nutch field names to solr index fields as specified in the target Solr schema.
-</description>
-</property>
-<property>
-<name>solr.auth</name>
-<value>false</value>
-<description>
-Whether to enable HTTP basic authentication for communicating with Solr.
-Use the solr.auth.username and solr.auth.password properties to configure
-your credentials.
-</description>
-</property>
-<property>
-<name>moreIndexingFilter.indexMimeTypeParts</name>
-<value>false</value>
-<description>Determines whether the index-more plugin will split the mime-type
-in sub parts, this requires the type field to be multi valued. Set to true for backward
-compatibility. False will not split the mime-type.
-</description>
-</property>
-<property>
-<name>http.redirect.max</name>
-<value>5</value>
-<description>The maximum number of redirects the fetcher will follow when
-trying to fetch a page. If set to negative or 0, fetcher won't immediately
-follow redirected URLs, instead it will record them for later fetching.
-</description>
-</property>
+
</configuration>
Modified: nutch/trunk/conf/regex-urlfilter.txt.template
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/regex-urlfilter.txt.template?rev=1638040&r1=1638039&r2=1638040&view=diff
==============================================================================
--- nutch/trunk/conf/regex-urlfilter.txt.template (original)
+++ nutch/trunk/conf/regex-urlfilter.txt.template Tue Nov 11 07:33:06 2014
@@ -27,7 +27,7 @@
# skip image and other suffixes we can't yet parse
# for a more extensive coverage use the urlfilter-suffix plugin
-#-\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|CSS|sit|SIT|eps|EPS|wmf|WMF|zip|ZIP|ppt|PPT|mpg|MPG|xls|XLS|gz|GZ|rpm|RPM|tgz|TGZ|mov|MOV|exe|EXE|jpeg|JPEG|bmp|BMP|js|JS)$
+-\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|CSS|sit|SIT|eps|EPS|wmf|WMF|zip|ZIP|ppt|PPT|mpg|MPG|xls|XLS|gz|GZ|rpm|RPM|tgz|TGZ|mov|MOV|exe|EXE|jpeg|JPEG|bmp|BMP|js|JS)$
# skip URLs containing certain characters as probable queries, etc.
-[?*!@=]
@@ -35,9 +35,5 @@
# skip URLs with slash-delimited segment that repeats 3+ times, to break loops
-.*(/[^/]+)/[^/]+\1/[^/]+\1/
-
# accept anything else
-# #+^http://([a-z0-9]*\.)*nutch.apache.org/
-# #+^file:/Users/AngelaWang/Documents/programs/oodt/cas-curator-0.6/staging/products/xml
-# #+^.
-+^https://([a-zA-Z0-9]*\.)*www.aoncadis.org/
++.
Modified: nutch/trunk/ivy/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/ivy/ivy.xml?rev=1638040&r1=1638039&r2=1638040&view=diff
==============================================================================
--- nutch/trunk/ivy/ivy.xml (original)
+++ nutch/trunk/ivy/ivy.xml Tue Nov 11 07:33:06 2014
@@ -60,9 +60,7 @@
<exclude org="ant" name="ant" />
</dependency>
- <dependency org="org.apache.tika" name="tika-core" rev="1.7-SNAPSHOT" >
- <exclude module="slf4j-api" />
- </dependency>
+ <dependency org="org.apache.tika" name="tika-core" rev="1.6" />
<dependency org="com.ibm.icu" name="icu4j" rev="4.0.1" />
<dependency org="xerces" name="xercesImpl" rev="2.9.1" />
Modified: nutch/trunk/src/plugin/parse-tika/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/ivy.xml?rev=1638040&r1=1638039&r2=1638040&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/ivy.xml (original)
+++ nutch/trunk/src/plugin/parse-tika/ivy.xml Tue Nov 11 07:33:06 2014
@@ -36,7 +36,7 @@
</publications>
<dependencies>
- <dependency org="org.apache.tika" name="tika-parsers" rev="1.7-SNAPSHOT" conf="*->default" changing="true">
+ <dependency org="org.apache.tika" name="tika-parsers" rev="1.6" conf="*->default">
<exclude org="org.apache.tika" name="tika-core" />
</dependency>
</dependencies>