You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2014/11/11 08:33:07 UTC

svn commit: r1638040 - in /nutch/trunk: build.xml conf/nutch-site.xml.template conf/regex-urlfilter.txt.template ivy/ivy.xml src/plugin/parse-tika/ivy.xml

Author: lewismc
Date: Tue Nov 11 07:33:06 2014
New Revision: 1638040

URL: http://svn.apache.org/r1638040
Log:
Revert botched commit

Modified:
    nutch/trunk/build.xml
    nutch/trunk/conf/nutch-site.xml.template
    nutch/trunk/conf/regex-urlfilter.txt.template
    nutch/trunk/ivy/ivy.xml
    nutch/trunk/src/plugin/parse-tika/ivy.xml

Modified: nutch/trunk/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=1638040&r1=1638039&r2=1638040&view=diff
==============================================================================
--- nutch/trunk/build.xml (original)
+++ nutch/trunk/build.xml Tue Nov 11 07:33:06 2014
@@ -912,7 +912,7 @@
   </path>
   
   <!-- target: ant-eclipse-download   =================================== -->
-  <target name="ant-eclipse-download" description="--> Downloads the ant-eclipse binary">
+  <target name="ant-eclipse-download" description="Downloads the ant-eclipse binary.">
     <get src="http://downloads.sourceforge.net/project/ant-eclipse/ant-eclipse/1.0/ant-eclipse-1.0.bin.tar.bz2"
          dest="${build.dir}/ant-eclipse-1.0.bin.tar.bz2" usetimestamp="false" />
 
@@ -929,7 +929,7 @@
   <!-- target: eclipse   ================================================ -->
   <target name="eclipse" 
           depends="clean-eclipse,init,resolve-test,job,ant-eclipse-download"
-          description="--> Create eclipse project files">
+          description="Create eclipse project files">
 
 	     <pathconvert property="eclipse.project">
 	       <path path="${basedir}"/>

Modified: nutch/trunk/conf/nutch-site.xml.template
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-site.xml.template?rev=1638040&r1=1638039&r2=1638040&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-site.xml.template (original)
+++ nutch/trunk/conf/nutch-site.xml.template Tue Nov 11 07:33:06 2014
@@ -1,132 +1,8 @@
 <?xml version="1.0"?>
 <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
 <!-- Put site-specific property overrides in this file. -->
+
 <configuration>
-<property>
-<name>http.agent.name</name>
-<value>My Nutch Spider</value>
-</property>
-<property>
-<name>plugin.includes</name>
-<value>protocol-http|protocol-httpclient|urlfilter-regex|parse-(html|tika|metatags|swf|zip)|subcollection|tld|urlmeta|creativecommons|language-identifier|microformats-reltag|feed|headings|index-(basic|anchor|metadata|more)|indexer-solr|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
-<description>Regular expression naming plugin directory names to
-include. Any plugin not matching this expression is excluded.
-In any case you need at least include the nutch-extensionpoints plugin. By
-default Nutch includes crawling just HTML and plain text via HTTP,
-and basic indexing and search plugins. In order to use HTTPS please enable
-protocol-httpclient, but be aware of possible intermittent problems with the
-underlying commons-httpclient library.
-</description>
-</property>
-<!-- Used only if plugin parse-metatags is enabled. -->
-<property>
-<name>metatags.names</name>
-<value>description,keywords</value>
-<description> Names of the metatags to extract, separated by ','.
-Use '*' to extract all metatags. Prefixes the names with 'metatag.'
-in the parse-metadata. For instance to index description and keywords,
-you need to activate the plugin index-metadata and set the value of the
-parameter 'index.parse.md' to 'metatag.description,metatag.keywords'.
-</description>
-</property>
-<property>
-<name>index.parse.md</name>
-<value>metatag.description,metatag.keywords,producer</value>
-<description>
-Comma-separated list of keys to be taken from the parse metadata to generate fields.
-Can be used e.g. for 'description' or 'keywords' provided that these values are generated
-by a parser (see parse-metatags plugin)
-</description>
-</property>
-<property>
-<name>index.content.md</name>
-<value>ETag,Server</value>
-<description>
-Comma-separated list of keys to be taken from the content metadata to generate fields.
-Can be used e.g. for 'description' or 'keywords' provided that these values are generated
-by a parser (see parse-metatags plugin)
-</description>
-</property>
-<!--<property>
-     <name>index.db.md</name>
-     <value>xmpTPg:NPages,xmp:CreatorTool</value>
-     <description>
-     Comma-separated list of keys to be taken from the parse metadata to generate fields.
-     Can be used e.g. for 'description' or 'keywords' provided that these values are generated
-     by a parser (see parse-metatags plugin)
-     </description>
-     </property>-->
-<property>
-<name>file.content.limit</name>
-<value>-1</value>
-<description>The length limit for downloaded content, in bytes.
-If this value is nonnegative (>=0), content longer than it will be truncated;
-otherwise, no truncation at all.
-</description>
-</property>
-<property>
-<name>http.timeout</name>
-<value>30000</value>
-<description>The default network timeout, in milliseconds.</description>
-</property>
-<property>
-<name>http.content.limit</name>
-<value>-1</value>
-<description>The length limit for downloaded content using the http://
-protocol, in bytes. If this value is nonnegative (>=0), content longer
-than it will be truncated; otherwise, no truncation at all. Do not
-confuse this setting with the file.content.limit setting.
-</description>
-</property>
-<!-- solr index properties -->
-<property>
-<name>solr.server.url</name>
-<value>http://localhost:8983/solr/collection1</value>
-<description>
-Defines the name of the file that will be used in the mapping of internal
-nutch field names to solr index fields as specified in the target Solr schema.
-</description>
-</property>
-<property>
-<name>solr.commit.size</name>
-<value>1000</value>
-<description>
-Defines the number of documents to send to Solr in a single update batch.
-Decrease when handling very large documents to prevent Nutch from running
-out of memory. NOTE: It does not explicitly trigger a server side commit.
-</description>
-</property>
-<property>
-<name>solr.mapping.file</name>
-<value>solrindex-mapping.xml</value>
-<description>
-Defines the name of the file that will be used in the mapping of internal
-nutch field names to solr index fields as specified in the target Solr schema.
-</description>
-</property>
-<property>
-<name>solr.auth</name>
-<value>false</value>
-<description>
-Whether to enable HTTP basic authentication for communicating with Solr.
-Use the solr.auth.username and solr.auth.password properties to configure
-your credentials.
-</description>
-</property>
-<property>
-<name>moreIndexingFilter.indexMimeTypeParts</name>
-<value>false</value>
-<description>Determines whether the index-more plugin will split the mime-type
-in sub parts, this requires the type field to be multi valued. Set to true for backward
-compatibility. False will not split the mime-type.
-</description>
-</property>
-<property>
-<name>http.redirect.max</name>
-<value>5</value>
-<description>The maximum number of redirects the fetcher will follow when
-trying to fetch a page. If set to negative or 0, fetcher won't immediately
-follow redirected URLs, instead it will record them for later fetching.
-</description>
-</property>
+
 </configuration>

Modified: nutch/trunk/conf/regex-urlfilter.txt.template
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/regex-urlfilter.txt.template?rev=1638040&r1=1638039&r2=1638040&view=diff
==============================================================================
--- nutch/trunk/conf/regex-urlfilter.txt.template (original)
+++ nutch/trunk/conf/regex-urlfilter.txt.template Tue Nov 11 07:33:06 2014
@@ -27,7 +27,7 @@
 
 # skip image and other suffixes we can't yet parse
 # for a more extensive coverage use the urlfilter-suffix plugin
-#-\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|CSS|sit|SIT|eps|EPS|wmf|WMF|zip|ZIP|ppt|PPT|mpg|MPG|xls|XLS|gz|GZ|rpm|RPM|tgz|TGZ|mov|MOV|exe|EXE|jpeg|JPEG|bmp|BMP|js|JS)$
+-\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|CSS|sit|SIT|eps|EPS|wmf|WMF|zip|ZIP|ppt|PPT|mpg|MPG|xls|XLS|gz|GZ|rpm|RPM|tgz|TGZ|mov|MOV|exe|EXE|jpeg|JPEG|bmp|BMP|js|JS)$
 
 # skip URLs containing certain characters as probable queries, etc.
 -[?*!@=]
@@ -35,9 +35,5 @@
 # skip URLs with slash-delimited segment that repeats 3+ times, to break loops
 -.*(/[^/]+)/[^/]+\1/[^/]+\1/
 
-
 # accept anything else
-# #+^http://([a-z0-9]*\.)*nutch.apache.org/
-# #+^file:/Users/AngelaWang/Documents/programs/oodt/cas-curator-0.6/staging/products/xml
-# #+^.
-+^https://([a-zA-Z0-9]*\.)*www.aoncadis.org/
++.

Modified: nutch/trunk/ivy/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/ivy/ivy.xml?rev=1638040&r1=1638039&r2=1638040&view=diff
==============================================================================
--- nutch/trunk/ivy/ivy.xml (original)
+++ nutch/trunk/ivy/ivy.xml Tue Nov 11 07:33:06 2014
@@ -60,9 +60,7 @@
 			<exclude org="ant" name="ant" />
 		</dependency>
 
-		<dependency org="org.apache.tika" name="tika-core" rev="1.7-SNAPSHOT" >
-		  <exclude module="slf4j-api" />
-		</dependency>
+		<dependency org="org.apache.tika" name="tika-core" rev="1.6" />
 		<dependency org="com.ibm.icu" name="icu4j" rev="4.0.1" />
 
 		<dependency org="xerces" name="xercesImpl" rev="2.9.1" />

Modified: nutch/trunk/src/plugin/parse-tika/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/ivy.xml?rev=1638040&r1=1638039&r2=1638040&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/ivy.xml (original)
+++ nutch/trunk/src/plugin/parse-tika/ivy.xml Tue Nov 11 07:33:06 2014
@@ -36,7 +36,7 @@
   </publications>
 
   <dependencies>
-    <dependency org="org.apache.tika" name="tika-parsers" rev="1.7-SNAPSHOT" conf="*->default" changing="true">
+    <dependency org="org.apache.tika" name="tika-parsers" rev="1.6" conf="*->default">
      <exclude org="org.apache.tika" name="tika-core" />
     </dependency>
   </dependencies>