You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2014/11/11 08:22:02 UTC
svn commit: r1638038 - in /nutch/trunk: build.xml conf/nutch-site.xml.template conf/regex-urlfilter.txt.template ivy/ivy.xml src/plugin/parse-tika/ivy.xml

Author: lewismc
Date: Tue Nov 11 07:22:02 2014
New Revision: 1638038

URL: http://svn.apache.org/r1638038
Log:
Correct formatting in build.xml Ant targets help

Modified:
    nutch/trunk/build.xml
    nutch/trunk/conf/nutch-site.xml.template
    nutch/trunk/conf/regex-urlfilter.txt.template
    nutch/trunk/ivy/ivy.xml
    nutch/trunk/src/plugin/parse-tika/ivy.xml

Modified: nutch/trunk/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=1638038&r1=1638037&r2=1638038&view=diff
==============================================================================
--- nutch/trunk/build.xml (original)
+++ nutch/trunk/build.xml Tue Nov 11 07:22:02 2014
@@ -912,7 +912,7 @@
   </path>
   
   <!-- target: ant-eclipse-download   =================================== -->
-  <target name="ant-eclipse-download" description="Downloads the ant-eclipse binary.">
+  <target name="ant-eclipse-download" description="--> Downloads the ant-eclipse binary">
     <get src="http://downloads.sourceforge.net/project/ant-eclipse/ant-eclipse/1.0/ant-eclipse-1.0.bin.tar.bz2"
          dest="${build.dir}/ant-eclipse-1.0.bin.tar.bz2" usetimestamp="false" />
 
@@ -929,7 +929,7 @@
   <!-- target: eclipse   ================================================ -->
   <target name="eclipse" 
           depends="clean-eclipse,init,resolve-test,job,ant-eclipse-download"
-          description="Create eclipse project files">
+          description="--> Create eclipse project files">
 
 	     <pathconvert property="eclipse.project">
 	       <path path="${basedir}"/>

Modified: nutch/trunk/conf/nutch-site.xml.template
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-site.xml.template?rev=1638038&r1=1638037&r2=1638038&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-site.xml.template (original)
+++ nutch/trunk/conf/nutch-site.xml.template Tue Nov 11 07:22:02 2014
@@ -1,8 +1,132 @@
 <?xml version="1.0"?>
 <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
-
 <!-- Put site-specific property overrides in this file. -->
-
 <configuration>
-
+<property>
+<name>http.agent.name</name>
+<value>My Nutch Spider</value>
+</property>
+<property>
+<name>plugin.includes</name>
+<value>protocol-http|protocol-httpclient|urlfilter-regex|parse-(html|tika|metatags|swf|zip)|subcollection|tld|urlmeta|creativecommons|language-identifier|microformats-reltag|feed|headings|index-(basic|anchor|metadata|more)|indexer-solr|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
+<description>Regular expression naming plugin directory names to
+include. Any plugin not matching this expression is excluded.
+In any case you need to include at least the nutch-extensionpoints plugin. By
+default Nutch includes crawling just HTML and plain text via HTTP,
+and basic indexing and search plugins. In order to use HTTPS please enable
+protocol-httpclient, but be aware of possible intermittent problems with the
+underlying commons-httpclient library.
+</description>
+</property>
+<!-- Used only if plugin parse-metatags is enabled. -->
+<property>
+<name>metatags.names</name>
+<value>description,keywords</value>
+<description> Names of the metatags to extract, separated by ','.
+Use '*' to extract all metatags. Prefixes the names with 'metatag.'
+in the parse-metadata. For instance to index description and keywords,
+you need to activate the plugin index-metadata and set the value of the
+parameter 'index.parse.md' to 'metatag.description,metatag.keywords'.
+</description>
+</property>
+<property>
+<name>index.parse.md</name>
+<value>metatag.description,metatag.keywords,producer</value>
+<description>
+Comma-separated list of keys to be taken from the parse metadata to generate fields.
+Can be used e.g. for 'description' or 'keywords' provided that these values are generated
+by a parser (see parse-metatags plugin)
+</description>
+</property>
+<property>
+<name>index.content.md</name>
+<value>ETag,Server</value>
+<description>
+Comma-separated list of keys to be taken from the content metadata to generate fields.
+Can be used e.g. for 'ETag' or 'Server', provided that these keys are present
+in the content metadata of the fetched document.
+</description>
+</property>
+<!--<property>
+     <name>index.db.md</name>
+     <value>xmpTPg:NPages,xmp:CreatorTool</value>
+     <description>
+     Comma-separated list of keys to be taken from the parse metadata to generate fields.
+     Can be used e.g. for 'description' or 'keywords' provided that these values are generated
+     by a parser (see parse-metatags plugin)
+     </description>
+     </property>-->
+<property>
+<name>file.content.limit</name>
+<value>-1</value>
+<description>The length limit for downloaded content, in bytes.
+If this value is nonnegative (>=0), content longer than it will be truncated;
+otherwise, no truncation at all.
+</description>
+</property>
+<property>
+<name>http.timeout</name>
+<value>30000</value>
+<description>The default network timeout, in milliseconds.</description>
+</property>
+<property>
+<name>http.content.limit</name>
+<value>-1</value>
+<description>The length limit for downloaded content using the http://
+protocol, in bytes. If this value is nonnegative (>=0), content longer
+than it will be truncated; otherwise, no truncation at all. Do not
+confuse this setting with the file.content.limit setting.
+</description>
+</property>
+<!-- solr index properties -->
+<property>
+<name>solr.server.url</name>
+<value>http://localhost:8983/solr/collection1</value>
+<description>
+Defines the URL of the Solr server (including the core or collection path)
+to which Nutch sends documents for indexing.
+</description>
+</property>
+<property>
+<name>solr.commit.size</name>
+<value>1000</value>
+<description>
+Defines the number of documents to send to Solr in a single update batch.
+Decrease when handling very large documents to prevent Nutch from running
+out of memory. NOTE: It does not explicitly trigger a server side commit.
+</description>
+</property>
+<property>
+<name>solr.mapping.file</name>
+<value>solrindex-mapping.xml</value>
+<description>
+Defines the name of the file that will be used in the mapping of internal
+nutch field names to solr index fields as specified in the target Solr schema.
+</description>
+</property>
+<property>
+<name>solr.auth</name>
+<value>false</value>
+<description>
+Whether to enable HTTP basic authentication for communicating with Solr.
+Use the solr.auth.username and solr.auth.password properties to configure
+your credentials.
+</description>
+</property>
+<property>
+<name>moreIndexingFilter.indexMimeTypeParts</name>
+<value>false</value>
+<description>Determines whether the index-more plugin will split the mime-type
+into sub-parts; this requires the type field to be multi-valued. Set to true for backward
+compatibility. False will not split the mime-type.
+</description>
+</property>
+<property>
+<name>http.redirect.max</name>
+<value>5</value>
+<description>The maximum number of redirects the fetcher will follow when
+trying to fetch a page. If set to negative or 0, fetcher won't immediately
+follow redirected URLs, instead it will record them for later fetching.
+</description>
+</property>
 </configuration>

Modified: nutch/trunk/conf/regex-urlfilter.txt.template
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/regex-urlfilter.txt.template?rev=1638038&r1=1638037&r2=1638038&view=diff
==============================================================================
--- nutch/trunk/conf/regex-urlfilter.txt.template (original)
+++ nutch/trunk/conf/regex-urlfilter.txt.template Tue Nov 11 07:22:02 2014
@@ -27,7 +27,7 @@
 
 # skip image and other suffixes we can't yet parse
 # for a more extensive coverage use the urlfilter-suffix plugin
--\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|CSS|sit|SIT|eps|EPS|wmf|WMF|zip|ZIP|ppt|PPT|mpg|MPG|xls|XLS|gz|GZ|rpm|RPM|tgz|TGZ|mov|MOV|exe|EXE|jpeg|JPEG|bmp|BMP|js|JS)$
+#-\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|CSS|sit|SIT|eps|EPS|wmf|WMF|zip|ZIP|ppt|PPT|mpg|MPG|xls|XLS|gz|GZ|rpm|RPM|tgz|TGZ|mov|MOV|exe|EXE|jpeg|JPEG|bmp|BMP|js|JS)$
 
 # skip URLs containing certain characters as probable queries, etc.
 -[?*!@=]
@@ -35,5 +35,9 @@
 # skip URLs with slash-delimited segment that repeats 3+ times, to break loops
 -.*(/[^/]+)/[^/]+\1/[^/]+\1/
 
+
 # accept anything else
-+.
+# #+^http://([a-z0-9]*\.)*nutch.apache.org/
+# #+^file:/Users/AngelaWang/Documents/programs/oodt/cas-curator-0.6/staging/products/xml
+# #+^.
++^https://([a-zA-Z0-9]*\.)*www.aoncadis.org/

Modified: nutch/trunk/ivy/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/ivy/ivy.xml?rev=1638038&r1=1638037&r2=1638038&view=diff
==============================================================================
--- nutch/trunk/ivy/ivy.xml (original)
+++ nutch/trunk/ivy/ivy.xml Tue Nov 11 07:22:02 2014
@@ -60,7 +60,9 @@
 			<exclude org="ant" name="ant" />
 		</dependency>
 
-		<dependency org="org.apache.tika" name="tika-core" rev="1.6" />
+		<dependency org="org.apache.tika" name="tika-core" rev="1.7-SNAPSHOT" >
+		  <exclude module="slf4j-api" />
+		</dependency>
 		<dependency org="com.ibm.icu" name="icu4j" rev="4.0.1" />
 
 		<dependency org="xerces" name="xercesImpl" rev="2.9.1" />

Modified: nutch/trunk/src/plugin/parse-tika/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/ivy.xml?rev=1638038&r1=1638037&r2=1638038&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/ivy.xml (original)
+++ nutch/trunk/src/plugin/parse-tika/ivy.xml Tue Nov 11 07:22:02 2014
@@ -36,7 +36,7 @@
   </publications>
 
   <dependencies>
-    <dependency org="org.apache.tika" name="tika-parsers" rev="1.6" conf="*->default">
+    <dependency org="org.apache.tika" name="tika-parsers" rev="1.7-SNAPSHOT" conf="*->default" changing="true">
      <exclude org="org.apache.tika" name="tika-core" />
     </dependency>
   </dependencies>