You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2010/07/02 19:19:45 UTC
svn commit: r960064 - in /nutch/trunk: ./ conf/ docs/ src/engines/
src/java/org/apache/nutch/analysis/ src/java/org/apache/nutch/clustering/
src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/html/
src/java/org/apache/nutch/indexer/ src/java/or...
Author: ab
Date: Fri Jul 2 17:19:43 2010
New Revision: 960064
URL: http://svn.apache.org/viewvc?rev=960064&view=rev
Log:
NUTCH-837 Remove search servers and Lucene dependencies.
Added:
nutch/trunk/src/java/org/apache/nutch/indexer/NutchField.java (with props)
Removed:
nutch/trunk/conf/common-terms.utf8
nutch/trunk/conf/custom-fields.xml
nutch/trunk/docs/
nutch/trunk/src/engines/
nutch/trunk/src/java/org/apache/nutch/analysis/
nutch/trunk/src/java/org/apache/nutch/clustering/
nutch/trunk/src/java/org/apache/nutch/html/
nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
nutch/trunk/src/java/org/apache/nutch/indexer/FsDirectory.java
nutch/trunk/src/java/org/apache/nutch/indexer/HighFreqTerms.java
nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java
nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java
nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
nutch/trunk/src/java/org/apache/nutch/indexer/NutchSimilarity.java
nutch/trunk/src/java/org/apache/nutch/indexer/field/
nutch/trunk/src/java/org/apache/nutch/indexer/lucene/
nutch/trunk/src/java/org/apache/nutch/ontology/
nutch/trunk/src/java/org/apache/nutch/searcher/
nutch/trunk/src/java/org/apache/nutch/servlet/
nutch/trunk/src/java/org/apache/nutch/tools/PruneIndexTool.java
nutch/trunk/src/java/org/apache/nutch/tools/SearchLoadTester.java
nutch/trunk/src/java/org/apache/nutch/tools/compat/
nutch/trunk/src/plugin/analysis-de/
nutch/trunk/src/plugin/analysis-fr/
nutch/trunk/src/plugin/clustering-carrot2/
nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCDeleteUnlicensedTool.java
nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCQueryFilter.java
nutch/trunk/src/plugin/field-basic/
nutch/trunk/src/plugin/field-boost/
nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageQueryFilter.java
nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagQueryFilter.java
nutch/trunk/src/plugin/ontology/
nutch/trunk/src/plugin/query-basic/
nutch/trunk/src/plugin/query-custom/
nutch/trunk/src/plugin/query-more/
nutch/trunk/src/plugin/query-site/
nutch/trunk/src/plugin/query-url/
nutch/trunk/src/plugin/response-json/
nutch/trunk/src/plugin/response-xml/
nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/searcher/
nutch/trunk/src/plugin/summary-basic/
nutch/trunk/src/plugin/summary-lucene/
nutch/trunk/src/test/org/apache/nutch/analysis/
nutch/trunk/src/test/org/apache/nutch/clustering/
nutch/trunk/src/test/org/apache/nutch/db/
nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java
nutch/trunk/src/test/org/apache/nutch/indexer/TestIndexSorter.java
nutch/trunk/src/test/org/apache/nutch/ontology/
nutch/trunk/src/test/org/apache/nutch/searcher/
nutch/trunk/src/web/
nutch/trunk/src/xmlcatalog/
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/build.xml
nutch/trunk/conf/log4j.properties
nutch/trunk/conf/nutch-default.xml
nutch/trunk/default.properties
nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java
nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java
nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java
nutch/trunk/src/java/org/apache/nutch/indexer/NutchDocument.java
nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java
nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrWriter.java
nutch/trunk/src/plugin/build.xml
nutch/trunk/src/plugin/creativecommons/plugin.xml
nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java
nutch/trunk/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
nutch/trunk/src/plugin/languageidentifier/plugin.xml
nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
nutch/trunk/src/plugin/microformats-reltag/plugin.xml
nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java
nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml
nutch/trunk/src/plugin/subcollection/plugin.xml
nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java
nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=960064&r1=960063&r2=960064&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Jul 2 17:19:43 2010
@@ -2,6 +2,8 @@ Nutch Change Log
Release 2.0 - Current Development
+* NUTCH-837 Remove search servers and Lucene dependencies (ab)
+
* NUTCH-836 Remove deprecated parse plugins (jnioche)
* NUTCH-835 Document deduplication failed using MD5Signature (Sebastian Nagel via ab)
Modified: nutch/trunk/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=960064&r1=960063&r2=960064&view=diff
==============================================================================
--- nutch/trunk/build.xml (original)
+++ nutch/trunk/build.xml Fri Jul 2 17:19:43 2010
@@ -43,12 +43,6 @@
<pathelement location="${build.dir}/${final.name}.job" />
</path>
- <!-- xmlcatalog definition for xslt task -->
- <xmlcatalog id="docDTDs">
- <dtd publicId="-//W3C//DTD XHTML 1.0 Transitional//EN"
- location="${xmlcatalog.dir}/xhtml1-transitional.dtd"/>
- </xmlcatalog>
-
<!-- ====================================================== -->
<!-- Stuff needed by all targets -->
<!-- ====================================================== -->
@@ -113,18 +107,6 @@
<ant dir="src/plugin" target="deploy" inheritAll="false"/>
</target>
- <target name="generate-src" depends="init">
- <javacc target="${src.dir}/org/apache/nutch/analysis/NutchAnalysis.jj"
- javacchome="${javacc.home}">
- </javacc>
-
- <fixcrlf srcdir="${src.dir}" eol="lf" includes="**/*.java"/>
-
- </target>
-
- <target name="dynamic" depends="generate-src, compile">
- </target>
-
<!-- ================================================================== -->
<!-- Make nutch.jar -->
<!-- ================================================================== -->
@@ -163,57 +145,6 @@
</target>
<!-- ================================================================== -->
- <!-- Make nutch.war -->
- <!-- ================================================================== -->
- <!-- -->
- <!-- ================================================================== -->
- <target name="war" depends="jar,compile,generate-docs">
-
- <!-- generate the nutch.xml (servlet context) file -->
- <xslt in="${basedir}/conf/nutch-default.xml"
- out="${build.dir}/nutch.xml"
- style="${basedir}/conf/context.xsl">
- <xmlcatalog refid="docDTDs"/>
- <outputproperty name="indent" value="yes"/>
- </xslt>
- <war destfile="${build.dir}/${final.name}.war"
- webxml="${web.src.dir}/web.xml">
- <fileset dir="${web.src.dir}/jsp"/>
- <zipfileset dir="${docs.src}" includes="include/*.html"/>
- <zipfileset dir="${build.docs}" includes="*/include/*.html"/>
- <fileset dir="${docs.dir}"/>
- <lib dir="${lib.dir}">
- <include name="lucene*.jar"/>
- <include name="taglibs-*.jar"/>
- <include name="hadoop-*.jar"/>
- <include name="dom4j-*.jar"/>
- <include name="xerces-*.jar"/>
- <include name="tika-*.jar"/>
- <include name="apache-solr-*.jar"/>
- <include name="commons-httpclient-*.jar"/>
- <include name="commons-codec-*.jar"/>
- <include name="commons-collections-*.jar"/>
- <include name="commons-beanutils-*.jar"/>
- <include name="commons-cli-*.jar"/>
- <include name="commons-lang-*.jar"/>
- <include name="commons-logging-*.jar"/>
- <include name="log4j-*.jar"/>
- </lib>
- <lib dir="${build.dir}">
- <include name="${final.name}.jar"/>
- </lib>
- <classes dir="${conf.dir}" excludes="**/*.template"/>
- <classes dir="${web.src.dir}/locale"/>
- <classes file="${web.src.dir}/log4j.properties"/>
- <zipfileset prefix="WEB-INF/classes/plugins" dir="${build.plugins}"/>
- <webinf dir="${lib.dir}">
- <include name="taglibs-*.tld"/>
- </webinf>
- </war>
- </target>
-
-
- <!-- ================================================================== -->
<!-- Compile test code -->
<!-- ================================================================== -->
<target name="compile-core-test" depends="compile-core">
@@ -254,9 +185,6 @@
<fileset dir="${basedir}/src">
<include name="java/**/*.java"/>
<include name="plugin/**/*.java"/>
- <!-- Exclude generated sources -->
- <exclude name="**/NutchAnalysis.java" />
- <exclude name="**/NutchAnalysisTokenManager.java" />
</fileset>
</pmd>
<condition property="pmd.stop" value="true">
@@ -337,44 +265,25 @@
<packageset dir="${src.dir}"/>
<packageset dir="${plugins.dir}/lib-http/src/java"/>
- <packageset dir="${plugins.dir}/lib-parsems/src/java"/>
<packageset dir="${plugins.dir}/lib-regex-filter/src/java"/>
<packageset dir="${plugins.dir}/microformats-reltag/src/java"/>
- <packageset dir="${plugins.dir}/ontology/src/java"/>
<packageset dir="${plugins.dir}/protocol-file/src/java"/>
<packageset dir="${plugins.dir}/protocol-ftp/src/java"/>
<packageset dir="${plugins.dir}/protocol-http/src/java"/>
<packageset dir="${plugins.dir}/protocol-httpclient/src/java"/>
<packageset dir="${plugins.dir}/parse-ext/src/java"/>
- <packageset dir="${plugins.dir}/parse-html/src/java"/>
<packageset dir="${plugins.dir}/parse-js/src/java"/>
- <packageset dir="${plugins.dir}/parse-text/src/java"/>
- <packageset dir="${plugins.dir}/parse-pdf/src/java"/>
-<!-- <packageset dir="${plugins.dir}/parse-rtf/src/java"/> plugin excluded from build due to licensing issues-->
-<!-- <packageset dir="${plugins.dir}/parse-mp3/src/java"/> plugin excluded from build due to licensing issues-->
- <packageset dir="${plugins.dir}/parse-msexcel/src/java"/>
- <packageset dir="${plugins.dir}/parse-mspowerpoint/src/java"/>
- <packageset dir="${plugins.dir}/parse-msword/src/java"/>
- <packageset dir="${plugins.dir}/parse-oo/src/java"/>
<packageset dir="${plugins.dir}/parse-rss/src/java"/>
<packageset dir="${plugins.dir}/parse-swf/src/java"/>
<packageset dir="${plugins.dir}/parse-zip/src/java"/>
<packageset dir="${plugins.dir}/index-basic/src/java"/>
<packageset dir="${plugins.dir}/index-more/src/java"/>
- <packageset dir="${plugins.dir}/query-basic/src/java"/>
- <packageset dir="${plugins.dir}/query-more/src/java"/>
- <packageset dir="${plugins.dir}/query-site/src/java"/>
- <packageset dir="${plugins.dir}/query-url/src/java"/>
<packageset dir="${plugins.dir}/scoring-opic/src/java"/>
- <packageset dir="${plugins.dir}/summary-basic/src/java"/>
- <packageset dir="${plugins.dir}/summary-lucene/src/java"/>
<packageset dir="${plugins.dir}/urlfilter-automaton/src/java"/>
<packageset dir="${plugins.dir}/urlfilter-regex/src/java"/>
<packageset dir="${plugins.dir}/urlfilter-prefix/src/java"/>
<packageset dir="${plugins.dir}/creativecommons/src/java"/>
<packageset dir="${plugins.dir}/languageidentifier/src/java"/>
- <packageset dir="${plugins.dir}/clustering-carrot2/src/java"/>
- <packageset dir="${plugins.dir}/ontology/src/java"/>
<link href="${javadoc.link.java}"/>
<link href="${javadoc.link.lucene}"/>
@@ -393,12 +302,7 @@
<group title="URL Filter Plugins" packages="${plugins.urlfilter}"/>
<group title="Scoring Plugins" packages="${plugins.scoring}"/>
<group title="Parse Plugins" packages="${plugins.parse}"/>
- <group title="Analysis Plugins" packages="${plugins.analysis}"/>
<group title="Indexing Filter Plugins" packages="${plugins.index}"/>
- <group title="Query Filter Plugins" packages="${plugins.query}"/>
- <group title="Summary Plugins" packages="${plugins.summary}"/>
- <group title="Clustering Plugins" packages="${plugins.clustering}"/>
- <group title="Ontology Plugins" packages="${plugins.ontology}"/>
<group title="Misc. Plugins" packages="${plugins.misc}"/>
</javadoc>
<!-- Copy the plugin.dtd file to the plugin doc-files dir -->
@@ -411,129 +315,12 @@
includes="nutch-default.xml" style="conf/nutch-conf.xsl"/>
</target>
- <target name="generate-locale" if="doc.locale">
- <echo message="Generating docs for locale=${doc.locale}"/>
-
- <mkdir dir="${build.docs}/${doc.locale}/include"/>
- <xslt in="${docs.src}/include/${doc.locale}/header.xml"
- out="${build.docs}/${doc.locale}/include/header.html"
- style="${docs.src}/style/nutch-header.xsl">
- <xmlcatalog refid="docDTDs"/>
- </xslt>
-
- <dependset>
- <srcfileset dir="${docs.src}/include/${doc.locale}" includes="*.xml"/>
- <srcfileset dir="${docs.src}/style" includes="*.xsl"/>
- <targetfileset dir="${docs.dir}/${doc.locale}" includes="*.html"/>
- </dependset>
-
- <copy file="${docs.src}/style/nutch-page.xsl"
- todir="${build.docs}/${doc.locale}"
- preservelastmodified="true"/>
-
- <xslt basedir="${docs.src}/pages/${doc.locale}"
- destdir="${docs.dir}/${doc.locale}"
- includes="*.xml"
- style="${build.docs}/${doc.locale}/nutch-page.xsl">
- <xmlcatalog refid="docDTDs"/>
- </xslt>
- </target>
-
-
- <target name="generate-docs" depends="init">
- <dependset>
- <srcfileset dir="${docs.src}/include" includes="*.html"/>
- <targetfileset dir="${docs.dir}" includes="**/*.html"/>
- </dependset>
-
- <mkdir dir="${build.docs}/include"/>
- <copy todir="${build.docs}/include">
- <fileset dir="${docs.src}/include"/>
- </copy>
-
- <antcall target="generate-locale">
- <param name="doc.locale" value="ca"/>
- </antcall>
-
- <antcall target="generate-locale">
- <param name="doc.locale" value="de"/>
- </antcall>
-
- <antcall target="generate-locale">
- <param name="doc.locale" value="en"/>
- </antcall>
-
- <antcall target="generate-locale">
- <param name="doc.locale" value="es"/>
- </antcall>
-
- <antcall target="generate-locale">
- <param name="doc.locale" value="fi"/>
- </antcall>
-
- <antcall target="generate-locale">
- <param name="doc.locale" value="fr"/>
- </antcall>
-
- <antcall target="generate-locale">
- <param name="doc.locale" value="hu"/>
- </antcall>
-
- <antcall target="generate-locale">
- <param name="doc.locale" value="it"/>
- </antcall>
-
- <antcall target="generate-locale">
- <param name="doc.locale" value="jp"/>
- </antcall>
-
- <antcall target="generate-locale">
- <param name="doc.locale" value="ms"/>
- </antcall>
-
- <antcall target="generate-locale">
- <param name="doc.locale" value="nl"/>
- </antcall>
-
- <antcall target="generate-locale">
- <param name="doc.locale" value="pl"/>
- </antcall>
-
- <antcall target="generate-locale">
- <param name="doc.locale" value="pt"/>
- </antcall>
-
- <antcall target="generate-locale">
- <param name="doc.locale" value="sh"/>
- </antcall>
-
- <antcall target="generate-locale">
- <param name="doc.locale" value="sr"/>
- </antcall>
-
- <antcall target="generate-locale">
- <param name="doc.locale" value="sv"/>
- </antcall>
-
- <antcall target="generate-locale">
- <param name="doc.locale" value="th"/>
- </antcall>
-
- <antcall target="generate-locale">
- <param name="doc.locale" value="zh"/>
- </antcall>
-
- <fixcrlf srcdir="${docs.dir}" eol="lf" encoding="utf-8"
- includes="**/*.html"/>
-
- </target>
-
<!-- ================================================================== -->
<!-- D I S T R I B U T I O N -->
<!-- ================================================================== -->
<!-- -->
<!-- ================================================================== -->
- <target name="package" depends="jar, job, war, javadoc">
+ <target name="package" depends="jar, job, javadoc">
<mkdir dir="${dist.dir}"/>
<mkdir dir="${dist.dir}/lib"/>
<mkdir dir="${dist.dir}/bin"/>
@@ -549,13 +336,8 @@
<fileset dir="${build.plugins}"/>
</copy>
- <copy todir="${dist.dir}/webapps">
- <fileset dir="${build.webapps}"/>
- </copy>
-
<copy file="${build.dir}/${final.name}.jar" todir="${dist.dir}"/>
<copy file="${build.dir}/${final.name}.job" todir="${dist.dir}"/>
- <copy file="${build.dir}/${final.name}.war" todir="${dist.dir}"/>
<copy todir="${dist.dir}/bin">
<fileset dir="bin"/>
Modified: nutch/trunk/conf/log4j.properties
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/log4j.properties?rev=960064&r1=960063&r2=960064&view=diff
==============================================================================
--- nutch/trunk/conf/log4j.properties (original)
+++ nutch/trunk/conf/log4j.properties Fri Jul 2 17:19:43 2010
@@ -22,9 +22,6 @@ log4j.logger.org.apache.nutch.segment.Se
log4j.logger.org.apache.nutch.crawl.CrawlDb=INFO,cmdstdout
log4j.logger.org.apache.nutch.crawl.LinkDb=INFO,cmdstdout
log4j.logger.org.apache.nutch.crawl.LinkDbMerger=INFO,cmdstdout
-log4j.logger.org.apache.nutch.indexer.Indexer=INFO,cmdstdout
-log4j.logger.org.apache.nutch.indexer.DeleteDuplicates=INFO,cmdstdout
-log4j.logger.org.apache.nutch.indexer.IndexMerger=INFO,cmdstdout
log4j.logger.org.apache.nutch=INFO
log4j.logger.org.apache.hadoop=WARN
Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=960064&r1=960063&r2=960064&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Fri Jul 2 17:19:43 2010
@@ -769,129 +769,6 @@
</description>
</property>
-
-<!-- analysis properties -->
-
-<property>
- <name>analysis.common.terms.file</name>
- <value>common-terms.utf8</value>
- <description>The name of a file containing a list of common terms
- that should be indexed in n-grams.</description>
-</property>
-
-<!-- searcher properties -->
-
-<property>
- <name>searcher.dir</name>
- <value>crawl</value>
- <description>
- Path to root of crawl. This directory is searched (in
- order) for either the file search-servers.txt, containing a list of
- distributed search servers, or the directory "index" containing
- merged indexes, or the directory "segments" containing segment
- indexes.
- </description>
-</property>
-
-<property>
- <name>searcher.filter.cache.size</name>
- <value>16</value>
- <description>
- Maximum number of filters to cache. Filters can accelerate certain
- field-based queries, like language, document format, etc. Each
- filter requires one bit of RAM per page. So, with a 10 million page
- index, a cache size of 16 consumes two bytes per page, or 20MB.
- </description>
-</property>
-
-<property>
- <name>searcher.filter.cache.threshold</name>
- <value>0.05</value>
- <description>
- Filters are cached when their term is matched by more than this
- fraction of pages. For example, with a threshold of 0.05, and 10
- million pages, the term must match more than 1/20, or 50,000 pages.
- So, if out of 10 million pages, 50% of pages are in English, and 2%
- are in Finnish, then, with a threshold of 0.05, searches for
- "lang:en" will use a cached filter, while searches for "lang:fi"
- will score all 20,000 finnish documents.
- </description>
-</property>
-
-<property>
- <name>searcher.hostgrouping.rawhits.factor</name>
- <value>2.0</value>
- <description>
- A factor that is used to determine the number of raw hits
- initially fetched, before host grouping is done.
- </description>
-</property>
-
-<property>
- <name>searcher.summary.context</name>
- <value>5</value>
- <description>
- The number of context terms to display preceding and following
- matching terms in a hit summary.
- </description>
-</property>
-
-<property>
- <name>searcher.summary.length</name>
- <value>20</value>
- <description>
- The total number of terms to display in a hit summary.
- </description>
-</property>
-
-<property>
- <name>searcher.max.hits</name>
- <value>-1</value>
- <description>If positive, search stops after this many hits are
- found. Setting this to small, positive values (e.g., 1000) can make
- searches much faster. With a sorted index, the quality of the hits
- suffers little.</description>
-</property>
-
-<property>
- <name>searcher.max.time.tick_count</name>
- <value>-1</value>
- <description>If positive value is defined here, limit search time for
- every request to this number of elapsed ticks (see the tick_length
- property below). The total maximum time for any search request will be
- then limited to tick_count * tick_length milliseconds. When search time
- is exceeded, partial results will be returned, and the total number of
- hits will be estimated.
- </description>
-</property>
-
-<property>
- <name>searcher.max.time.tick_length</name>
- <value>200</value>
- <description>The number of milliseconds between ticks. Larger values
- reduce the timer granularity (precision). Smaller values bring more
- overhead.
- </description>
-</property>
-
-<property>
- <name>searcher.num.handlers</name>
- <value>10</value>
- <description>The number of handlers for the distributed search server.
- </description>
-</property>
-
-<property>
- <name>searcher.max.hits.per.page</name>
- <value>1000</value>
- <description> The maximum number of hits to show per page. -1 if
- unlimited. If the number of hits requested by the user (via
- hitsPerPage parameter in the query string) is more than the value
- specified in this property, then this value is assumed as the number
- of hits per page.
- </description>
-</property>
-
<!-- URL normalizer properties -->
<property>
@@ -956,7 +833,7 @@
<property>
<name>plugin.includes</name>
- <value>protocol-http|urlfilter-regex|parse-tika|index-(basic|anchor)|query-(basic|site|url)|response-(json|xml)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
+ <value>protocol-http|urlfilter-regex|parse-tika|index-(basic|anchor)|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
<description>Regular expression naming plugin directory names to
include. Any plugin not matching this expression is excluded.
In any case you need at least include the nutch-extensionpoints plugin. By
@@ -1115,137 +992,6 @@
</description>
</property>
-<!-- clustering extension properties -->
-
-<property>
- <name>extension.clustering.hits-to-cluster</name>
- <value>100</value>
- <description>Number of snippets retrieved for the clustering extension
- if clustering extension is available and user requested results
- to be clustered.</description>
-</property>
-
-<property>
- <name>extension.clustering.extension-name</name>
- <value></value>
- <description>Use the specified online clustering extension. If empty,
- the first available extension will be used. The "name" here refers to an 'id'
- attribute of the 'implementation' element in the plugin descriptor XML
- file.</description>
-</property>
-
-<!-- ontology extension properties -->
-
-<property>
- <name>extension.ontology.extension-name</name>
- <value></value>
- <description>Use the specified online ontology extension. If empty,
- the first available extension will be used. The "name" here refers to an 'id'
- attribute of the 'implementation' element in the plugin descriptor XML
- file.</description>
-</property>
-
-<property>
- <name>extension.ontology.urls</name>
- <value>
- </value>
- <description>Urls of owl files, separated by spaces, such as
- http://www.example.com/ontology/time.owl
- http://www.example.com/ontology/space.owl
- http://www.example.com/ontology/wine.owl
- Or
- file:/ontology/time.owl
- file:/ontology/space.owl
- file:/ontology/wine.owl
- You have to make sure each url is valid.
- By default, there is no owl file, so query refinement based on ontology
- is silently ignored.
- </description>
-</property>
-
-<!-- query-basic plugin properties -->
-
-<property>
- <name>query.url.boost</name>
- <value>4.0</value>
- <description> Used as a boost for url field in Lucene query.
- </description>
-</property>
-
-<property>
- <name>query.anchor.boost</name>
- <value>2.0</value>
- <description> Used as a boost for anchor field in Lucene query.
- </description>
-</property>
-
-<property>
- <name>query.title.boost</name>
- <value>1.5</value>
- <description> Used as a boost for title field in Lucene query.
- </description>
-</property>
-
-<property>
- <name>query.host.boost</name>
- <value>2.0</value>
- <description> Used as a boost for host field in Lucene query.
- </description>
-</property>
-
-<property>
- <name>query.phrase.boost</name>
- <value>1.0</value>
- <description> Used as a boost for phrase in Lucene query.
- Multiplied by boost for field phrase is matched in.
- </description>
-</property>
-
-<!--
-<property>
- <name>query.basic.description.boost</name>
- <value>1.0</value>
- <description> Declares a custom field and its boost to be added to the default fields of the Lucene query.
- </description>
-</property>
--->
-
-<!-- creative-commons plugin properties -->
-
-<property>
- <name>query.cc.boost</name>
- <value>0.0</value>
- <description> Used as a boost for cc field in Lucene query.
- </description>
-</property>
-
-<!-- query-more plugin properties -->
-
-<property>
- <name>query.type.boost</name>
- <value>0.0</value>
- <description> Used as a boost for type field in Lucene query.
- </description>
-</property>
-
-<!-- query-site plugin properties -->
-
-<property>
- <name>query.site.boost</name>
- <value>0.0</value>
- <description> Used as a boost for site field in Lucene query.
- </description>
-</property>
-
-<!-- microformats-reltag plugin properties -->
-
-<property>
- <name>query.tag.boost</name>
- <value>1.0</value>
- <description> Used as a boost for tag field in Lucene query.
- </description>
-</property>
-
<!-- language-identifier plugin properties -->
<property>
@@ -1280,13 +1026,6 @@
</description>
</property>
-<property>
- <name>query.lang.boost</name>
- <value>0.0</value>
- <description> Used as a boost for lang field in Lucene query.
- </description>
-</property>
-
<!-- Temporary Hadoop 0.17.x workaround. -->
<property>
@@ -1300,65 +1039,6 @@
</description>
</property>
-<!-- response writer properties -->
-
-<property>
- <name>search.response.default.type</name>
- <value>xml</value>
- <description>
- The default response type returned if none is specified.
- </description>
-</property>
-
-<property>
- <name>search.response.default.lang</name>
- <value>en</value>
- <description>
- The default response language if none is specified.
- </description>
-</property>
-
-<property>
- <name>search.response.default.numrows</name>
- <value>10</value>
- <description>
- The default number of rows to return if none is specified.
- </description>
-</property>
-
-<property>
- <name>search.response.default.dedupfield</name>
- <value>site</value>
- <description>
- The default dedup field if none is specified.
- </description>
-</property>
-
-<property>
- <name>search.response.default.numdupes</name>
- <value>1</value>
- <description>
- The default number of duplicates returned if none is specified.
- </description>
-</property>
-
-<property>
- <name>searcher.response.maxage</name>
- <value>86400</value>
- <description>
- The maxage of a response in seconds. Used in caching headers.
- </description>
-</property>
-
-<property>
- <name>searcher.response.prettyprint</name>
- <value>true</value>
- <description>
- Should the response output be pretty printed. Setting to true enables better
- debugging, false removes unneeded spaces and gives better throughput.
- </description>
-</property>
-
<!-- solr index properties -->
<property>
<name>solrindex.mapping.file</name>
Modified: nutch/trunk/default.properties
URL: http://svn.apache.org/viewvc/nutch/trunk/default.properties?rev=960064&r1=960063&r2=960064&view=diff
==============================================================================
--- nutch/trunk/default.properties (original)
+++ nutch/trunk/default.properties Fri Jul 2 17:19:43 2010
@@ -9,16 +9,11 @@ src.dir = ./src/java
lib.dir = ./lib
conf.dir = ./conf
plugins.dir = ./src/plugin
-docs.dir = ./docs
-docs.src = ${basedir}/src/web
-xmlcatalog.dir = ${basedir}/src/xmlcatalog
build.dir = ./build
build.classes = ${build.dir}/classes
-build.webapps = ${build.dir}/webapps
build.plugins = ${build.dir}/plugins
-build.docs = ${build.dir}/docs
-build.javadoc = ${build.docs}/api
+build.javadoc = ${build.dir}/docs/api
build.encoding = UTF-8
test.src.dir = ./src/test
@@ -29,9 +24,6 @@ test.build.javadoc = ${test.build.dir}/d
javacc.home=/usr/java/javacc
-web.src.dir = ./src/web
-src.webapps = ./src/webapps
-
# Proxy Host and Port to use for building JavaDoc
javadoc.proxy.host=-J-DproxyHost=
javadoc.proxy.port=-J-DproxyPort=
@@ -45,7 +37,7 @@ dist.dir=${build.dir}/${final.name}
javac.debug=on
javac.optimize=on
javac.deprecation=off
-javac.version= 1.5
+javac.version= 1.6
#
# Plugins API
@@ -89,13 +81,6 @@ plugins.parse=\
org.apache.nutch.parse.zip
#
-# Analysis Plugins
-#
-plugins.analysis=\
-# ${plugin.analysis-de}:\
-# ${plugin.analysis-fr}
-
-#
# Indexing Filter Plugins
#
plugins.index=\
@@ -112,25 +97,6 @@ plugins.query=\
org.apache.nutch.searcher.url*
#
-# Ontology Plugins
-#
-plugins.ontology=\
- org.apache.nutch.ontology.jena*
-
-#
-# Online Clusterer Plugins
-#
-plugins.clustering=\
- org.apache.nutch.clustering.carrot2*
-
-#
-# Summary Plugins
-#
-plugins.summary=\
- org.apache.nutch.summary.basic*:\
- org.apache.nutch.summary.lucene*
-
-#
# Misc. Plugins
#
# (gathers plugins that cannot be dispatched
@@ -138,6 +104,5 @@ plugins.summary=\
# many extension points)
#
plugins.misc=\
- org.apache.nutch.analysis.lang*:\
org.apache.nutch.microformats.reltag*:\
org.creativecommons.nutch*
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java?rev=960064&r1=960063&r2=960064&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Fri Jul 2 17:19:43 2010
@@ -29,9 +29,7 @@ import org.apache.hadoop.fs.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapred.*;
import org.apache.nutch.parse.ParseSegment;
-import org.apache.nutch.indexer.DeleteDuplicates;
-import org.apache.nutch.indexer.IndexMerger;
-import org.apache.nutch.indexer.Indexer;
+import org.apache.nutch.indexer.solr.SolrDeleteDuplicates;
import org.apache.nutch.indexer.solr.SolrIndexer;
import org.apache.nutch.util.HadoopFSUtil;
import org.apache.nutch.util.NutchConfiguration;
@@ -52,8 +50,7 @@ public class Crawl {
public static void main(String args[]) throws Exception {
if (args.length < 1) {
System.out.println
- ("Usage: Crawl <urlDir> [-dir d] [-threads n] [-depth i] [-topN N]" +
- " [-solr solrURL]");
+ ("Usage: Crawl <urlDir> -solr <solrURL> [-dir d] [-threads n] [-depth i] [-topN N]");
return;
}
@@ -65,7 +62,6 @@ public class Crawl {
int threads = job.getInt("fetcher.threads.fetch", 10);
int depth = 5;
long topN = Long.MAX_VALUE;
- String indexerName = "lucene";
String solrUrl = null;
for (int i = 0; i < args.length; i++) {
@@ -82,15 +78,17 @@ public class Crawl {
topN = Integer.parseInt(args[i+1]);
i++;
} else if ("-solr".equals(args[i])) {
- indexerName = "solr";
solrUrl = StringUtils.lowerCase(args[i + 1]);
i++;
} else if (args[i] != null) {
rootUrlDir = new Path(args[i]);
}
}
+
+ if (solrUrl == null) {
+ LOG.warn("solrUrl is not set, indexing will be skipped...");
+ }
- boolean isSolrIndex = StringUtils.equalsIgnoreCase(indexerName, "solr");
FileSystem fs = FileSystem.get(job);
if (LOG.isInfoEnabled()) {
@@ -98,10 +96,7 @@ public class Crawl {
LOG.info("rootUrlDir = " + rootUrlDir);
LOG.info("threads = " + threads);
LOG.info("depth = " + depth);
- LOG.info("indexer=" + indexerName);
- if (isSolrIndex) {
- LOG.info("solrUrl=" + solrUrl);
- }
+ LOG.info("solrUrl=" + solrUrl);
if (topN != Long.MAX_VALUE)
LOG.info("topN = " + topN);
}
@@ -139,41 +134,16 @@ public class Crawl {
if (i > 0) {
linkDbTool.invert(linkDb, segments, true, true, false); // invert links
- // index, dedup & merge
- FileStatus[] fstats = fs.listStatus(segments, HadoopFSUtil.getPassDirectoriesFilter(fs));
- if (isSolrIndex) {
+ if (solrUrl != null) {
+ // index, dedup & merge
+ FileStatus[] fstats = fs.listStatus(segments, HadoopFSUtil.getPassDirectoriesFilter(fs));
SolrIndexer indexer = new SolrIndexer(conf);
indexer.indexSolr(solrUrl, crawlDb, linkDb,
- Arrays.asList(HadoopFSUtil.getPaths(fstats)));
+ Arrays.asList(HadoopFSUtil.getPaths(fstats)));
+ SolrDeleteDuplicates dedup = new SolrDeleteDuplicates();
+ dedup.setConf(conf);
+ dedup.dedup(solrUrl);
}
- else {
-
- DeleteDuplicates dedup = new DeleteDuplicates(conf);
- if(indexes != null) {
- // Delete old indexes
- if (fs.exists(indexes)) {
- LOG.info("Deleting old indexes: " + indexes);
- fs.delete(indexes, true);
- }
-
- // Delete old index
- if (fs.exists(index)) {
- LOG.info("Deleting old merged index: " + index);
- fs.delete(index, true);
- }
- }
-
- Indexer indexer = new Indexer(conf);
- indexer.index(indexes, crawlDb, linkDb,
- Arrays.asList(HadoopFSUtil.getPaths(fstats)));
-
- IndexMerger merger = new IndexMerger(conf);
- if(indexes != null) {
- dedup.dedup(new Path[] { indexes });
- fstats = fs.listStatus(indexes, HadoopFSUtil.getPassDirectoriesFilter(fs));
- merger.merge(HadoopFSUtil.getPaths(fstats), index, tmpDir);
- }
- }
} else {
LOG.warn("No URLs to fetch - check your seed list and URL filters.");
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java?rev=960064&r1=960063&r2=960064&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java Fri Jul 2 17:19:43 2010
@@ -45,9 +45,6 @@ public class NutchWritable extends Gener
org.apache.nutch.parse.ParseStatus.class,
org.apache.nutch.protocol.Content.class,
org.apache.nutch.protocol.ProtocolStatus.class,
- org.apache.nutch.searcher.Hit.class,
- org.apache.nutch.searcher.HitDetails.class,
- org.apache.nutch.searcher.Hits.class
};
}
Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java?rev=960064&r1=960063&r2=960064&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Fri Jul 2 17:19:43 2010
@@ -152,7 +152,7 @@ implements Mapper<Text, Writable, Text,
return;
}
// apply boost to all indexed fields.
- doc.setScore(boost);
+ doc.setWeight(boost);
// store boost for use by explain and dedup
doc.add("boost", Float.toString(boost));
Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java?rev=960064&r1=960063&r2=960064&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java Fri Jul 2 17:19:43 2010
@@ -52,13 +52,4 @@ public interface IndexingFilter extends
*/
NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
throws IndexingException;
-
- /** Adds index-level configuraition options.
- * Implementations can update given configuration to pass document-independent
- * information to indexing backends. As a rule of thumb, prefix meta keys
- * with the name of the backend intended. For example, when
- * passing information to lucene backend, prefix keys with "lucene.".
- * @param conf Configuration instance.
- * */
- public void addIndexBackendOptions(Configuration conf);
}
Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java?rev=960064&r1=960063&r2=960064&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java Fri Jul 2 17:19:43 2010
@@ -70,7 +70,6 @@ public class IndexingFilters {
.getExtensionInstance();
LOG.info("Adding " + filter.getClass().getName());
if (!filterMap.containsKey(filter.getClass().getName())) {
- filter.addIndexBackendOptions(conf);
filterMap.put(filter.getClass().getName(), filter);
}
}
@@ -89,7 +88,6 @@ public class IndexingFilters {
IndexingFilter filter = filterMap
.get(orderedFilters[i]);
if (filter != null) {
- filter.addIndexBackendOptions(conf);
filters.add(filter);
}
}
Modified: nutch/trunk/src/java/org/apache/nutch/indexer/NutchDocument.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/NutchDocument.java?rev=960064&r1=960063&r2=960064&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/NutchDocument.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/NutchDocument.java Fri Jul 2 17:19:43 2010
@@ -19,11 +19,9 @@ package org.apache.nutch.indexer;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
-import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
-import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
@@ -35,51 +33,48 @@ import org.apache.nutch.metadata.Metadat
/** A {@link NutchDocument} is the unit of indexing.*/
public class NutchDocument
-implements Writable, Iterable<Entry<String, List<String>>> {
+implements Writable, Iterable<Entry<String, NutchField>> {
- public static final byte VERSION = 1;
-
- private Map<String, List<String>> fields;
+ public static final byte VERSION = 2;
+
+ private Map<String, NutchField> fields;
private Metadata documentMeta;
- private float score;
+ private float weight;
public NutchDocument() {
- fields = new HashMap<String, List<String>>();
+ fields = new HashMap<String, NutchField>();
documentMeta = new Metadata();
- score = 0.0f;
+ weight = 1.0f;
}
- public void add(String name, String value) {
- List<String> fieldValues = fields.get(name);
- if (fieldValues == null) {
- fieldValues = new ArrayList<String>();
+ public void add(String name, Object value) {
+ NutchField field = fields.get(name);
+ if (field == null) {
+ field = new NutchField(value);
+ fields.put(name, field);
+ } else {
+ field.add(value);
}
- fieldValues.add(value);
- fields.put(name, fieldValues);
- }
-
- private void addFieldUnprotected(String name, String value) {
- fields.get(name).add(value);
}
- public String getFieldValue(String name) {
- List<String> fieldValues = fields.get(name);
- if (fieldValues == null) {
+ public Object getFieldValue(String name) {
+ NutchField field = fields.get(name);
+ if (field == null) {
return null;
}
- if (fieldValues.size() == 0) {
+ if (field.getValues().size() == 0) {
return null;
}
- return fieldValues.get(0);
+ return field.getValues().get(0);
}
- public List<String> getFieldValues(String name) {
+ public NutchField getField(String name) {
return fields.get(name);
}
- public List<String> removeField(String name) {
+ public NutchField removeField(String name) {
return fields.remove(name);
}
@@ -88,16 +83,16 @@ implements Writable, Iterable<Entry<Stri
}
/** Iterate over all fields. */
- public Iterator<Entry<String, List<String>>> iterator() {
+ public Iterator<Entry<String, NutchField>> iterator() {
return fields.entrySet().iterator();
}
- public float getScore() {
- return score;
+ public float getWeight() {
+ return weight;
}
- public void setScore(float score) {
- this.score = score;
+ public void setWeight(float weight) {
+ this.weight = weight;
}
public Metadata getDocumentMeta() {
@@ -105,6 +100,7 @@ implements Writable, Iterable<Entry<Stri
}
public void readFields(DataInput in) throws IOException {
+ fields.clear();
byte version = in.readByte();
if (version != VERSION) {
throw new VersionMismatchException(VERSION, version);
@@ -112,30 +108,23 @@ implements Writable, Iterable<Entry<Stri
int size = WritableUtils.readVInt(in);
for (int i = 0; i < size; i++) {
String name = Text.readString(in);
- int numValues = WritableUtils.readVInt(in);
- fields.put(name, new ArrayList<String>());
- for (int j = 0; j < numValues; j++) {
- String value = Text.readString(in);
- addFieldUnprotected(name, value);
- }
+ NutchField field = new NutchField();
+ field.readFields(in);
+ fields.put(name, field);
}
- score = in.readFloat();
+ weight = in.readFloat();
documentMeta.readFields(in);
}
public void write(DataOutput out) throws IOException {
out.writeByte(VERSION);
WritableUtils.writeVInt(out, fields.size());
- for (Map.Entry<String, List<String>> entry : fields.entrySet()) {
+ for (Map.Entry<String, NutchField> entry : fields.entrySet()) {
Text.writeString(out, entry.getKey());
- List<String> values = entry.getValue();
- WritableUtils.writeVInt(out, values.size());
- for (String value : values) {
- Text.writeString(out, value);
- }
+ NutchField field = entry.getValue();
+ field.write(out);
}
- out.writeFloat(score);
+ out.writeFloat(weight);
documentMeta.write(out);
}
-
}
Added: nutch/trunk/src/java/org/apache/nutch/indexer/NutchField.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/NutchField.java?rev=960064&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/NutchField.java (added)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/NutchField.java Fri Jul 2 17:19:43 2010
@@ -0,0 +1,64 @@
+package org.apache.nutch.indexer;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+
+import org.apache.hadoop.io.Writable;
+
+/**
+ * This class represents a multi-valued field with a weight. Values are arbitrary
+ * objects.
+ */
+public class NutchField implements Writable {
+ private float weight;
+ private List<Object> values = new ArrayList<Object>();
+
+ public NutchField() {
+
+ }
+
+ public NutchField(Object value) {
+ this(value, 1.0f);
+ }
+
+ public NutchField(Object value, float weight) {
+ this.weight = weight;
+ if (value instanceof Collection) {
+ values.addAll((Collection<Object>)value);
+ } else {
+ values.add(value);
+ }
+ }
+
+ public void add(Object value) {
+ values.add(value);
+ }
+
+ public float getWeight() {
+ return weight;
+ }
+
+ public void setWeight(float weight) {
+ this.weight = weight;
+ }
+
+ public List<Object> getValues() {
+ return values;
+ }
+
+ public void reset() {
+ weight = 1.0f;
+ values.clear();
+ }
+
+ public void readFields(DataInput in) throws IOException {
+ }
+
+ public void write(DataOutput out) throws IOException {
+ }
+
+}
Propchange: nutch/trunk/src/java/org/apache/nutch/indexer/NutchField.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java?rev=960064&r1=960063&r2=960064&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java Fri Jul 2 17:19:43 2010
@@ -39,7 +39,6 @@ import org.apache.hadoop.mapred.lib.Iden
import org.apache.hadoop.mapred.lib.NullOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
-import org.apache.nutch.indexer.DeleteDuplicates;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.solr.client.solrj.SolrQuery;
Modified: nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrWriter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrWriter.java?rev=960064&r1=960063&r2=960064&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrWriter.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrWriter.java Fri Jul 2 17:19:43 2010
@@ -23,6 +23,7 @@ import java.util.Map.Entry;
import org.apache.hadoop.mapred.JobConf;
import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.indexer.NutchField;
import org.apache.nutch.indexer.NutchIndexWriter;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
@@ -47,16 +48,16 @@ public class SolrWriter implements Nutch
public void write(NutchDocument doc) throws IOException {
final SolrInputDocument inputDoc = new SolrInputDocument();
- for(final Entry<String, List<String>> e : doc) {
- for (final String val : e.getValue()) {
- inputDoc.addField(solrMapping.mapKey(e.getKey()), val);
+ for(final Entry<String, NutchField> e : doc) {
+ for (final Object val : e.getValue().getValues()) {
+ inputDoc.addField(solrMapping.mapKey(e.getKey()), val, e.getValue().getWeight());
String sCopy = solrMapping.mapCopyKey(e.getKey());
if (sCopy != e.getKey()) {
inputDoc.addField(sCopy, val);
}
}
}
- inputDoc.setDocumentBoost(doc.getScore());
+ inputDoc.setDocumentBoost(doc.getWeight());
inputDocs.add(inputDoc);
if (inputDocs.size() > commitSize) {
try {
Modified: nutch/trunk/src/plugin/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=960064&r1=960063&r2=960064&view=diff
==============================================================================
--- nutch/trunk/src/plugin/build.xml (original)
+++ nutch/trunk/src/plugin/build.xml Fri Jul 2 17:19:43 2010
@@ -26,14 +26,11 @@
<!-- Build & deploy all the plugin jars. -->
<!-- ====================================================== -->
<target name="deploy">
- <ant dir="clustering-carrot2" target="deploy"/>
<ant dir="creativecommons" target="deploy"/>
<ant dir="feed" target="deploy"/>
<ant dir="index-basic" target="deploy"/>
<ant dir="index-anchor" target="deploy"/>
<ant dir="index-more" target="deploy"/>
- <ant dir="field-basic" target="deploy"/>
- <ant dir="field-boost" target="deploy"/>
<ant dir="languageidentifier" target="deploy"/>
<ant dir="lib-http" target="deploy"/>
<ant dir="lib-lucene-analyzers" target="deploy"/>
@@ -42,7 +39,6 @@
<ant dir="lib-xml" target="deploy"/>
<ant dir="microformats-reltag" target="deploy"/>
<ant dir="nutch-extensionpoints" target="deploy"/>
- <ant dir="ontology" target="deploy"/>
<ant dir="protocol-file" target="deploy"/>
<ant dir="protocol-ftp" target="deploy"/>
<ant dir="protocol-http" target="deploy"/>
@@ -53,18 +49,9 @@
<ant dir="parse-swf" target="deploy"/>
<ant dir="parse-tika" target="deploy"/>
<ant dir="parse-zip" target="deploy"/>
- <ant dir="query-basic" target="deploy"/>
- <ant dir="query-more" target="deploy"/>
- <ant dir="query-site" target="deploy"/>
- <ant dir="query-custom" target="deploy"/>
- <ant dir="query-url" target="deploy"/>
- <ant dir="response-json" target="deploy"/>
- <ant dir="response-xml" target="deploy"/>
<ant dir="scoring-opic" target="deploy"/>
<ant dir="scoring-link" target="deploy"/>
- <ant dir="summary-basic" target="deploy"/>
<ant dir="subcollection" target="deploy"/>
- <ant dir="summary-lucene" target="deploy"/>
<ant dir="tld" target="deploy"/>
<ant dir="urlfilter-automaton" target="deploy"/>
<ant dir="urlfilter-domain" target="deploy" />
@@ -86,7 +73,6 @@
<ant dir="index-more" target="test"/>
<ant dir="languageidentifier" target="test"/>
<ant dir="lib-http" target="test"/>
- <ant dir="ontology" target="test"/>
<ant dir="protocol-httpclient" target="test"/>
<!--ant dir="parse-ext" target="test"/-->
<ant dir="parse-rss" target="test"/>
@@ -94,7 +80,6 @@
<ant dir="parse-swf" target="test"/>
<ant dir="parse-tika" target="test"/>
<ant dir="parse-zip" target="test"/>
- <ant dir="query-url" target="test"/>
<ant dir="subcollection" target="test"/>
<ant dir="urlfilter-automaton" target="test"/>
<ant dir="urlfilter-domain" target="test" />
@@ -110,16 +95,11 @@
<!-- Clean all of the plugins. -->
<!-- ====================================================== -->
<target name="clean">
- <ant dir="analysis-de" target="clean"/>
- <ant dir="analysis-fr" target="clean"/>
- <ant dir="clustering-carrot2" target="clean"/>
<ant dir="creativecommons" target="clean"/>
<ant dir="feed" target="clean"/>
<ant dir="index-basic" target="clean"/>
<ant dir="index-anchor" target="clean"/>
<ant dir="index-more" target="clean"/>
- <ant dir="field-basic" target="clean"/>
- <ant dir="field-boost" target="clean"/>
<ant dir="languageidentifier" target="clean"/>
<ant dir="lib-commons-httpclient" target="clean"/>
<ant dir="lib-http" target="clean"/>
@@ -129,7 +109,6 @@
<ant dir="lib-xml" target="clean"/>
<ant dir="microformats-reltag" target="clean"/>
<ant dir="nutch-extensionpoints" target="clean"/>
- <ant dir="ontology" target="clean"/>
<ant dir="protocol-file" target="clean"/>
<ant dir="protocol-ftp" target="clean"/>
<ant dir="protocol-http" target="clean"/>
@@ -140,18 +119,9 @@
<ant dir="parse-swf" target="clean"/>
<ant dir="parse-tika" target="clean"/>
<ant dir="parse-zip" target="clean"/>
- <ant dir="query-basic" target="clean"/>
- <ant dir="query-more" target="clean"/>
- <ant dir="query-site" target="clean"/>
- <ant dir="query-url" target="clean"/>
- <ant dir="query-custom" target="clean"/>
- <ant dir="response-json" target="clean"/>
- <ant dir="response-xml" target="clean"/>
<ant dir="scoring-opic" target="clean"/>
<ant dir="scoring-link" target="clean"/>
<ant dir="subcollection" target="clean"/>
- <ant dir="summary-basic" target="clean"/>
- <ant dir="summary-lucene" target="clean"/>
<ant dir="tld" target="clean"/>
<ant dir="urlfilter-automaton" target="clean"/>
<ant dir="urlfilter-domain" target="clean" />
Modified: nutch/trunk/src/plugin/creativecommons/plugin.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/creativecommons/plugin.xml?rev=960064&r1=960063&r2=960064&view=diff
==============================================================================
--- nutch/trunk/src/plugin/creativecommons/plugin.xml (original)
+++ nutch/trunk/src/plugin/creativecommons/plugin.xml Fri Jul 2 17:19:43 2010
@@ -45,13 +45,4 @@
class="org.creativecommons.nutch.CCIndexingFilter"/>
</extension>
- <extension id="org.creativecommons.nutch.CCQueryFilter"
- name="Creative Commmons Query Filter"
- point="org.apache.nutch.searcher.QueryFilter">
- <implementation id="CCQueryFilter"
- class="org.creativecommons.nutch.CCQueryFilter">
- <parameter name="fields" value="cc"/>
- </implementation>
- </extension>
-
</plugin>
Modified: nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java?rev=960064&r1=960063&r2=960064&view=diff
==============================================================================
--- nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java Fri Jul 2 17:19:43 2010
@@ -24,7 +24,6 @@ import org.apache.nutch.parse.Parse;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.indexer.lucene.LuceneWriter;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
@@ -111,10 +110,6 @@ public class CCIndexingFilter implements
doc.add(FIELD, feature);
}
- public void addIndexBackendOptions(Configuration conf) {
- LuceneWriter.addFieldOptions(FIELD, LuceneWriter.STORE.YES, LuceneWriter.INDEX.UNTOKENIZED, conf);
- }
-
public void setConf(Configuration conf) {
this.conf = conf;
}
Modified: nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java?rev=960064&r1=960063&r2=960064&view=diff
==============================================================================
--- nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java Fri Jul 2 17:19:43 2010
@@ -30,7 +30,6 @@ import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.indexer.lucene.LuceneWriter;
import org.apache.nutch.metadata.Feed;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.Parse;
@@ -122,24 +121,6 @@ public class FeedIndexingFilter implemen
return conf;
}
- public void addIndexBackendOptions(Configuration conf) {
- LuceneWriter.addFieldOptions(Feed.FEED_AUTHOR,
- LuceneWriter.STORE.YES, LuceneWriter.INDEX.TOKENIZED, conf);
-
- LuceneWriter.addFieldOptions(Feed.FEED_TAGS,
- LuceneWriter.STORE.YES, LuceneWriter.INDEX.TOKENIZED, conf);
-
- LuceneWriter.addFieldOptions(Feed.FEED,
- LuceneWriter.STORE.YES, LuceneWriter.INDEX.TOKENIZED, conf);
-
- LuceneWriter.addFieldOptions(PUBLISHED_DATE,
- LuceneWriter.STORE.YES, LuceneWriter.INDEX.NO_NORMS, conf);
-
- LuceneWriter.addFieldOptions(UPDATED_DATE,
- LuceneWriter.STORE.YES, LuceneWriter.INDEX.NO_NORMS, conf);
-
- }
-
/**
* Sets the {@link Configuration} object used to configure this
* {@link IndexingFilter}.
Modified: nutch/trunk/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java?rev=960064&r1=960063&r2=960064&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java Fri Jul 2 17:19:43 2010
@@ -25,7 +25,6 @@ import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.indexer.lucene.LuceneWriter;
import org.apache.nutch.parse.Parse;
/**
@@ -57,9 +56,4 @@ public class AnchorIndexingFilter
return doc;
}
- public void addIndexBackendOptions(Configuration conf) {
- LuceneWriter.addFieldOptions("anchor", LuceneWriter.STORE.NO,
- LuceneWriter.INDEX.TOKENIZED, conf);
- }
-
}
Modified: nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java?rev=960064&r1=960063&r2=960064&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java Fri Jul 2 17:19:43 2010
@@ -28,7 +28,6 @@ import org.apache.nutch.parse.Parse;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.indexer.lucene.LuceneWriter;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
@@ -94,40 +93,6 @@ public class BasicIndexingFilter impleme
return doc;
}
- public void addIndexBackendOptions(Configuration conf) {
-
- ///////////////////////////
- // add lucene options //
- ///////////////////////////
-
- // host is un-stored, indexed and tokenized
- LuceneWriter.addFieldOptions("host", LuceneWriter.STORE.NO,
- LuceneWriter.INDEX.TOKENIZED, conf);
-
- // site is un-stored, indexed and un-tokenized
- LuceneWriter.addFieldOptions("site", LuceneWriter.STORE.NO,
- LuceneWriter.INDEX.UNTOKENIZED, conf);
-
- // url is both stored and indexed, so it's both searchable and returned
- LuceneWriter.addFieldOptions("url", LuceneWriter.STORE.YES,
- LuceneWriter.INDEX.TOKENIZED, conf);
-
- // content is indexed, so that it's searchable, but not stored in index
- LuceneWriter.addFieldOptions("content", LuceneWriter.STORE.NO,
- LuceneWriter.INDEX.TOKENIZED, conf);
-
- // anchors are indexed, so they're searchable, but not stored in index
- LuceneWriter.addFieldOptions("anchor", LuceneWriter.STORE.NO,
- LuceneWriter.INDEX.TOKENIZED, conf);
-
- // title is indexed and stored so that it can be displayed
- LuceneWriter.addFieldOptions("title", LuceneWriter.STORE.YES,
- LuceneWriter.INDEX.TOKENIZED, conf);
-
- LuceneWriter.addFieldOptions("cache", LuceneWriter.STORE.YES, LuceneWriter.INDEX.NO, conf);
- LuceneWriter.addFieldOptions("tstamp", LuceneWriter.STORE.YES, LuceneWriter.INDEX.NO, conf);
- }
-
public void setConf(Configuration conf) {
this.conf = conf;
this.MAX_TITLE_LENGTH = conf.getInt("indexer.max.title.length", 100);
Modified: nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=960064&r1=960063&r2=960064&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Fri Jul 2 17:19:43 2010
@@ -38,7 +38,6 @@ import org.apache.nutch.parse.Parse;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.indexer.lucene.LuceneWriter;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
@@ -287,32 +286,6 @@ public class MoreIndexingFilter implemen
return doc;
}
- public void addIndexBackendOptions(Configuration conf) {
-
- ///////////////////////////
- // add lucene options //
- ///////////////////////////
-
- LuceneWriter.addFieldOptions("type", LuceneWriter.STORE.NO,
- LuceneWriter.INDEX.UNTOKENIZED, conf);
-
- // primaryType and subType are stored, indexed and un-tokenized
- LuceneWriter.addFieldOptions("primaryType", LuceneWriter.STORE.YES,
- LuceneWriter.INDEX.UNTOKENIZED, conf);
- LuceneWriter.addFieldOptions("subType", LuceneWriter.STORE.YES,
- LuceneWriter.INDEX.UNTOKENIZED, conf);
-
- LuceneWriter.addFieldOptions("contentLength", LuceneWriter.STORE.YES,
- LuceneWriter.INDEX.NO, conf);
-
- LuceneWriter.addFieldOptions("lastModified", LuceneWriter.STORE.YES,
- LuceneWriter.INDEX.NO, conf);
-
- // un-stored, indexed and un-tokenized
- LuceneWriter.addFieldOptions("date", LuceneWriter.STORE.NO,
- LuceneWriter.INDEX.UNTOKENIZED, conf);
- }
-
public void setConf(Configuration conf) {
this.conf = conf;
MIME = new MimeUtil(conf);
Modified: nutch/trunk/src/plugin/languageidentifier/plugin.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/languageidentifier/plugin.xml?rev=960064&r1=960063&r2=960064&view=diff
==============================================================================
--- nutch/trunk/src/plugin/languageidentifier/plugin.xml (original)
+++ nutch/trunk/src/plugin/languageidentifier/plugin.xml Fri Jul 2 17:19:43 2010
@@ -45,16 +45,5 @@
class="org.apache.nutch.analysis.lang.LanguageIndexingFilter"/>
</extension>
-
- <extension id="org.apache.nutch.analysis.lang.LanguageQueryFilter"
- name="Nutch Language Query Filter"
- point="org.apache.nutch.searcher.QueryFilter">
- <implementation id="LanguageQueryFilter"
- class="org.apache.nutch.analysis.lang.LanguageQueryFilter">
- <parameter name="raw-fields" value="lang"/>
- </implementation>
- </extension>
-
-
</plugin>
Modified: nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java?rev=960064&r1=960063&r2=960064&view=diff
==============================================================================
--- nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java Fri Jul 2 17:19:43 2010
@@ -23,7 +23,6 @@ import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.indexer.lucene.LuceneWriter;
import org.apache.hadoop.io.Text;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.metadata.Metadata;
@@ -95,11 +94,6 @@ public class LanguageIndexingFilter impl
return doc;
}
- public void addIndexBackendOptions(Configuration conf) {
- LuceneWriter.addFieldOptions("lang", LuceneWriter.STORE.YES,
- LuceneWriter.INDEX.UNTOKENIZED, conf);
- }
-
public void setConf(Configuration conf) {
this.conf = conf;
this.languageIdentifier = new LanguageIdentifier(conf);
Modified: nutch/trunk/src/plugin/microformats-reltag/plugin.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/microformats-reltag/plugin.xml?rev=960064&r1=960063&r2=960064&view=diff
==============================================================================
--- nutch/trunk/src/plugin/microformats-reltag/plugin.xml (original)
+++ nutch/trunk/src/plugin/microformats-reltag/plugin.xml Fri Jul 2 17:19:43 2010
@@ -45,17 +45,5 @@
class="org.apache.nutch.microformats.reltag.RelTagIndexingFilter"/>
</extension>
-
- <extension id="org.apache.nutch.microformats.reltag.RelTagQueryFilter"
- name="Rel-Tag query filter"
- point="org.apache.nutch.searcher.QueryFilter">
- <implementation id="RelTagQueryFilter"
- class="org.apache.nutch.microformats.reltag.RelTagQueryFilter">
- <parameter name="raw-fields" value="tag"/>
- </implementation>
-
- </extension>
-
-
</plugin>
Modified: nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java?rev=960064&r1=960063&r2=960064&view=diff
==============================================================================
--- nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java Fri Jul 2 17:19:43 2010
@@ -23,7 +23,6 @@ import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.indexer.lucene.LuceneWriter;
import org.apache.hadoop.io.Text;
import org.apache.nutch.parse.Parse;
@@ -60,11 +59,6 @@ public class RelTagIndexingFilter implem
return doc;
}
- public void addIndexBackendOptions(Configuration conf) {
- LuceneWriter.addFieldOptions("tag", LuceneWriter.STORE.YES,
- LuceneWriter.INDEX.UNTOKENIZED, conf);
- }
-
/* ----------------------------- *
* <implementation:Configurable> *
* ----------------------------- */
Modified: nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml?rev=960064&r1=960063&r2=960064&view=diff
==============================================================================
--- nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml (original)
+++ nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml Fri Jul 2 17:19:43 2010
@@ -25,22 +25,10 @@
Please not that plugins can define extension points as well to be extendable.-->
<extension-point
- id="org.apache.nutch.clustering.OnlineClusterer"
- name="Nutch Online Search Results Clustering Plugin"/>
-
-<extension-point
- id="org.apache.nutch.indexer.field.FieldFilter"
- name="Nutch Field Filter"/>
-
-<extension-point
id="org.apache.nutch.indexer.IndexingFilter"
name="Nutch Indexing Filter"/>
<extension-point
- id="org.apache.nutch.ontology.Ontology"
- name="Ontology Model Loader"/>
-
-<extension-point
id="org.apache.nutch.parse.Parser"
name="Nutch Content Parser"/>
@@ -53,10 +41,6 @@
name="Nutch Protocol"/>
<extension-point
- id="org.apache.nutch.searcher.QueryFilter"
- name="Nutch Query Filter"/>
-
-<extension-point
id="org.apache.nutch.net.URLFilter"
name="Nutch URL Filter"/>
@@ -65,18 +49,6 @@
name="Nutch URL Normalizer"/>
<extension-point
- id="org.apache.nutch.analysis.NutchAnalyzer"
- name="Nutch Analysis"/>
-
-<extension-point
- id="org.apache.nutch.searcher.response.ResponseWriter"
- name="Nutch Search Results Response Writer"/>
-
-<extension-point
- id="org.apache.nutch.searcher.Summarizer"
- name="Nutch Summarizer"/>
-
-<extension-point
id="org.apache.nutch.scoring.ScoringFilter"
name="Nutch Scoring"/>
Modified: nutch/trunk/src/plugin/subcollection/plugin.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/subcollection/plugin.xml?rev=960064&r1=960063&r2=960064&view=diff
==============================================================================
--- nutch/trunk/src/plugin/subcollection/plugin.xml (original)
+++ nutch/trunk/src/plugin/subcollection/plugin.xml Fri Jul 2 17:19:43 2010
@@ -31,16 +31,6 @@
</library>
</runtime>
- <extension id="org.apache.nutch.searcher.subcollection.query"
- name="Subcollection Query Filter"
- point="org.apache.nutch.searcher.QueryFilter">
- <implementation id="SubcollectionQueryFilter"
- class="org.apache.nutch.searcher.subcollection.SubcollectionQueryFilter">
- <parameter name="raw-fields" value="subcollection"/>
- </implementation>
-
- </extension>
-
<extension id="org.apache.nutch.indexer.subcollection.indexing"
name="Subcollection Indexing Filter"
point="org.apache.nutch.indexer.IndexingFilter">
Modified: nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java?rev=960064&r1=960063&r2=960064&view=diff
==============================================================================
--- nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java Fri Jul 2 17:19:43 2010
@@ -29,7 +29,6 @@ import org.apache.nutch.util.NutchConfig
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.indexer.lucene.LuceneWriter;
import org.apache.nutch.collection.CollectionManager;
import org.apache.nutch.crawl.CrawlDatum;
@@ -72,9 +71,4 @@ public class SubcollectionIndexingFilter
addSubCollectionField(doc, sUrl);
return doc;
}
-
- public void addIndexBackendOptions(Configuration conf) {
- LuceneWriter.addFieldOptions(FIELD_NAME, LuceneWriter.STORE.YES,
- LuceneWriter.INDEX.TOKENIZED, conf);
- }
}
Modified: nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java?rev=960064&r1=960063&r2=960064&view=diff
==============================================================================
--- nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java Fri Jul 2 17:19:43 2010
@@ -28,7 +28,6 @@ import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.indexer.lucene.LuceneWriter;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.util.URLUtil;
import org.apache.nutch.util.domain.DomainSuffix;
@@ -65,10 +64,4 @@ public class TLDIndexingFilter implement
public Configuration getConf() {
return this.conf;
}
-
- public void addIndexBackendOptions(Configuration conf) {
- // store, no index
- LuceneWriter.addFieldOptions("tld", LuceneWriter.STORE.YES,
- LuceneWriter.INDEX.NO, conf);
- }
}
Modified: nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java?rev=960064&r1=960063&r2=960064&view=diff
==============================================================================
--- nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java (original)
+++ nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java Fri Jul 2 17:19:43 2010
@@ -26,6 +26,7 @@ import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.indexer.NutchField;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.protocol.Content;
@@ -52,12 +53,12 @@ public class TLDScoringFilter implements
CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
throws ScoringFilterException {
- List<String> tlds = doc.getFieldValues("tld");
+ NutchField tlds = doc.getField("tld");
float boost = 1.0f;
if(tlds != null) {
- for(String tld : tlds) {
- DomainSuffix entry = tldEntries.get(tld);
+ for(Object tld : tlds.getValues()) {
+ DomainSuffix entry = tldEntries.get(tld.toString());
if(entry != null)
boost *= entry.getBoost();
}