You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by cu...@apache.org on 2005/03/23 23:16:31 UTC
svn commit: r158845 - in incubator/nutch/trunk: ./
src/java/org/apache/nutch/analysis/
src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/
src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/
src/plugin/query-site/
src/plugin/query-site/src/java/org/apache/nutch/searcher/site/
Author: cutting
Date: Wed Mar 23 14:16:29 2005
New Revision: 158845
URL: http://svn.apache.org/viewcvs?view=rev&rev=158845
Log:
Index host and title in separate fields.
Removed:
incubator/nutch/trunk/src/plugin/query-site/src/java/org/apache/nutch/searcher/site/SiteIndexingFilter.java
Modified:
incubator/nutch/trunk/CHANGES.txt
incubator/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java
incubator/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
incubator/nutch/trunk/src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java
incubator/nutch/trunk/src/plugin/query-site/plugin.xml
Modified: incubator/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/CHANGES.txt?view=diff&r1=158844&r2=158845
==============================================================================
--- incubator/nutch/trunk/CHANGES.txt (original)
+++ incubator/nutch/trunk/CHANGES.txt Wed Mar 23 14:16:29 2005
@@ -30,6 +30,16 @@
redistribution by Apache. Disabled compilation of plugins which
require these libraries. (Doug Cutting 20050301)
+ 6. Index host and title in separate fields. Host was indexed
+ previously only as a part of the URL. Title was indexed as an
+ anchor. Now boosts for matching these fields may be adjusted
+ separately from boosts for matching anchors and url. Also: move
+ site indexing to index-basic plugin to minimize the number of
+ times the URL needs to be parsed; and, stop using anchor analyzer
+ for anything but anchors. (Piotr Kosiorowski via Doug Cutting
+ 20050323)
+
+
Release 0.6
1. Added clustering-carrot2 plugin, together with introduction of clustering
Modified: incubator/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java?view=diff&r1=158844&r2=158845
==============================================================================
--- incubator/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java (original)
+++ incubator/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java Wed Mar 23 14:16:29 2005
@@ -77,7 +77,7 @@
/** Returns a new token stream for text from the named field. */
public TokenStream tokenStream(String fieldName, Reader reader) {
Analyzer analyzer;
- if ("url".equals(fieldName) || ("anchor".equals(fieldName)))
+ if ("anchor".equals(fieldName))
analyzer = ANCHOR_ANALYZER;
else
analyzer = CONTENT_ANALYZER;
Modified: incubator/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java?view=diff&r1=158844&r2=158845
==============================================================================
--- incubator/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java (original)
+++ incubator/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java Wed Mar 23 14:16:29 2005
@@ -27,6 +27,8 @@
import org.apache.nutch.fetcher.FetcherOutput;
import org.apache.nutch.pagedb.FetchListEntry;
+import java.net.MalformedURLException;
+import java.net.URL;
import java.util.logging.Logger;
import org.apache.nutch.util.LogFormatter;
import org.apache.nutch.util.NutchConf;
@@ -43,6 +45,21 @@
throws IndexingException {
String url = fo.getUrl().toString();
+ String host = null;
+ try {
+ URL u = new URL(url);
+ host = u.getHost();
+ } catch (MalformedURLException e) {
+ throw new IndexingException(e);
+ }
+
+ if (host != null) {
+ // add host as un-stored, indexed and tokenized
+ doc.add(Field.UnStored("host", host));
+ // add site as un-stored, indexed and un-tokenized
+ doc.add(new Field("site", host, false, true, false));
+ }
+
// url is both stored and indexed, so it's both searchable and returned
doc.add(Field.Text("url", url));
@@ -61,10 +78,8 @@
if (title.length() > MAX_TITLE_LENGTH) { // truncate title if needed
title = title.substring(0, MAX_TITLE_LENGTH);
}
- // add title as anchor so it is searchable. doesn't warrant its own field.
- doc.add(Field.UnStored("anchor", title));
- // add title unindexed, so that it can be displayed
- doc.add(Field.UnIndexed("title", title));
+ // add title indexed and stored so that it can be displayed
+ doc.add(Field.Text("title", title));
return doc;
}
Modified: incubator/nutch/trunk/src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java?view=diff&r1=158844&r2=158845
==============================================================================
--- incubator/nutch/trunk/src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java (original)
+++ incubator/nutch/trunk/src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java Wed Mar 23 14:16:29 2005
@@ -36,15 +36,22 @@
private static float URL_BOOST = 4.0f;
private static float ANCHOR_BOOST = 2.0f;
+ private static float TITLE_BOOST = 1.5f;
+ private static float HOST_BOOST = 2.0f;
private static int SLOP = Integer.MAX_VALUE;
private static float PHRASE_BOOST = 1.0f;
- private static final String[] FIELDS = {"url", "anchor", "content"};
- private static final float[] FIELD_BOOSTS = {URL_BOOST, ANCHOR_BOOST, 1.0f};
+ private static final String[] FIELDS =
+ { "url", "anchor", "content", "title", "host" };
- /** Set the boost factor for url matches, relative to content and anchor
- * matches */
+ private static final float[] FIELD_BOOSTS =
+ { URL_BOOST, ANCHOR_BOOST, 1.0f, TITLE_BOOST, HOST_BOOST };
+
+ /**
+ * Set the boost factor for url matches, relative to content and anchor
+ * matches
+ */
public static void setUrlBoost(float boost) { URL_BOOST = boost; }
/** Set the boost factor for title/anchor matches, relative to url and
Modified: incubator/nutch/trunk/src/plugin/query-site/plugin.xml
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/query-site/plugin.xml?view=diff&r1=158844&r2=158845
==============================================================================
--- incubator/nutch/trunk/src/plugin/query-site/plugin.xml (original)
+++ incubator/nutch/trunk/src/plugin/query-site/plugin.xml Wed Mar 23 14:16:29 2005
@@ -6,10 +6,6 @@
provider-name="nutch.org">
<extension-point
- id="org.apache.nutch.indexer.IndexingFilter"
- name="Nutch Indexing Filter"/>
-
- <extension-point
id="org.apache.nutch.searcher.QueryFilter"
name="Nutch Query Filter"/>
@@ -18,14 +14,6 @@
<export name="*"/>
</library>
</runtime>
-
- <extension id="org.apache.nutch.search.site.SiteIndexingFilter"
- name="Nutch Site Indexing Filter"
- point="org.apache.nutch.indexer.IndexingFilter">
- <implementation id="SiteIndexingFilter"
- class="org.apache.nutch.searcher.site.SiteIndexingFilter"/>
- </extension>
-
<extension id="org.apache.nutch.searcher.site.SiteQueryFilter"
name="Nutch Site Query Filter"