You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by cu...@apache.org on 2005/03/23 23:16:31 UTC

svn commit: r158845 - in incubator/nutch/trunk: ./ src/java/org/apache/nutch/analysis/ src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/ src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/ src/plugin/query-site/ src/plugin/query-site/src/java/org/apache/nutch/searcher/site/

Author: cutting
Date: Wed Mar 23 14:16:29 2005
New Revision: 158845

URL: http://svn.apache.org/viewcvs?view=rev&rev=158845
Log:
Index host and title in separate fields.

Removed:
    incubator/nutch/trunk/src/plugin/query-site/src/java/org/apache/nutch/searcher/site/SiteIndexingFilter.java
Modified:
    incubator/nutch/trunk/CHANGES.txt
    incubator/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java
    incubator/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
    incubator/nutch/trunk/src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java
    incubator/nutch/trunk/src/plugin/query-site/plugin.xml

Modified: incubator/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/CHANGES.txt?view=diff&r1=158844&r2=158845
==============================================================================
--- incubator/nutch/trunk/CHANGES.txt (original)
+++ incubator/nutch/trunk/CHANGES.txt Wed Mar 23 14:16:29 2005
@@ -30,6 +30,16 @@
     redistribution by Apache.  Disabled compilation of plugins which
     require these libraries.  (Doug Cutting 20050301)
 
+ 6. Index host and title in separate fields.  Host was indexed
+    previously only as a part of the URL.  Title was indexed as an
+    anchor.  Now boosts for matching these fields may be adjusted
+    separately from boosts for matching anchors and url.  Also: move
+    site indexing to index-basic plugin to minimize the number of
+    times the URL needs to be parsed; and, stop using anchor analyzer
+    for anything but anchors.  (Piotr Kosiorowski via Doug Cutting
+    20050323)
+
+
 Release 0.6
 
  1. Added clustering-carrot2 plugin, together with introduction of clustering

Modified: incubator/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java?view=diff&r1=158844&r2=158845
==============================================================================
--- incubator/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java (original)
+++ incubator/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java Wed Mar 23 14:16:29 2005
@@ -77,7 +77,7 @@
   /** Returns a new token stream for text from the named field. */
   public TokenStream tokenStream(String fieldName, Reader reader) {
     Analyzer analyzer;
-    if ("url".equals(fieldName) || ("anchor".equals(fieldName)))
+    if ("anchor".equals(fieldName))
       analyzer = ANCHOR_ANALYZER;
     else
       analyzer = CONTENT_ANALYZER;

Modified: incubator/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java?view=diff&r1=158844&r2=158845
==============================================================================
--- incubator/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java (original)
+++ incubator/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java Wed Mar 23 14:16:29 2005
@@ -27,6 +27,8 @@
 import org.apache.nutch.fetcher.FetcherOutput;
 import org.apache.nutch.pagedb.FetchListEntry;
 
+import java.net.MalformedURLException;
+import java.net.URL;
 import java.util.logging.Logger;
 import org.apache.nutch.util.LogFormatter;
 import org.apache.nutch.util.NutchConf;
@@ -43,6 +45,21 @@
     throws IndexingException {
     
     String url = fo.getUrl().toString();
+    String host = null;
+    try {
+      URL u = new URL(url);
+      host = u.getHost();
+    } catch (MalformedURLException e) {
+      throw new IndexingException(e);
+    }
+
+    if (host != null) {
+      // add host as un-stored, indexed and tokenized
+      doc.add(Field.UnStored("host", host));
+      // add site as un-stored, indexed and un-tokenized
+      doc.add(new Field("site", host, false, true, false));
+    }
+
 
     // url is both stored and indexed, so it's both searchable and returned
     doc.add(Field.Text("url", url));
@@ -61,10 +78,8 @@
     if (title.length() > MAX_TITLE_LENGTH) {      // truncate title if needed
       title = title.substring(0, MAX_TITLE_LENGTH);
     }
-    // add title as anchor so it is searchable.  doesn't warrant its own field.
-    doc.add(Field.UnStored("anchor", title));
-    // add title unindexed, so that it can be displayed
-    doc.add(Field.UnIndexed("title", title));
+    // add title indexed and stored so that it can be displayed
+    doc.add(Field.Text("title", title));
 
     return doc;
   }

Modified: incubator/nutch/trunk/src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java?view=diff&r1=158844&r2=158845
==============================================================================
--- incubator/nutch/trunk/src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java (original)
+++ incubator/nutch/trunk/src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java Wed Mar 23 14:16:29 2005
@@ -36,15 +36,22 @@
 
   private static float URL_BOOST = 4.0f;
   private static float ANCHOR_BOOST = 2.0f;
+  private static float TITLE_BOOST = 1.5f;
+  private static float HOST_BOOST = 2.0f;
 
   private static int SLOP = Integer.MAX_VALUE;
   private static float PHRASE_BOOST = 1.0f;
 
-  private static final String[] FIELDS = {"url", "anchor", "content"};
-  private static final float[] FIELD_BOOSTS = {URL_BOOST, ANCHOR_BOOST, 1.0f};
+  private static final String[] FIELDS =
+  { "url", "anchor", "content", "title", "host" };
 
-  /** Set the boost factor for url matches, relative to content and anchor
-   * matches */
+  private static final float[] FIELD_BOOSTS =
+  { URL_BOOST, ANCHOR_BOOST, 1.0f, TITLE_BOOST, HOST_BOOST };
+
+  /**
+   * Set the boost factor for url matches, relative to content and anchor
+   * matches
+   */
   public static void setUrlBoost(float boost) { URL_BOOST = boost; }
 
   /** Set the boost factor for title/anchor matches, relative to url and

Modified: incubator/nutch/trunk/src/plugin/query-site/plugin.xml
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/query-site/plugin.xml?view=diff&r1=158844&r2=158845
==============================================================================
--- incubator/nutch/trunk/src/plugin/query-site/plugin.xml (original)
+++ incubator/nutch/trunk/src/plugin/query-site/plugin.xml Wed Mar 23 14:16:29 2005
@@ -6,10 +6,6 @@
    provider-name="nutch.org">
 
    <extension-point
-      id="org.apache.nutch.indexer.IndexingFilter"
-      name="Nutch Indexing Filter"/>
-
-   <extension-point
       id="org.apache.nutch.searcher.QueryFilter"
       name="Nutch Query Filter"/>
 
@@ -18,14 +14,6 @@
          <export name="*"/>
       </library>
    </runtime>
-
-   <extension id="org.apache.nutch.search.site.SiteIndexingFilter"
-              name="Nutch Site Indexing Filter"
-              point="org.apache.nutch.indexer.IndexingFilter">
-      <implementation id="SiteIndexingFilter"
-                      class="org.apache.nutch.searcher.site.SiteIndexingFilter"/>
-   </extension>
-
 
    <extension id="org.apache.nutch.searcher.site.SiteQueryFilter"
               name="Nutch Site Query Filter"