You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by cu...@apache.org on 2005/09/08 21:45:01 UTC

svn commit: r279605 [3/4] - in /lucene/nutch/branches/mapred: ./ bin/ conf/ docs/ca/ docs/de/ docs/en/ docs/es/ docs/fi/ docs/fr/ docs/hu/ docs/jp/ docs/ms/ docs/nl/ docs/pl/ docs/pt/ docs/sv/ docs/th/ docs/zh/ lib/ site/ src/java/org/apache/nutch/anal...

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java Thu Sep  8 12:42:44 2005
@@ -129,7 +129,8 @@
   }
 
   /** The media type of the retrieved content.
-   * @see http://www.iana.org/assignments/media-types/
+   * @see <a href="http://www.iana.org/assignments/media-types/">
+   *      http://www.iana.org/assignments/media-types/</a>
    */
   public String getContentType() {
     ensureInflated();

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/ProtocolException.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/ProtocolException.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/ProtocolException.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/ProtocolException.java Thu Sep  8 12:42:44 2005
@@ -18,7 +18,6 @@
 
 import java.net.URL;
 
-/** Thrown by {@link Protocol#getContent(String)}.*/
 public class ProtocolException extends Exception {
 
   public ProtocolException() {

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/ProtocolFactory.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/ProtocolFactory.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/ProtocolFactory.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/ProtocolFactory.java Thu Sep  8 12:42:44 2005
@@ -75,7 +75,7 @@
     
     Extension extension = findExtension(name);
     
-    CACHE.put(name, extension);
+    if (extension != null) CACHE.put(name, extension);
     
     return extension;
   }
@@ -83,7 +83,7 @@
   private static Extension findExtension(String name)
     throws PluginRuntimeException {
 
-    Extension[] extensions = X_POINT.getExtentens();
+    Extension[] extensions = X_POINT.getExtensions();
 
     for (int i = 0; i < extensions.length; i++) {
       Extension extension = extensions[i];

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/ResourceGone.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/ResourceGone.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/ResourceGone.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/ResourceGone.java Thu Sep  8 12:42:44 2005
@@ -19,7 +19,7 @@
 import java.io.IOException;
 import java.net.URL;
 
-/** Thrown by {@link Protocol#getContent(String)} when a {@link URL} is invalid.*/
+/** Thrown when a resource is invalid. */
 public class ResourceGone extends ProtocolException {
   private URL url;
 

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/ResourceMoved.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/ResourceMoved.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/ResourceMoved.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/ResourceMoved.java Thu Sep  8 12:42:44 2005
@@ -19,8 +19,7 @@
 import java.io.IOException;
 import java.net.URL;
 
-/** Thrown by {@link Protocol#getContent(String)} when a {@link URL} no longer
- * exists.*/
+/** Thrown when a resource no longer exists.*/
 public class ResourceMoved extends IOException {
   private URL oldUrl;
   private URL newUrl;

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/RetryLater.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/RetryLater.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/RetryLater.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/RetryLater.java Thu Sep  8 12:42:44 2005
@@ -19,8 +19,7 @@
 import java.io.IOException;
 import java.net.URL;
 
-/** Thrown by {@link Protocol#getContent(String)} when a {@link URL} should be
- * retried later.*/
+/** Thrown when a resource should be retried later.*/
 public class RetryLater extends ProtocolException {
   private URL url;
 

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/DistributedSearch.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/DistributedSearch.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/DistributedSearch.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/DistributedSearch.java Thu Sep  8 12:42:44 2005
@@ -162,7 +162,7 @@
           continue;
         }
         for (int j = 0; j < segments.length; j++) {
-          LOG.info("Client: segment "+segments[j]+" at "+addr);
+          LOG.finest("Client: segment "+segments[j]+" at "+addr);
           segmentToAddress.put(segments[j], addr);
         }
         liveAddresses.add(addr);

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/Hits.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/Hits.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/Hits.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/Hits.java Thu Sep  8 12:42:44 2005
@@ -44,14 +44,14 @@
   }
 
   /** Returns the total number of hits for this query.  This may be an estimate
-   * when (@link totalIsExact()} is false. */
+   * when {@link #totalIsExact()} is false. */
   public long getTotal() { return total; }
 
-  /** True if {@link getTotal()} gives the exact number of hits, or false if
+  /** True if {@link #getTotal()} gives the exact number of hits, or false if
    * it is only an estimate of the total number of hits. */
   public boolean totalIsExact() { return totalIsExact; }
 
-  /** Set {@link totalIsExact()}. */
+  /** Set {@link #totalIsExact()}. */
   public void setTotalIsExact(boolean isExact) { totalIsExact = isExact; }
 
   /** Returns the number of hits included in this current listing. */

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/OpenSearchServlet.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/OpenSearchServlet.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/OpenSearchServlet.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/OpenSearchServlet.java Thu Sep  8 12:42:44 2005
@@ -17,7 +17,6 @@
 package org.apache.nutch.searcher;
 
 import java.io.IOException;
-import java.net.URL;
 import java.net.URLEncoder;
 import java.util.logging.Level;
 import java.util.Map;
@@ -38,12 +37,6 @@
 import javax.xml.transform.dom.DOMSource;
 import javax.xml.transform.stream.StreamResult;
 
-import org.apache.nutch.html.Entities;
-import org.apache.nutch.searcher.*;
-import org.apache.nutch.plugin.*;
-import org.apache.nutch.clustering.*;
-import org.apache.nutch.util.NutchConf;
-
 
 /** Present search results using A9's OpenSearch extensions to RSS, plus a few
  * Nutch-specific extensions. */   
@@ -74,7 +67,7 @@
   public void doGet(HttpServletRequest request, HttpServletResponse response)
     throws ServletException, IOException {
 
-    bean.LOG.info("query request from " + request.getRemoteAddr());
+    NutchBean.LOG.info("query request from " + request.getRemoteAddr());
 
     // get parameters from request
     request.setCharacterEncoding("UTF-8");
@@ -122,7 +115,7 @@
         (dedupField == null ? "" : "&dedupField=" + dedupField));
 
     Query query = Query.parse(queryString);
-    bean.LOG.info("query: " + queryString);
+    NutchBean.LOG.info("query: " + queryString);
 
     // execute the query
     Hits hits;
@@ -130,11 +123,11 @@
       hits = bean.search(query, start + hitsPerPage, hitsPerDup, dedupField,
           sort, reverse);
     } catch (IOException e) {
-      bean.LOG.log(Level.WARNING, "Search Error", e);
+      NutchBean.LOG.log(Level.WARNING, "Search Error", e);
       hits = new Hits(0,new Hit[0]);	
     }
 
-    bean.LOG.info("total hits: " + hits.getTotal());
+    NutchBean.LOG.info("total hits: " + hits.getTotal());
 
     // generate xml results
     int end = (int)Math.min(hits.getLength(), start + hitsPerPage);

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/QueryFilters.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/QueryFilters.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/QueryFilters.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/QueryFilters.java Thu Sep  8 12:42:44 2005
@@ -45,7 +45,7 @@
         .getExtensionPoint(QueryFilter.X_POINT_ID);
       if (point == null)
         throw new RuntimeException(QueryFilter.X_POINT_ID+" not found.");
-      Extension[] extensions = point.getExtentens();
+      Extension[] extensions = point.getExtensions();
       CACHE = new QueryFilter[extensions.length];
       for (int i = 0; i < extensions.length; i++) {
         Extension extension = extensions[i];

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/segment/SegmentReader.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/segment/SegmentReader.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/segment/SegmentReader.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/segment/SegmentReader.java Thu Sep  8 12:42:44 2005
@@ -179,7 +179,8 @@
    * @param withParseText if true, fix parse_text, otherwise ignore it
    * @param withParseData if true, fix parse_data, otherwise ignore it
    * @param dryrun if true, only show what would be done without performing any actions
-   * @return
+   * @return <code>true</code> if segment was fixed successfully, otherwise
+   *         return <code>false</code>.
    */
   public static boolean fixSegment(NutchFileSystem nfs, File dir, 
           boolean withContent, boolean withParseText, boolean withParseData,

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/tools/DistributedAnalysisTool.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/tools/DistributedAnalysisTool.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/tools/DistributedAnalysisTool.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/tools/DistributedAnalysisTool.java Thu Sep  8 12:42:44 2005
@@ -69,6 +69,9 @@
     final private static float DECAY_VALUE = 0.85f;
 
     public static final Logger LOG = LogFormatter.getLogger("org.apache.nutch.tools.DistributedAnalysisTool");
+    
+    public final static long OUTLINK_LIMIT = 10000;
+    
 
     /**
      * The EditSet inner class represents all of the sorted edits
@@ -343,8 +346,10 @@
         try {
             // Iterate through all items in the webdb, sorted by URL
             long curIndex = 0;
+            long linkCount = 0;
             ScoreValue score = new ScoreValue();
             IWebDBReader reader = new WebDBReader(nfs, dbDir);
+            MD5Hash lastHash = null;
             try {
                 for (Enumeration e = reader.pagesByMD5(); e.hasMoreElements(); curIndex++) {
                     //
@@ -366,7 +371,25 @@
                     // OK, do some analysis!
                     //
                     Page curPage = (Page) e.nextElement();
+                    
+                    // Process only one page from set of pages having the same
+                    // MD5. Otherwise all links from these pages would be processed
+                    // multiple times.
+                    MD5Hash newHash = curPage.getMD5();
+                    if (newHash.equals(lastHash)) {
+                        continue;
+                    }
+                    lastHash = newHash;
+                    
                     Link outLinks[] = reader.getLinks(curPage.getMD5());
+                    linkCount += outLinks.length;
+                    
+                    if (outLinks.length > OUTLINK_LIMIT) {
+                        LOG.info("Suspicious outlink count = "
+                                + outLinks.length + " for ["
+                                + curPage.getURL().toString() + "].");
+                    }
+                    
                     int targetOutlinkers = 0;
                     for (int i = 0; i < outLinks.length; i++) {
                         if (outLinks[i].targetHasOutlink()) {
@@ -402,7 +425,9 @@
                     }
 
                     if (((curIndex - startIndex) % 5000) == 0) {
-                        LOG.info("Pages consumed: " + (curIndex - startIndex) + " (at index " + curIndex + ")");
+                        LOG.info("Pages consumed: " + (curIndex - startIndex)
+                                + " (at index " + curIndex
+                                + "). Links fetched: " + linkCount + ".");
                     }
                 }
             } finally {

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/util/Daemon.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/util/Daemon.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/util/Daemon.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/util/Daemon.java Thu Sep  8 12:42:44 2005
@@ -16,7 +16,7 @@
 
 package org.apache.nutch.util;
 
-/** A thread that has called {@link Thread#SetDaemon(boolean) } with true.*/
+/** A thread that has called {@link Thread#setDaemon(boolean) } with true.*/
 public class Daemon extends Thread {
 
   {

Modified: lucene/nutch/branches/mapred/src/plugin/build-plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/build-plugin.xml?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/build-plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/build-plugin.xml Thu Sep  8 12:42:44 2005
@@ -9,7 +9,7 @@
 
   <!-- Load all the default properties, and any the user wants    -->
   <!-- to contribute (without having to type -D or edit this file -->
-  <property file="${user.home}/$(name}.build.properties" />
+  <property file="${user.home}/${name}.build.properties" />
   <property file="${root}/build.properties" />
 
   <property name="nutch.root" location="${root}/../../../"/>
@@ -35,6 +35,8 @@
 
   <property name="build.encoding" value="ISO-8859-1"/>
 
+  <path id="plugin.deps"/>
+
   <fileset id="lib.jars" dir="${root}" includes="lib/*.jar"/>
 
   <!-- the normal classpath -->
@@ -45,6 +47,7 @@
     <fileset dir="${nutch.root}/lib">
       <include name="*.jar" />
     </fileset>
+    <path refid="plugin.deps"/>
   </path>
 
   <!-- the unit test classpath -->
@@ -108,7 +111,7 @@
     <mkdir dir="${deploy.dir}"/>
     <copy file="plugin.xml" todir="${deploy.dir}" 
           preservelastmodified="true"/>
-    <copy file="${build.dir}/${name}.jar" todir="${deploy.dir}"/>
+    <copy file="${build.dir}/${name}.jar" todir="${deploy.dir}" failonerror="false"/>
     <copy todir="${deploy.dir}" flatten="true">
       <fileset refid="lib.jars"/>
     </copy>

Modified: lucene/nutch/branches/mapred/src/plugin/build.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/build.xml?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/build.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/build.xml Thu Sep  8 12:42:44 2005
@@ -6,6 +6,8 @@
   <!-- Build & deploy all the plugin jars.                    -->
   <!-- ====================================================== -->
   <target name="deploy">
+     <ant dir="lib-jakarta-poi" target="deploy"/>
+     <ant dir="nutch-extensionpoints" target="deploy"/>
      <ant dir="protocol-file" target="deploy"/>
      <ant dir="protocol-ftp" target="deploy"/>
      <ant dir="protocol-http" target="deploy"/>
@@ -14,10 +16,13 @@
      <ant dir="parse-js" target="deploy"/>
      <ant dir="parse-text" target="deploy"/>
      <ant dir="parse-pdf" target="deploy"/>
+     <ant dir="parse-rss" target="deploy"/>
      <ant dir="parse-msword" target="deploy"/>
+     <ant dir="parse-mspowerpoint" target="deploy"/>
 <!-- <ant dir="parse-mp3" target="deploy"/> -->
 <!-- <ant dir="parse-rtf" target="deploy"/> -->
      <ant dir="parse-ext" target="deploy"/>
+     <ant dir="parse-zip" target="deploy"/>
      <ant dir="index-basic" target="deploy"/>
      <ant dir="index-more" target="deploy"/>
      <ant dir="query-basic" target="deploy"/>
@@ -39,10 +44,13 @@
      <ant dir="protocol-http" target="test"/>
      <ant dir="parse-html" target="test"/>
      <ant dir="parse-pdf" target="test"/>
+     <ant dir="parse-rss" target="test"/>
      <ant dir="parse-msword" target="test"/>
+     <ant dir="parse-mspowerpoint" target="test"/>
  <!-- <ant dir="parse-mp3" target="test"/> -->
  <!-- <ant dir="parse-rtf" target="test"/> -->
      <ant dir="parse-ext" target="test"/>
+     <ant dir="parse-zip" target="test"/>
      <ant dir="creativecommons" target="test"/>
      <ant dir="languageidentifier" target="test"/>
      <ant dir="ontology" target="test"/>
@@ -52,6 +60,8 @@
   <!-- Clean all of the plugins.                              -->
   <!-- ====================================================== -->
   <target name="clean">
+    <ant dir="lib-jakarta-poi" target="clean"/>
+    <ant dir="nutch-extensionpoints" target="clean"/>
     <ant dir="protocol-file" target="clean"/>
     <ant dir="protocol-ftp" target="clean"/>
     <ant dir="protocol-http" target="clean"/>
@@ -60,10 +70,13 @@
     <ant dir="parse-js" target="clean"/>
     <ant dir="parse-text" target="clean"/>
     <ant dir="parse-pdf" target="clean"/>
+    <ant dir="parse-rss" target="clean"/>
     <ant dir="parse-msword" target="clean"/>
+    <ant dir="parse-mspowerpoint" target="clean"/>
     <ant dir="parse-mp3" target="clean"/>
     <ant dir="parse-rtf" target="clean"/>
     <ant dir="parse-ext" target="clean"/>
+    <ant dir="parse-zip" target="clean"/>
     <ant dir="index-basic" target="clean"/>
     <ant dir="index-more" target="clean"/>
     <ant dir="query-basic" target="clean"/>
@@ -74,8 +87,8 @@
     <ant dir="urlfilter-prefix" target="clean"/>
     <ant dir="creativecommons" target="clean"/>
     <ant dir="languageidentifier" target="clean"/>
-    <ant dir="clustering-carrot2" target="deploy"/>
-    <ant dir="ontology" target="deploy"/>
+    <ant dir="clustering-carrot2" target="clean"/>
+    <ant dir="ontology" target="clean"/>
   </target>
 
 </project>

Modified: lucene/nutch/branches/mapred/src/plugin/clustering-carrot2/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/clustering-carrot2/plugin.xml?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/clustering-carrot2/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/clustering-carrot2/plugin.xml Thu Sep  8 12:42:44 2005
@@ -5,10 +5,6 @@
    version="0.9.0"
    provider-name="carrot2.sourceforge.net">
 
-   <extension-point
-      id="org.apache.nutch.clustering.OnlineClusterer"
-      name="Nutch Online Search Results Clustering Plugin"/>
-
    <runtime>
       <library name="clustering-carrot2.jar">
          <export name="*"/>

Modified: lucene/nutch/branches/mapred/src/plugin/creativecommons/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/creativecommons/plugin.xml?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/creativecommons/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/creativecommons/plugin.xml Thu Sep  8 12:42:44 2005
@@ -5,18 +5,6 @@
    version="1.0.0"
    provider-name="nutch.org">
 
-   <extension-point
-      id="org.apache.nutch.parse.HtmlParseFilter"
-      name="HTML Parse Filter"/>
-
-   <extension-point
-      id="org.apache.nutch.indexer.IndexingFilter"
-      name="Nutch Indexing Filter"/>
-
-   <extension-point
-      id="org.apache.nutch.searcher.QueryFilter"
-      name="Nutch Query Filter"/>
-
    <runtime>
       <library name="creativecommons.jar">
          <export name="*"/>

Modified: lucene/nutch/branches/mapred/src/plugin/index-basic/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/index-basic/plugin.xml?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/index-basic/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/index-basic/plugin.xml Thu Sep  8 12:42:44 2005
@@ -5,9 +5,6 @@
    version="1.0.0"
    provider-name="nutch.org">
 
-   <extension-point
-      id="org.apache.nutch.indexer.IndexingFilter"
-      name="Nutch Indexing Filter"/>
 
    <runtime>
       <library name="index-basic.jar">

Modified: lucene/nutch/branches/mapred/src/plugin/index-more/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/index-more/plugin.xml?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/index-more/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/index-more/plugin.xml Thu Sep  8 12:42:44 2005
@@ -5,15 +5,6 @@
    version="1.0.0"
    provider-name="nutch.org">
 
-   <extension-point
-      id="org.apache.nutch.indexer.IndexingFilter"
-      name="Nutch Indexing Filter"/>
-
-   <!--
-   <extension-point
-      id="org.apache.nutch.searcher.QueryFilter"
-      name="Nutch Query Filter"/>
-    -->
 
    <runtime>
       <library name="index-more.jar">

Modified: lucene/nutch/branches/mapred/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Thu Sep  8 12:42:44 2005
@@ -48,11 +48,12 @@
 import java.text.SimpleDateFormat;
 
 import java.util.Date;
+import java.util.Locale;
 import java.util.TimeZone;
 import java.util.Enumeration;
 import java.util.Properties;
 
-
+import org.apache.commons.lang.time.DateUtils;
 /**
  * Add (or reset) a few metaData properties as respective fields
  * (if they are available), so that they can be displayed by more.jsp
@@ -131,15 +132,38 @@
     long time = -1;
     try {
       time = HttpDateFormat.toLong(date);
-    } catch  (ParseException e) {
-      // try to parse it as date in alternative format
-      try {
-        DateFormat df = new SimpleDateFormat("EEE MMM dd HH:mm:ss yyyy zzz");
-        Date d = df.parse(date);
-        time = d.getTime();
-      } catch (Exception e1) {
-        LOG.warning(url+": can't parse erroneous date: "+date);
-      }
+    } catch (ParseException e) {
+	// try to parse it as date in alternative format
+	try {
+	    Date parsedDate = DateUtils.parseDate(date,
+		  new String [] {
+		      "EEE MMM dd HH:mm:ss yyyy",
+		      "EEE MMM dd HH:mm:ss yyyy zzz",
+		      "EEE, MMM dd HH:mm:ss yyyy zzz",
+		      "EEE, dd MMM yyyy HH:mm:ss zzz",
+		      "EEE,dd MMM yyyy HH:mm:ss zzz",
+		      "EEE, dd MMM yyyy HH:mm:sszzz",
+		      "EEE, dd MMM yyyy HH:mm:ss",
+		      "EEE, dd-MMM-yy HH:mm:ss zzz",
+		      "yyyy/MM/dd HH:mm:ss.SSS zzz",
+		      "yyyy/MM/dd HH:mm:ss.SSS",
+		      "yyyy/MM/dd HH:mm:ss zzz",
+		      "yyyy/MM/dd",
+		      "yyyy.MM.dd HH:mm:ss",
+		      "yyyy-MM-dd HH:mm",
+		      "MMM dd yyyy HH:mm:ss. zzz",
+		      "MMM dd yyyy HH:mm:ss zzz",
+		      "dd.MM.yyyy HH:mm:ss zzz",
+		      "dd MM yyyy HH:mm:ss zzz",
+		      "dd.MM.yyyy; HH:mm:ss",
+		      "dd.MM.yyyy HH:mm:ss",
+		      "dd.MM.yyyy zzz"
+		  });
+	    time = parsedDate.getTime();
+	    //	    LOG.warning(url + ": parsed date: " + date +" to:"+time);
+	} catch (Exception e2) {
+	    LOG.warning(url + ": can't parse erroneous date: " + date);
+	}
     }
     return time;
   }

Modified: lucene/nutch/branches/mapred/src/plugin/languageidentifier/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/languageidentifier/plugin.xml?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/languageidentifier/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/languageidentifier/plugin.xml Thu Sep  8 12:42:44 2005
@@ -5,17 +5,7 @@
    version="1.0.0"
    provider-name="nutch.org">
 
-   <extension-point
-      id="org.apache.nutch.parse.HtmlParseFilter"
-      name="HTML Parse Filter"/>
 
-   <extension-point
-      id="org.apache.nutch.indexer.IndexingFilter"
-      name="Nutch Indexing Filter"/>
-
-   <extension-point
-      id="org.apache.nutch.searcher.QueryFilter"
-      name="Nutch Query Filter"/>
 
     <runtime>
       <library name="language-identifier.jar">
@@ -29,7 +19,7 @@
       <implementation id="LanguageParser"
                       class="org.apache.nutch.analysis.lang.HTMLLanguageParser"/>
    </extension>
-   
+
    <extension id="org.apache.nutch.analysis.lang"
               name="Nutch language identifier filter"
               point="org.apache.nutch.indexer.IndexingFilter">

Modified: lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java Thu Sep  8 12:42:44 2005
@@ -14,23 +14,62 @@
  * limitations under the License.
  */
 package org.apache.nutch.analysis.lang;
+
+// JDK imports
+import java.util.Enumeration;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Properties;
+import java.util.logging.Logger;
+
+// Nutch imports
 import org.apache.nutch.parse.HTMLMetaTags;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.HtmlParseFilter;
 import org.apache.nutch.protocol.Content;
-import org.w3c.dom.*;
-
-import java.util.logging.Logger;
 import org.apache.nutch.util.LogFormatter;
 
-/** Adds metadata identifying language of document if found
+// DOM imports
+import org.w3c.dom.DocumentFragment;
+import org.w3c.dom.Element;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
+
+/**
+ * Adds metadata identifying language of document if found
  * We could also run statistical analysis here but we'd miss all other formats
  */
 public class HTMLLanguageParser implements HtmlParseFilter {
+  
   public static final String META_LANG_NAME="X-meta-lang";
   public static final Logger LOG = LogFormatter
     .getLogger(HTMLLanguageParser.class.getName());
 
+  /* A static Map of ISO-639 language codes */
+  private static Map LANGUAGES_MAP = new HashMap();
+  static {
+    try {
+      Properties p = new Properties();
+      p.load(HTMLLanguageParser.class
+                               .getResourceAsStream("langmappings.properties"));
+      Enumeration keys = p.keys();
+      while (keys.hasMoreElements()) {
+        String key = (String) keys.nextElement();
+        String[] values = p.getProperty(key).split(",", -1);
+        LANGUAGES_MAP.put(key, key);
+        for (int i=0; i<values.length; i++) {
+          LANGUAGES_MAP.put(values[i].trim().toLowerCase(), key);
+        }
+      }
+    } catch (Exception e) {
+      LOG.severe(e.toString());
+    }
+  }
+  
+
+  
   /**
    * Scan the HTML document looking at possible indications of content language<br>
    * <li>1. html lang attribute (http://www.w3.org/TR/REC-html40/struct/dirlang.html#h-8.1)
@@ -39,60 +78,122 @@
    * <br>Only the first occurence of language is stored.
    */
   public Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, DocumentFragment doc) {
-    String lang = findLanguage(doc);
+    
+    // Trying to find the document's language
+    LanguageParser parser = new LanguageParser(doc);
+    String lang = parser.getLanguage();
 
     if (lang != null) {
       parse.getData().getMetadata().put(META_LANG_NAME, lang);
     }
-                
     return parse;
   }
-        
-  private String findLanguage(Node node) {
-    String lang = null;
 
-    if (node.getNodeType() == Node.ELEMENT_NODE) {
-                        
-      //lang attribute
-      lang = ((Element) node).getAttribute("lang");
-      if (lang != null && lang.length()>1) {
-        return lang;
-      }
-      if ("meta".equalsIgnoreCase(node.getNodeName())) {
+  static class LanguageParser {
+    
+    private String dublinCore = null;
+    private String htmlAttribute = null;
+    private String httpEquiv = null;
+    private String language = null;
+    
+    LanguageParser(Node node) {
+      parse(node);
+      if (htmlAttribute != null) { language = htmlAttribute; }
+      else if (dublinCore != null) { language = dublinCore; }
+      else {language = httpEquiv; }
+    }
+  
+    String getLanguage() {
+      return language;
+    }
+    
+    void parse(Node node) {
 
-        NamedNodeMap attrs=node.getAttributes();
+      String lang = null;
+      
+      if (node.getNodeType() == Node.ELEMENT_NODE) {
+
+        // Check for the lang HTML attribute
+        if (htmlAttribute == null) {
+          htmlAttribute = parseLanguage(((Element) node).getAttribute("lang"));
+        }
 
-        //dc.language
-        for(int i=0;i<attrs.getLength();i++){
-          Node attrnode=attrs.item(i);
-          if("name".equalsIgnoreCase(attrnode.getNodeName())){
-            if("dc.language".equalsIgnoreCase(attrnode.getNodeValue())){
-              Node valueattr=attrs.getNamedItem("content");
-              lang = (valueattr!=null)?valueattr.getNodeValue():null;
+        // Check for Meta
+        if ("meta".equalsIgnoreCase(node.getNodeName())) {
+          NamedNodeMap attrs = node.getAttributes();
+        
+          // Check for the dc.language Meta
+          if (dublinCore == null) {
+            for (int i=0; i<attrs.getLength(); i++) {
+              Node attrnode = attrs.item(i);
+              if ("name".equalsIgnoreCase(attrnode.getNodeName())) {
+                if ("dc.language".equalsIgnoreCase(attrnode.getNodeValue())) {
+                  Node valueattr = attrs.getNamedItem("content");
+                  if (valueattr != null) {
+                    dublinCore = parseLanguage(valueattr.getNodeValue());
+                  }
+                }
+              }
             }
           }
-        }
-        
-        //http-equiv content-language
-        for(int i=0;i<attrs.getLength();i++){
-          Node attrnode=attrs.item(i);
-          if("http-equiv".equalsIgnoreCase(attrnode.getNodeName())){
-            if("content-language".equals(attrnode.getNodeValue().toLowerCase())){
-              Node valueattr=attrs.getNamedItem("content");
-              lang = (valueattr!=null)?valueattr.getNodeValue():null;
+
+          // Check for the http-equiv content-language
+          if (httpEquiv == null) {
+            for (int i=0; i<attrs.getLength(); i++){
+              Node attrnode = attrs.item(i);
+              if ("http-equiv".equalsIgnoreCase(attrnode.getNodeName())) {
+                if ("content-language".equals(attrnode.getNodeValue().toLowerCase())) {
+                  Node valueattr = attrs.getNamedItem("content");
+                  if (valueattr != null) {
+                    httpEquiv = parseLanguage(valueattr.getNodeValue());
+                  }
+                }
+              }
             }
           }
         }
       }
+      
+      // Recurse
+      NodeList children = node.getChildNodes();
+      for (int i=0; children != null && i<children.getLength(); i++) {
+        parse(children.item(i));
+        if ((dublinCore != null) &&
+            (htmlAttribute != null) &&
+            (httpEquiv != null)) {
+          return;
+        }
+      }
     }
-                
-    //recurse
-    NodeList children = node.getChildNodes();
-    for (int i = 0; children != null && i < children.getLength(); i++) {
-      lang = findLanguage(children.item(i));
-      if(lang != null && lang.length()>1) return lang;
+    
+    /**
+     * Parse a raw language declaration (e.g. "sv-FI", "EN_US, SV" or
+     * "text/html; charset=sv-SE") and return the first ISO 639 code found.
+     * Tokens are split on ',', ' ', ';', '.', '(', ')' and '='; each token is
+     * cut at its first '-' or '_' and looked up, lower-cased, in LANGUAGES_MAP.
+     * @param lang the raw value to analyze, may be <code>null</code>.
+     * @return the mapped code, or <code>null</code> if no token is recognized.
+     */
+    final static String parseLanguage(String lang) {
+      if (lang == null) { return null; }
+      String code = null;
+      String language = null;
+      // First, split multi-valued values
+      String langs[] = lang.split(",| |;|\\.|\\(|\\)|=", -1);
+      int i = 0;
+      while ((language == null) && (i<langs.length)) {
+        // Then, get the primary code. The -1 limit keeps empty pieces, so a
+        // "-" or "_" token yields "" rather than an empty array and [0] is safe.
+        code = langs[i].split("-", -1)[0];
+        code = code.split("_", -1)[0];
+        // Find the ISO 639 code
+        language = (String) LANGUAGES_MAP.get(code.toLowerCase());
+        i++;
+      }
+      return language;
     }
-
-    return lang;
+    
   }
+
+      
 }

Modified: lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java Thu Sep  8 12:42:44 2005
@@ -20,6 +20,7 @@
 import java.io.InputStream;
 import java.io.IOException;
 import java.io.BufferedReader;
+import java.io.ByteArrayOutputStream;
 import java.io.FileInputStream;
 import java.io.InputStreamReader;
 import java.util.List;
@@ -48,9 +49,13 @@
 
 
 /**
+ * Identify the language of a content, based on statistical analysis.
+ *
+ * @see <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
+ *      Language Codes</a>
  * 
  * @author Sami Siren
- * @author Jerome Charron
+ * @author J&eacute;r&ocirc;me Charron
  */
 public class LanguageIdentifier {
   
@@ -59,8 +64,8 @@
   
   private final static float SCORE_THRESOLD = 0.00F;
 
-  public final static Logger LOG = LogFormatter.getLogger(LanguageIdentifier.class.getName());
-
+  private final static Logger LOG =
+          LogFormatter.getLogger(LanguageIdentifier.class.getName());
   
   private ArrayList languages = new ArrayList();
 
@@ -168,7 +173,8 @@
   }
 
   /**
-   * return handle to singleton instance
+   * Get a LanguageIdentifier instance.
+   * @return the LanguageIdentifier singleton instance.
    */
   public static LanguageIdentifier getInstance() {
     if (identifier == null) {
@@ -182,13 +188,25 @@
   }
 
   /**
-   * main method used for testing
-   * 
-   * @param args
+   * Main method used for command line process.
+   * <br/>Usage is:
+   * <pre>
+   * LanguageIdentifier [-identifyrows filename maxlines]
+   *                    [-identifyfile charset filename]
+   *                    [-identifyfileset charset files]
+   *                    [-identifytext text]
+   *                    [-identifyurl url]
+   * </pre>
+   * @param args arguments.
    */
   public static void main(String args[]) {
 
-    String usage = "Usage: LanguageIdentifier [-identifyrows filename maxlines] [-identifyfile filename] [-identifyfileset files] [-identifytext text] [-identifyurl url]";
+    String usage = "Usage: LanguageIdentifier "            +
+                      "[-identifyrows filename maxlines] " +
+                      "[-identifyfile charset filename] "  +
+                      "[-identifyfileset charset files] "  +
+                      "[-identifytext text] "              +
+                      "[-identifyurl url]";
     int command = 0;
 
     final int IDFILE = 1;
@@ -199,6 +217,7 @@
 
     Vector fileset = new Vector();
     String filename = "";
+    String charset = "";
     String url = "";
     String text = "";
     int max = 0;
@@ -211,6 +230,7 @@
     for (int i = 0; i < args.length; i++) { // parse command line
       if (args[i].equals("-identifyfile")) {
         command = IDFILE;
+        charset = args[++i];
         filename = args[++i];
       }
 
@@ -233,6 +253,7 @@
 
       if (args[i].equals("-identifyfileset")) {
         command = IDFILESET;
+        charset = args[++i];
         for (i++; i < args.length; i++) {
           File[] files = null;
           File f = new File(args[i]);
@@ -264,7 +285,7 @@
         case IDFILE:
           f = new File(filename);
           fis = new FileInputStream(f);
-          lang = idfr.identify(fis);
+          lang = idfr.identify(fis, charset);
           fis.close();
           break;
 
@@ -302,7 +323,7 @@
               filename = (String) i.next();
               f = new File(filename);
               fis = new FileInputStream(f);
-              lang = idfr.identify(fis);
+              lang = idfr.identify(fis, charset);
               fis.close();
             } catch (Exception e) {
               System.out.println(e);
@@ -349,22 +370,26 @@
   }
 
   /**
-   * Identify language based on submitted content
+   * Identify language of a content.
    * 
-   * @param text to analyze
-   * @return 2 letter ISO639 code of language (en, fi, sv...) , or null if
-   *         unknown
+   * @param content is the content to analyze.
+   * @return The 2 letter
+   *         <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
+   *         language code</a> (en, fi, sv, ...) of the language that best
+   *         matches the specified content.
    */
-  public String identify(String text) {
-    return identify(new StringBuffer(text));
+  public String identify(String content) {
+    return identify(new StringBuffer(content));
   }
 
   /**
-   * Identify language based on submitted content
+   * Identify language of a content.
    * 
-   * @param text to analyze
-   * @return 2 letter ISO639 code of language (en, fi, sv...) , or null if
-   *         unknown
+   * @param content is the content to analyze.
+   * @return The 2 letter
+   *         <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
+   *         language code</a> (en, fi, sv, ...) of the language that best
+   *         matches the specified content.
    */
   public String identify(StringBuffer content) {
 
@@ -405,26 +430,48 @@
   }
 
   /**
-   * Identify language from inputstream
-   * 
-   * @param is
-   * @return language code
-   * @throws IOException
+   * Identify language from input stream.
+   * This method uses the platform default encoding to read the input stream.
+   * For using a specific encoding, use the
+   * {@link #identify(InputStream, String)} method.
+   *
+   * @param is is the input stream to analyze.
+   * @return The 2 letter
+   *         <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
+   *         language code</a> (en, fi, sv, ...) of the language that best
+   *         matches the content of the specified input stream.
+   * @throws IOException if something wrong occurs on the input stream.
    */
   public String identify(InputStream is) throws IOException {
+    return identify(is, null);
+  }
+  
+  /**
+   * Identify language from input stream.
+   * 
+   * @param is is the input stream to analyze.
+   * @param charset is the charset to use to read the input stream.
+   * @return The 2 letter
+   *         <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
+   *         language code</a> (en, fi, sv, ...) of the language that best
+   *         matches the content of the specified input stream.
+   * @throws IOException if something wrong occurs on the input stream.
+   */
+  public String identify(InputStream is, String charset) throws IOException {
 
-    StringBuffer text = new StringBuffer();
+    ByteArrayOutputStream out = new ByteArrayOutputStream();
     byte[] buffer = new byte[2048];
     int len = 0;
 
     while (((len = is.read(buffer)) != -1) &&
-           ((analyzeLength == 0) || (text.length() < analyzeLength))) {
+           ((analyzeLength == 0) || (out.size() < analyzeLength))) {
       if (analyzeLength != 0) {
-          len = Math.min(len, analyzeLength - text.length());
+          len = Math.min(len, analyzeLength - out.size());
       }
-      text.append(new String(buffer, 0, len));
+      out.write(buffer, 0, len);
     }
-    return identify(text);
+    return identify((charset == null) ? out.toString()
+                                      : out.toString(charset));
   }
 
 }

Modified: lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/langmappings.properties
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/langmappings.properties?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/langmappings.properties (original)
+++ lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/langmappings.properties Thu Sep  8 12:42:44 2005
@@ -1,187 +1,188 @@
-aa=aar
-ab=abk
+# Defines mappings between common erroneous language codes and
+# the ISO 639 two-letter language codes.
+aa=aar,Afar
+ab=abk,Abkhazian
 ae=ave
-af=afr
+af=afr,Afrikaans
 ak=aka
-am=amh
+am=amh,Amharic
 an=arg
-ar=ara
-as=asm
+ar=ara,Arabic
+as=asm,Assamese
 av=ava
-ay=aym
-az=aze
-ba=bak
-be=bel
-bg=bul
-bh=bih
-bi=bis
+ay=aym,Aymara
+az=aze,Azerbaijani
+ba=bak,Bashkir
+be=bel,Byelorussian
+bg=bul,Bulgarian
+bh=bih,Bihari
+bi=bis,Bislama
 bm=bam
-bn=ben
-bo=tib/bod
-br=bre
+bn=ben,Bengali
+bo=bod,tib,Tibetan
+br=bre,Breton
 bs=bos
-ca=cat
+ca=cat,Catalan
 ce=che
 ch=cha
-co=cos
+co=cos,Corsican
 cr=cre
-cs=cze/ces
+cs=ces,cze,Czech
 cu=chu
 cv=chv
-cy=wel/cym
-da=dan
-de=ger/deu
+cy=cym,wel,Welsh
+da=dan,Danish
+de=deu,ger,German
 dv=div
-dz=dzo
+dz=dzo,Dzongkha
 ee=ewe
-el=gre/ell
-en=eng
-eo=epo
-es=spa
-et=est
-eu=baq/eus
-fa=per/fas
+el=ell,gre,Greek
+en=eng,English
+eo=epo,Esperanto
+es=esl,spa,Spanish
+et=est,Estonian
+eu=baq,eus,Basque
+fa=fas,per,Persian
 ff=ful
-fi=fin
-fj=fij
-fo=fao
-fr=fre/fra
-fy=fry
-ga=gle
+fi=fin,Finnish
+fj=fij,Fijian
+fo=fao,Faroese
+fr=fra,fre,French
+fy=fry,Frisian
+ga=gai,iri,Irish
 gd=gla
-gl=glg
-gn=grn
-gu=guj
+gl=glg,Gallegan
+gn=grn,Guarani
+gu=guj,Gujarati
 gv=glv
-ha=hau
-he=heb
-hi=hin
+ha=hau,Hausa
+he=heb,Hebrew
+hi=hin,Hindi
 ho=hmo
-hr=scr/hrv
+hr=scr,hrv,Croatian
 ht=hat
-hu=hun
-hy=arm/hye
+hu=hun,Hungarian
+hy=arm,hye,Armenian
 hz=her
-ia=ina
-id=ind
+ia=ina,Interlingua
+id=ind,Indonesian
 ie=ile
 ig=ibo
 ii=iii
-ik=ipk
+ik=ipk,Inupiak
 io=ido
-is=ice/isl
-it=ita
-iu=iku
-ja=jpn
-jv=jav
-ka=geo/kat
+is=ice,isl,Icelandic
+it=ita,Italian
+iu=iku,Inuktitut
+ja=jpn,Japanese
+jv=jw,jav,jaw,Javanese
+ka=geo,kat,Georgian
 kg=kon
 ki=kik
-ki=kik
-kj=kua
 kj=kua
-kk=kaz
-kl=kal
-km=khm
-kn=kan
-ko=kor
+kk=kaz,Kazakh
+kl=kal,Greenlandic
+km=khm,Khmer
+kn=kan,Kannada
+ko=kor,Korean
 kr=kau
-ks=kas
-ku=kur
+ks=kas,Kashmiri
+ku=kur,Kurdish
 kv=kom
 kw=cor
-ky=kir
-la=lat
+ky=kir,Kirghiz
+la=lat,Latin
 lb=ltz
 lg=lug
 li=lim
-ln=lin
-lo=lao
-lt=lit
+ln=lin,Lingala
+lo=lao,Lao
+lt=lit,Lithuanian
 lu=lub
-lv=lav
-mg=mlg
+lv=lav,Latvian
+mg=mlg,Malagasy
 mh=mah
-mi=mao/mri
-mk=mac/mkd
-ml=mal
-mn=mon
-mo=mol
-mr=mar
-ms=may/msa
+mi=mao,mri,Maori
+mk=mac,mak,Macedonian
+ml=mal,Malayalam
+mn=mon,Mongolian
+mo=mol,Moldavian
+mr=mar,Marathi
+ms=may,msa,Malay
-mt=mlt
+mt=mlt,Maltese
-my=bur/mya
-na=nau
+my=bur,mya,Burmese
+na=nau,Nauru
 nb=nob
 nd=nde
-ne=nep
+ne=nep,Nepali
 ng=ndo
-nl=dut/nld
+nl=dut,nla,Dutch
 nn=nno
-no=nor
+no=nor,Norwegian
 nr=nbl
 nv=nav
 ny=nya
-oc=oci
+oc=oci,Langue d'Oc
 oj=oji
-om=orm
-or=ori
+om=orm,Oromo
+or=ori,Oriya
 os=oss
-pa=pan
+pa=pan,Panjabi
 pi=pli
-pl=pol
-ps=pus
-pt=por
-qu=que
-rm=roh
-rn=run
-ro=rum/ron
-ru=rus
-rw=kin
-sa=san
+pl=pol,Polish
+ps=pus,Pushto
+pt=por,Portuguese
+qu=que,Quechua
+rm=roh,Rhaeto-Romance
+rn=run,Rundi
+ro=ron,rum,Romanian
+ru=rus,Russian
+rw=kin,Kinyarwanda
+sa=san,Sanskrit
 sc=srd
-sd=snd
+sd=snd,Sindhi
 se=sme
-sg=sag
-si=sin
-sk=slo/slk
-sl=slv
-sm=smo
-sn=sna
-so=som
-sq=alb/sqi 
-sr=scc/srp
-ss=ssw
-st=sot
-su=sun
-sv=swe
-sw=swa
-ta=tam
-te=tel
-tg=tgk
-th=tha
-ti=tir
-tk=tuk
-tl=tgl
-tn=tsn
-to=ton
-tr=tur
-ts=tso
-tt=tat
-tw=twi
+sg=sag,Sango
+sh=scr,Serbo-Croatian
+si=sin,Singhalese
+sk=slk,slo,Slovak
+sl=slv,Slovenian
+sm=smo,Samoan
+sn=sna,Shona
+so=som,Somali
+sq=alb,sqi,Albanian
+sr=scc,srp,Serbian
+ss=ssw,Siswant
+st=sot,Sotho
+su=sun,Sudanese
+sv=sve,swe,Swedish,Svenska,Sweden
+sw=swa,Swahili
+ta=tam,Tamil
+te=tel,Telugu
+tg=tgk,Tajik
+th=tha,Thai
+ti=tir,Tigrinya
+tk=tuk,Turkmen
+tl=tgl,Tagalog
+tn=tsn,Tswana
+to=tog,Tonga
+tr=tur,Turkish
+ts=tso,Tsonga
+tt=tat,Tatar
+tw=twi,Twi
 ty=tah
-ug=uig
-uk=ukr
-ur=urd
-uz=uzb
+ug=uig,Uighur
+uk=ukr,Ukrainian
+ur=urd,Urdu
+uz=uzb,Uzbek
 ve=ven
-vi=vie
-vo=vol
+vi=vie,Vietnamese
+vo=vol,Volap\u00fck
 wa=wln
-wo=wol
-xh=xho
-yi=yid
-yo=yor
-za=zha
-zh=chi/zho
-zu=zul
+wo=wol,Wolof
+xh=xho,Xhosa
+yi=yid,Yiddish
+yo=yor,Yoruba
+za=zha,Zhuang
+zh=chi,zho,Chinese
+zu=zul,Zulu

Modified: lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java Thu Sep  8 12:42:44 2005
@@ -15,14 +15,19 @@
  */
 package org.apache.nutch.analysis.lang;
 
+// JDK imports
 import java.util.Properties;
 
+// JUnit imports
 import junit.framework.TestCase;
+
+// Nutch imports
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.Parser;
 import org.apache.nutch.parse.ParserFactory;
 import org.apache.nutch.protocol.Content;
 
+
 public class TestHTMLLanguageParser extends TestCase {
 
   private static String URL = "http://foo.bar/";
@@ -61,6 +66,61 @@
 
   }
 
+  /**
+   * Test of <code>LanguageParser.parseLanguage(String)</code> method.
+   * Each row is { raw declaration, expected code or null }.
+   */
+  public void testParseLanguage() {
+    String tests[][] = {
+      { "(SCHEME=ISO.639-1) sv", "sv" },
+      { "(SCHEME=RFC1766) sv-FI", "sv" },
+      { "(SCHEME=Z39.53) SWE", "sv" },
+      { "EN_US, SV, EN, EN_UK", "en" },
+      { "English Swedish", "en" },
+      { "English, swedish", "en" },
+      { "English,Swedish", "en" },
+      { "Other (Svenska)", "sv" },
+      { "SE", "se" },
+      { "SV", "sv" },
+      { "SV charset=iso-8859-1", "sv" },
+      { "SV-FI", "sv" },
+      { "SV; charset=iso-8859-1", "sv" },
+      { "SVE", "sv" },
+      { "SW", "sw" },
+      { "SWE", "sv" },
+      { "SWEDISH", "sv" },
+      { "Sv", "sv" },
+      { "Sve", "sv" },
+      { "Svenska", "sv" },
+      { "Swedish", "sv" },
+      { "Swedish, svenska", "sv" },
+      { "en, sv", "en" },
+      { "sv", "sv" },
+      { "sv, be, dk, de, fr, no, pt, ch, fi, en", "sv" },
+      { "sv,en", "sv" },
+      { "sv-FI", "sv" },
+      { "sv-SE", "sv" },
+      { "sv-en", "sv" },
+      { "sv-fi", "sv" },
+      { "sv-se", "sv" },
+      { "sv; Content-Language: sv", "sv" },
+      { "sv_SE", "sv" },
+      { "sve", "sv" },
+      { "svenska, swedish, engelska, english", "sv" },
+      { "sw", "sw" },
+      { "swe", "sv" },
+      { "swe.SPR.", "sv" },
+      { "sweden", "sv" },
+      { "swedish", "sv" },
+      { "swedish,", "sv" },
+      { "text/html; charset=sv-SE", "sv" },
+      { "text/html; sv", "sv" },
+      { "torp, stuga, uthyres, bed & breakfast", null }
+    };
+    // Bound by the table itself so added or removed rows are always covered.
+    for (int i=0; i<tests.length; i++) {
+      assertEquals(tests[i][1], HTMLLanguageParser.LanguageParser.parseLanguage(tests[i][0]));
+    }
+  }
+  
+  
   private Content getContent(String text) {
     Properties p = new Properties();
     p.put("Content-Type", "text/html");
@@ -68,4 +128,5 @@
     Content content = new Content(URL, BASE, text.getBytes(), "text/html", p);
     return content;
   }
+
 }

Modified: lucene/nutch/branches/mapred/src/plugin/ontology/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/ontology/plugin.xml?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/ontology/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/ontology/plugin.xml Thu Sep  8 12:42:44 2005
@@ -6,9 +6,6 @@
    version="1.0.0"
    provider-name="nutch.org">
 
-   <extension-point
-      id="org.apache.nutch.ontology.Ontology"
-      name="Ontology Model Loader"/>
 
    <runtime>
       <library name="ontology.jar">

Modified: lucene/nutch/branches/mapred/src/plugin/parse-ext/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-ext/plugin.xml?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-ext/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-ext/plugin.xml Thu Sep  8 12:42:44 2005
@@ -5,9 +5,7 @@
    version="1.0.0"
    provider-name="nutch.org">
 
-   <extension-point
-      id="org.apache.nutch.parse.Parser"
-      name="Nutch Content Parser"/>
+
 
    <runtime>
       <library name="parse-ext.jar">

Modified: lucene/nutch/branches/mapred/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java Thu Sep  8 12:42:44 2005
@@ -23,6 +23,7 @@
 import org.apache.nutch.parse.ParseData;
 import org.apache.nutch.parse.ParseImpl;
 import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.OutlinkExtractor;
 
 import org.apache.nutch.util.LogFormatter;
 import org.apache.nutch.util.CommandRunner;
@@ -58,7 +59,7 @@
   // set TYPE_PARAMS_MAP using plugin.xml of this plugin
   static {
     Extension[] extensions = PluginRepository.getInstance()
-      .getExtensionPoint("org.apache.nutch.parse.Parser").getExtentens();
+      .getExtensionPoint("org.apache.nutch.parse.Parser").getExtensions();
 
     String contentType, command, timeoutString;
 
@@ -66,7 +67,7 @@
       Extension extension = extensions[i];
 
       // only look for extensions defined by plugin parse-ext
-      if (!extension.getDiscriptor().getPluginId().equals("parse-ext"))
+      if (!extension.getDescriptor().getPluginId().equals("parse-ext"))
         continue;
 
       contentType = extension.getAttribute("contentType");
@@ -151,7 +152,7 @@
       title = "";
 
     // collect outlink
-    Outlink[] outlinks = new Outlink[0];
+    Outlink[] outlinks = OutlinkExtractor.getOutlinks(text);
 
     // collect meta data
     Properties metaData = new Properties();

Modified: lucene/nutch/branches/mapred/src/plugin/parse-html/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-html/plugin.xml?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-html/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-html/plugin.xml Thu Sep  8 12:42:44 2005
@@ -5,13 +5,7 @@
    version="1.0.0"
    provider-name="nutch.org">
 
-   <extension-point
-      id="org.apache.nutch.parse.Parser"
-      name="Nutch Content Parser"/>
 
-   <extension-point
-      id="org.apache.nutch.parse.HtmlParseFilter"
-      name="HTML Parse Filter"/>
 
    <runtime>
       <library name="parse-html.jar">

Modified: lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java Thu Sep  8 12:42:44 2005
@@ -42,7 +42,6 @@
  * This class takes SAX events (in addition to some extra events
  * that SAX doesn't handle yet) and adds the result to a document
  * or document fragment.
- * @xsl.usage general
  */
 public class DOMBuilder
         implements ContentHandler, LexicalHandler

Modified: lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java Thu Sep  8 12:42:44 2005
@@ -27,7 +27,6 @@
 /**
  * Class used to verify whether the specified <var>ch</var> 
  * conforms to the XML 1.0 definition of whitespace. 
- * @xsl.usage internal
  */
 public class XMLCharacterRecognizer
 {
@@ -90,7 +89,7 @@
   /**
    * Tell if the string is whitespace.
    *
-   * @param buf StringBuffer to check as XML whitespace.
+   * @param s String to check as XML whitespace.
    * @return True if characters in buffer are XML whitespace, false otherwise
    */
   public static boolean isWhiteSpace(String s)

Modified: lucene/nutch/branches/mapred/src/plugin/parse-js/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-js/plugin.xml?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-js/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-js/plugin.xml Thu Sep  8 12:42:44 2005
@@ -5,14 +5,6 @@
    version="1.0.0"
    provider-name="nutch.org">
 
-   <extension-point
-      id="org.apache.nutch.parse.Parser"
-      name="Nutch Content Parser"/>
-
-   <extension-point
-      id="org.apache.nutch.parse.HtmlParseFilter"
-      name="HTML Parse Filter"/>
-
    <runtime>
       <library name="parse-js.jar">
          <export name="*"/>
@@ -23,17 +15,17 @@
               name="JS Parser"
               point="org.apache.nutch.parse.Parser">
       <implementation id="JSParser"
-	      class="org.apache.nutch.parse.js.JSParseFilter"
-	      contentType="application/x-javascript"
-	      pathSuffix="js"/>
+         class="org.apache.nutch.parse.js.JSParseFilter"
+         contentType="application/x-javascript"
+         pathSuffix="js"/>
    </extension>
    <extension id="org.apache.nutch.parse.js.JSParseFilter"
               name="Parse JS Filter"
               point="org.apache.nutch.parse.HtmlParseFilter">
       <implementation id="JSParseFilter"
-	      class="org.apache.nutch.parse.js.JSParseFilter"
-	      contentType="application/x-javascript"
-	      pathSuffix=""/>
+         class="org.apache.nutch.parse.js.JSParseFilter"
+         contentType="application/x-javascript"
+         pathSuffix=""/>
    </extension>
 
 </plugin>

Modified: lucene/nutch/branches/mapred/src/plugin/parse-mp3/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-mp3/plugin.xml?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-mp3/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-mp3/plugin.xml Thu Sep  8 12:42:44 2005
@@ -1,13 +1,25 @@
 <?xml version = '1.0' encoding = 'UTF-8'?>
-<plugin version="1.0.0" provider-name="nutch.org" id="parse-mp3" name="MP3 Parse Plug-in" >
-  <extension-point id="org.apache.nutch.parse.Parser" name="Nutch Content Parser" />
-  <runtime>
-    <library name="parse-mp3.jar" >
-      <export name="*" />
-    </library>
-    <library name="jid3lib-0.5.1.jar" />
-  </runtime>
-  <extension point="org.apache.nutch.parse.Parser" id="org.apache.nutch.parse.mp3" name="MP3Parse" >
-    <implementation class="org.apache.nutch.parse.mp3.MP3Parser" pathSuffix="mp3" id="org.apache.nutch.parse.mp3.MP3Parser" contentType="audio/mpeg" />
-  </extension>
+<plugin
+   version="1.0.0"
+   provider-name="nutch.org"
+   id="parse-mp3"
+   name="MP3 Parse Plug-in">
+
+   <runtime>
+      <library name="parse-mp3.jar">
+         <export name="*"/>
+      </library>
+      <library name="jid3lib-0.5.1.jar"/>
+   </runtime>
+
+   <extension point="org.apache.nutch.parse.Parser"
+              id="org.apache.nutch.parse.mp3"
+              name="MP3Parse">
+
+      <implementation class="org.apache.nutch.parse.mp3.MP3Parser"
+                      pathSuffix="mp3"
+                      id="org.apache.nutch.parse.mp3.MP3Parser"
+                      contentType="audio/mpeg"/>
+   </extension>
+
 </plugin>

Modified: lucene/nutch/branches/mapred/src/plugin/parse-msword/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-msword/plugin.xml?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-msword/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-msword/plugin.xml Thu Sep  8 12:42:44 2005
@@ -5,16 +5,6 @@
    version="1.0.0"
    provider-name="nutch.org">
 
-   <extension-point
-      id="org.apache.nutch.parse.Parser"
-      name="Nutch Content Parser"/>
-
-   <!--
-   <extension-point
-      id="org.apache.nutch.parse.MSWordParseFilter"
-      name="MSWord Parse Filter"/>
-   -->
-
    <runtime>
       <library name="parse-msword.jar">
          <export name="*"/>

Modified: lucene/nutch/branches/mapred/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java Thu Sep  8 12:42:44 2005
@@ -24,6 +24,7 @@
 import org.apache.nutch.parse.ParseData;
 import org.apache.nutch.parse.ParseImpl;
 import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.OutlinkExtractor;
 import org.apache.nutch.parse.ParseException;
 
 import java.util.Properties;
@@ -117,7 +118,7 @@
       title = "";
 
     // collect outlink
-    Outlink[] outlinks = new Outlink[0];
+    Outlink[] outlinks = OutlinkExtractor.getOutlinks(text);
 
     ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metadata);
     return new ParseImpl(text, parseData);

Modified: lucene/nutch/branches/mapred/src/plugin/parse-pdf/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-pdf/plugin.xml?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-pdf/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-pdf/plugin.xml Thu Sep  8 12:42:44 2005
@@ -5,15 +5,6 @@
    version="1.0.0"
    provider-name="nutch.org">
 
-   <extension-point
-      id="org.apache.nutch.parse.Parser"
-      name="Nutch Content Parser"/>
-
-   <!--
-   <extension-point
-      id="org.apache.nutch.parse.PdfParseFilter"
-      name="PDF Parse Filter"/>
-   -->
 
    <runtime>
       <library name="parse-pdf.jar">

Modified: lucene/nutch/branches/mapred/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java Thu Sep  8 12:42:44 2005
@@ -33,6 +33,7 @@
 import org.apache.nutch.parse.ParseData;
 import org.apache.nutch.parse.ParseImpl;
 import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.OutlinkExtractor;
 import org.apache.nutch.parse.ParseException;
 
 import java.text.SimpleDateFormat;
@@ -161,7 +162,7 @@
       title = "";
 
     // collect outlink
-    Outlink[] outlinks = new Outlink[0];
+    Outlink[] outlinks = OutlinkExtractor.getOutlinks(text);
 
     // collect meta data
     Properties metadata = new Properties();

Modified: lucene/nutch/branches/mapred/src/plugin/parse-rtf/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-rtf/plugin.xml?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-rtf/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-rtf/plugin.xml Thu Sep  8 12:42:44 2005
@@ -1,13 +1,23 @@
 <?xml version = '1.0' encoding = 'UTF-8'?>
-<plugin version="1.0.0" provider-name="nutch.org" id="parse-rtf" name="RTF Parse Plug-in" >
-  <extension-point id="org.apache.nutch.parse.Parser" name="Nutch Content Parser" />
-  <runtime>
-    <library name="parse-rtf.jar" >
-      <export name="*" />
-    </library>
-    <library name="rtf-parser.jar"/>
-  </runtime>
-  <extension point="org.apache.nutch.parse.Parser" id="org.apache.nutch.parse.rtf" name="RTFParse" >
-    <implementation class="org.apache.nutch.parse.rtf.RTFParseFactory" pathSuffix="rtf" id="org.apache.nutch.parse.rtf.RTFParseFactory" contentType="application/rtf" />
-  </extension>
+<plugin
+   version="1.0.0"
+   provider-name="nutch.org"
+   id="parse-rtf"
+   name="RTF Parse Plug-in">
+
+   <runtime>
+      <library name="parse-rtf.jar">
+         <export name="*"/>
+      </library>
+      <library name="rtf-parser.jar"/>
+   </runtime>
+
+   <extension point="org.apache.nutch.parse.Parser"
+              id="org.apache.nutch.parse.rtf"
+              name="RTFParse">
+      <implementation class="org.apache.nutch.parse.rtf.RTFParseFactory"
+                      pathSuffix="rtf" id="org.apache.nutch.parse.rtf.RTFParseFactory"
+                      contentType="application/rtf"/>
+   </extension>
+
 </plugin>

Modified: lucene/nutch/branches/mapred/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java Thu Sep  8 12:42:44 2005
@@ -59,9 +59,12 @@
       title = "";
     }
 
-    ParseData parseData = new ParseData(title, new Outlink[0], metadata);
+    String text = delegate.getText();
 
-    return new ParseImpl(delegate.getText(), parseData);
+    return new ParseImpl(text, 
+                         new ParseData(title,
+                                       OutlinkExtractor.getOutlinks(text),
+                                       metadata));
   }
 
 

Modified: lucene/nutch/branches/mapred/src/plugin/parse-text/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-text/plugin.xml?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-text/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-text/plugin.xml Thu Sep  8 12:42:44 2005
@@ -5,9 +5,6 @@
    version="1.0.0"
    provider-name="nutch.org">
 
-   <extension-point
-      id="org.apache.nutch.parse.Parser"
-      name="Nutch Content Parser"/>
 
    <runtime>
       <library name="parse-text.jar">

Modified: lucene/nutch/branches/mapred/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java Thu Sep  8 12:42:44 2005
@@ -28,7 +28,7 @@
     Properties metadata = new Properties();
     metadata.putAll(content.getMetadata());
 
-    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", new Outlink[0], metadata);
+    //ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", new Outlink[0], metadata);
 
     String encoding =
       StringUtil.parseCharacterEncoding(content.getContentType());
@@ -45,6 +45,9 @@
       text = new String(content.getContent());    // use default encoding
     }
 
-    return new ParseImpl(text, parseData);
+    return new ParseImpl(text,
+                         new ParseData(ParseStatus.STATUS_SUCCESS, "",
+                                       OutlinkExtractor.getOutlinks(text),
+                                       metadata));
   }
 }

Modified: lucene/nutch/branches/mapred/src/plugin/protocol-file/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/protocol-file/plugin.xml?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/protocol-file/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/protocol-file/plugin.xml Thu Sep  8 12:42:44 2005
@@ -5,9 +5,6 @@
    version="1.0.0"
    provider-name="nutch.org">
 
-   <extension-point
-      id="org.apache.nutch.protocol.Protocol"
-      name="Nutch Protocol"/>
 
    <runtime>
       <library name="protocol-file.jar">

Modified: lucene/nutch/branches/mapred/src/plugin/protocol-ftp/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/protocol-ftp/plugin.xml?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/protocol-ftp/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/protocol-ftp/plugin.xml Thu Sep  8 12:42:44 2005
@@ -5,9 +5,7 @@
    version="1.0.0"
    provider-name="nutch.org">
 
-   <extension-point
-      id="org.apache.nutch.protocol.Protocol"
-      name="Nutch Protocol"/>
+
 
    <runtime>
       <library name="protocol-ftp.jar">

Modified: lucene/nutch/branches/mapred/src/plugin/protocol-http/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/protocol-http/plugin.xml?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/protocol-http/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/protocol-http/plugin.xml Thu Sep  8 12:42:44 2005
@@ -5,10 +5,6 @@
    version="1.0.0"
    provider-name="nutch.org">
 
-   <extension-point
-      id="org.apache.nutch.protocol.Protocol"
-      name="Nutch Protocol"/>
-
    <runtime>
       <library name="protocol-http.jar">
          <export name="*"/>

Modified: lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/plugin.xml?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/plugin.xml Thu Sep  8 12:42:44 2005
@@ -5,21 +5,16 @@
    version="1.0.0"
    provider-name="nutch.org">
 
-   <extension-point
-      id="org.apache.nutch.protocol.Protocol"
-      name="Nutch Protocol"/>
-
    <runtime>
       <library name="protocol-httpclient.jar">
          <export name="*"/>
       </library>
       <library name="commons-codec.jar" />
       <library name="commons-httpclient-3.0-rc2.jar" />
-      
    </runtime>
 
    <extension id="org.apache.nutch.protocol.httpclient"
-	   name="HttpProtocol"
+              name="HttpProtocol"
               point="org.apache.nutch.protocol.Protocol">
 
       <implementation id="org.apache.nutch.protocol.httpclient.Http"

Modified: lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java Thu Sep  8 12:42:44 2005
@@ -18,14 +18,14 @@
 import org.apache.commons.httpclient.HttpClientError;
 import org.apache.commons.httpclient.params.HttpConnectionParams;
 import org.apache.commons.httpclient.protocol.ControllerThreadSocketFactory;
-import org.apache.commons.httpclient.protocol.SecureProtocolSocketFactory;
+import org.apache.commons.httpclient.protocol.ProtocolSocketFactory;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 
 import com.sun.net.ssl.SSLContext;
 import com.sun.net.ssl.TrustManager;
 
-public class DummySSLProtocolSocketFactory implements SecureProtocolSocketFactory {
+public class DummySSLProtocolSocketFactory implements ProtocolSocketFactory {
 
   /** Log object for this class. */
   private static final Log LOG = LogFactory.getLog(DummySSLProtocolSocketFactory.class);
@@ -58,7 +58,7 @@
   }
 
   /**
-   * @see SecureProtocolSocketFactory#createSocket(java.lang.String,int,java.net.InetAddress,int)
+   * @see org.apache.commons.httpclient.protocol.SecureProtocolSocketFactory#createSocket(String,int,InetAddress,int)
    */
   public Socket createSocket(String host, int port, InetAddress clientHost, int clientPort) throws IOException,
           UnknownHostException {
@@ -79,8 +79,8 @@
    * 
    * @param host the host name/IP
    * @param port the port on the host
-   * @param clientHost the local host name/IP to bind the socket to
-   * @param clientPort the port on the local machine
+   * @param localAddress the local host name/IP to bind the socket to
+   * @param localPort the port on the local machine
    * @param params {@link HttpConnectionParams Http connection parameters}
    * 
    * @return Socket a new socket
@@ -104,14 +104,14 @@
   }
 
   /**
-   * @see SecureProtocolSocketFactory#createSocket(java.lang.String,int)
+   * @see org.apache.commons.httpclient.protocol.SecureProtocolSocketFactory#createSocket(String,int)
    */
   public Socket createSocket(String host, int port) throws IOException, UnknownHostException {
     return getSSLContext().getSocketFactory().createSocket(host, port);
   }
 
   /**
-   * @see SecureProtocolSocketFactory#createSocket(java.net.Socket,java.lang.String,int,boolean)
+   * @see org.apache.commons.httpclient.protocol.SecureProtocolSocketFactory#createSocket(Socket,String,int,boolean)
    */
   public Socket createSocket(Socket socket, String host, int port, boolean autoClose) throws IOException,
           UnknownHostException {

Modified: lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java Thu Sep  8 12:42:44 2005
@@ -9,18 +9,16 @@
 import java.net.UnknownHostException;
 import java.util.HashMap;
 import java.util.LinkedList;
-import java.util.Properties;
 import java.util.logging.Level;
 import java.util.logging.Logger;
 
-import org.apache.commons.httpclient.Cookie;
 import org.apache.commons.httpclient.Credentials;
 import org.apache.commons.httpclient.HostConfiguration;
 import org.apache.commons.httpclient.HttpClient;
-import org.apache.commons.httpclient.HttpState;
 import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
 import org.apache.commons.httpclient.NTCredentials;
-import org.apache.commons.httpclient.params.HttpConnectionParams;
+import org.apache.commons.httpclient.auth.AuthScope;
+import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
 import org.apache.commons.httpclient.protocol.Protocol;
 import org.apache.nutch.db.Page;
 import org.apache.nutch.pagedb.FetchListEntry;
@@ -47,7 +45,6 @@
   }
 
   static final int BUFFER_SIZE = 8 * 1024;
-  private static final int MAX_REDIRECTS = NutchConf.get().getInt("http.redirect.max", 3);
   private static MultiThreadedHttpConnectionManager connectionManager = new MultiThreadedHttpConnectionManager();
   private static HttpClient client;
 
@@ -102,8 +99,6 @@
    */
   private static LinkedList BLOCKED_ADDR_QUEUE = new LinkedList();
 
-  private RobotRulesParser robotRules = new RobotRulesParser();
-
   private static InetAddress blockAddr(URL url) throws ProtocolException {
     InetAddress addr;
     try {
@@ -183,7 +178,6 @@
   }
 
   public ProtocolOutput getProtocolOutput(String urlString) {
-    ProtocolOutput output = null;
     try {
       return getProtocolOutput(new FetchListEntry(true, new Page(urlString, 1.0f), new String[0]));
     } catch (MalformedURLException mue) {
@@ -196,9 +190,6 @@
     try {
       URL url = new URL(urlString);
 
-      int redirects = 0;
-      HttpAuthentication auth = null;
-      while (true) {
         try {
           if (!RobotRulesParser.isAllowed(url))
                   return new ProtocolOutput(null, new ProtocolStatus(ProtocolStatus.ROBOTS_DENIED, url));
@@ -210,7 +201,7 @@
         InetAddress addr = blockAddr(url);
         HttpResponse response;
         try {
-          response = new HttpResponse(urlString, url); // make a request
+          response = new HttpResponse(url); // make a request
         } finally {
           unblockAddr(addr);
         }
@@ -255,19 +246,10 @@
         } else if (code == 400) { // bad request, mark as GONE
           LOG.fine("400 Bad request: " + url);
           return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, url));
-        } else if (code == 401) { // requires authorization
+        } else if (code == 401) { // requires authorization, but no valid auth provided.
           LOG.fine("401 Authentication Required");
-          if (redirects == MAX_REDIRECTS)
-                  return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.REDIR_EXCEEDED,
-                          "Too many redirects: " + urlString));
-          Properties p = c.getMetadata();
-          if (p instanceof MultiProperties) {
-            auth = HttpAuthenticationFactory.findAuthentication((MultiProperties) p);
-          } else {
-            return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.ACCESS_DENIED, "Authorization required: "
-                    + urlString));
-          }
-          redirects++;
+          return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.ACCESS_DENIED, "Authentication required: "
+                  + urlString));
         } else if (code == 404) {
           return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.NOTFOUND, url));
         } else if (code == 410) { // permanently GONE
@@ -276,7 +258,6 @@
           return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.EXCEPTION, "Http code=" + code + ", url="
                   + url));
         }
-      }
     } catch (Throwable e) {
       e.printStackTrace();
       return new ProtocolOutput(null, new ProtocolStatus(e));
@@ -371,55 +352,33 @@
     // get a client isntance -- we just need one.
 
     client = new HttpClient(connectionManager);
-    // this is just to add logging, whenever cookies are added.
-    client.setState(new NutchHttpState());
 
     // Set up an HTTPS socket factory that accepts self-signed certs.
     Protocol dummyhttps = new Protocol("https", new DummySSLProtocolSocketFactory(), 443);
     Protocol.registerProtocol("https", dummyhttps);
     
-    // set up the connection manager
-    // hardcoded for now
-
-    connectionManager.setMaxTotalConnections(MAX_THREADS_TOTAL);
-    //if (MAX_THREADS_TOTAL > MAX_THREADS_PER_HOST) {
-    //  connectionManager.setMaxConnectionsPerHost(MAX_THREADS_PER_HOST);
-    //} else {
-    //  connectionManager.setMaxConnectionsPerHost(MAX_THREADS_TOTAL);
-    //}
-
-    HttpConnectionParams params = connectionManager.getParams();
+    HttpConnectionManagerParams params = connectionManager.getParams();
     params.setConnectionTimeout(TIMEOUT);
     params.setSoTimeout(TIMEOUT);
     params.setSendBufferSize(BUFFER_SIZE);
     params.setReceiveBufferSize(BUFFER_SIZE);
+    params.setMaxTotalConnections(MAX_THREADS_TOTAL);
+    if (MAX_THREADS_TOTAL > MAX_THREADS_PER_HOST) {
+      params.setDefaultMaxConnectionsPerHost(MAX_THREADS_PER_HOST);
+    } else {
+      params.setDefaultMaxConnectionsPerHost(MAX_THREADS_TOTAL);
+    }
+
     HostConfiguration hostConf = client.getHostConfiguration();
     if (PROXY) {
       hostConf.setProxy(PROXY_HOST, PROXY_PORT);
     }
     if (NTLM_USERNAME.length() > 0) {
       Credentials ntCreds = new NTCredentials(NTLM_USERNAME, NTLM_PASSWORD, NTLM_HOST, NTLM_DOMAIN);
-      client.getState().setCredentials(null, null, ntCreds);
+      client.getState().setCredentials(new AuthScope(NTLM_HOST, AuthScope.ANY_PORT), ntCreds);
 
       LOG.info("Added NTLM credentials for " + NTLM_USERNAME);
     }
     LOG.info("Configured Client");
   }
-}
-
-class NutchHttpState extends HttpState {
-  public static final Logger LOG = LogFormatter.getLogger("org.apache.nutch.net.Http.NutchHttpState");
-  
-  public void addCookie(Cookie cookie) {
-    LOG.fine(" - setting cookie: " + cookie);
-    super.addCookie(cookie);
-  }
-  
-  public void addCookies(Cookie[] cookies) {
-    LOG.fine(" - setting cookies: ");
-    for (int i = 0; i < cookies.length; i++)
-      LOG.fine("   cookie: " + cookies[i]);
-    
-    super.addCookies(cookies);
-  }
-}
+}
\ No newline at end of file

Modified: lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java Thu Sep  8 12:42:44 2005
@@ -4,24 +4,34 @@
 package org.apache.nutch.protocol.httpclient;
 
 import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.util.NutchConf;
+import org.apache.nutch.util.mime.MimeType;
+import org.apache.nutch.util.mime.MimeTypes;
 
 import org.apache.commons.httpclient.Header;
+import org.apache.commons.httpclient.HttpVersion;
 
+import org.apache.commons.httpclient.cookie.CookiePolicy;
 import org.apache.commons.httpclient.methods.GetMethod;
+import org.apache.commons.httpclient.params.HttpMethodParams;
 
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URL;
-import java.util.Properties;
-import java.util.List;
-import java.util.ListIterator;
 
 /**
  * An HTTP response.
  */
 public class HttpResponse {
+  /** A flag that tells if magic resolution must be performed */
+  private final static boolean MAGIC =
+        NutchConf.get().getBoolean("mime.type.magic", true);
+
+  /** Get the MimeTypes resolver instance. */
+  private final static MimeTypes MIME = 
+        MimeTypes.get(NutchConf.get().get("mime.types.file"));
+
   private String orig;
 
   private String base;
@@ -54,24 +64,40 @@
 
   public Content toContent() {
     String contentType = getHeader("Content-Type");
-    if (contentType == null) contentType = "";
+    if (contentType == null) {
+      MimeType type = null;
+      if (MAGIC) {
+        type = MIME.getMimeType(orig, content);
+      } else {
+        type = MIME.getMimeType(orig);
+      }
+      if (type != null) {
+          contentType = type.getName();
+      } else {
+          contentType = "";
+      }
+    }
     if (content == null) content = EMPTY_CONTENT;
     return new Content(orig, base, content, contentType, headers);
   }
 
-  public HttpResponse(URL url) throws ProtocolException, IOException {
-    this(url.toString(), url);
-  }
-
-  public HttpResponse(String orig, URL url) throws IOException {
-    this.orig = orig;
+  public HttpResponse(URL url) throws IOException {
     this.base = url.toString();
-    GetMethod get = new GetMethod(url.toString());
+    this.orig = url.toString();
+    GetMethod get = new GetMethod(this.orig);
     get.setFollowRedirects(false);
-    get.setStrictMode(false);
     get.setRequestHeader("User-Agent", Http.AGENT_STRING);
-    get.setHttp11(false);
-    get.setMethodRetryHandler(null);
+    HttpMethodParams params = get.getParams();
+    // some servers cannot digest the new protocol
+    params.setVersion(HttpVersion.HTTP_1_0);
+    params.makeLenient();
+    params.setContentCharset("UTF-8");
+    params.setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);
+    params.setBooleanParameter(HttpMethodParams.SINGLE_COOKIE_HEADER, true);
+    // XXX (ab) not sure about this... the default is to retry 3 times; if
+    // XXX the request body was sent the method is not retried, so there is
+    // XXX little danger in retrying...
+    // params.setParameter(HttpMethodParams.RETRY_HANDLER, null);
     try {
       code = Http.getClient().executeMethod(get);
 
@@ -103,6 +129,7 @@
       }
     } catch (org.apache.commons.httpclient.ProtocolException pe) {
       pe.printStackTrace();
+      get.releaseConnection();
       throw new IOException(pe.toString());
     } finally {
       get.releaseConnection();

Modified: lucene/nutch/branches/mapred/src/plugin/query-basic/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/query-basic/plugin.xml?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/query-basic/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/query-basic/plugin.xml Thu Sep  8 12:42:44 2005
@@ -5,9 +5,7 @@
    version="1.0.0"
    provider-name="nutch.org">
 
-   <extension-point
-      id="org.apache.nutch.searcher.QueryFilter"
-      name="Nutch Query Filter"/>
+
 
    <runtime>
       <library name="query-basic.jar">

Modified: lucene/nutch/branches/mapred/src/plugin/query-more/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/query-more/plugin.xml?rev=279605&r1=279604&r2=279605&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/query-more/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/query-more/plugin.xml Thu Sep  8 12:42:44 2005
@@ -5,9 +5,7 @@
    version="1.0.0"
    provider-name="nutch.org">
 
-   <extension-point
-      id="org.apache.nutch.searcher.QueryFilter"
-      name="Nutch Query Filter"/>
+
 
    <runtime>
       <library name="query-more.jar">