You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by je...@apache.org on 2005/08/27 00:47:12 UTC

svn commit: r240359 - in /lucene/nutch/trunk/src: java/org/apache/nutch/analysis/ java/org/apache/nutch/indexer/ plugin/nutch-extensionpoints/

Author: jerome
Date: Fri Aug 26 15:47:04 2005
New Revision: 240359

URL: http://svn.apache.org/viewcvs?rev=240359&view=rev
Log:
Add an analysis extension point

Added:
    lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java   (with props)
    lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java   (with props)
Modified:
    lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java
    lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java
    lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml

Added: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java?rev=240359&view=auto
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java Fri Aug 26 15:47:04 2005
@@ -0,0 +1,107 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.analysis;
+
+// JDK imports
+import java.util.HashMap;
+import java.util.Map;
+import java.util.logging.Logger;
+
+// Nutch imports
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.ExtensionPoint;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.plugin.PluginRuntimeException;
+import org.apache.nutch.util.LogFormatter;
+
+
+/**
+ * Creates {@link NutchAnalyzer} plugins, caching the per-language lookup.
+ *
+ * @author Jérôme Charron
+ */
+public class AnalyzerFactory {
+  public final static Logger LOG =
+          LogFormatter.getLogger(AnalyzerFactory.class.getName());
+
+  private final static ExtensionPoint X_POINT =
+          PluginRepository.getInstance()
+                          .getExtensionPoint(NutchAnalyzer.X_POINT_ID);
+
+  /** Maps a language code to its Extension, or to null when none matches. */
+  private final static Map CACHE = new HashMap();
+
+  private final static NutchAnalyzer DEFAULT_ANALYZER =
+                                            new NutchDocumentAnalyzer();
+
+  static {
+    if (X_POINT == null) {
+      throw new RuntimeException("x point " + NutchAnalyzer.X_POINT_ID +
+                                 " not found.");
+    }
+  }
+
+  private AnalyzerFactory() {}
+
+  /**
+   * Returns the appropriate {@link NutchAnalyzer} implementation given a
+   * language code.
+   *
+   * <p>NutchAnalyzer extensions should define the attribute "lang". The first
+   * plugin found whose "lang" attribute equals the specified lang parameter is
+   * used. If none match, or if the matching plugin cannot be instantiated,
+   * then the {@link NutchDocumentAnalyzer} is used.
+   *
+   * @param lang the language code; may be null
+   * @return the analyzer registered for lang, or the default analyzer
+   */
+  public static NutchAnalyzer get(String lang) {
+    Extension extension = getExtension(lang);
+    if (extension == null) {
+      return DEFAULT_ANALYZER;
+    }
+    try {
+      return (NutchAnalyzer) extension.getExtensionInstance();
+    } catch (PluginRuntimeException pre) {
+      // Fall back to the default analyzer, but report the failure.
+      LOG.warning("Can't instantiate analyzer for " + lang + ": " + pre);
+      return DEFAULT_ANALYZER;
+    }
+  }
+
+  /** Returns the cached extension for lang, searching for it on first use. */
+  private static Extension getExtension(String lang) {
+    // containsKey, not a null check on get(): a cached "no match" is stored as
+    // null and must not trigger a new search on every call.
+    if (!CACHE.containsKey(lang)) {
+      CACHE.put(lang, findExtension(lang));
+    }
+    return (Extension) CACHE.get(lang);
+  }
+
+  /** Returns the first extension whose "lang" attribute is lang, or null. */
+  private static Extension findExtension(String lang) {
+    if (lang != null) {
+      Extension[] extensions = X_POINT.getExtentens();
+      for (int i=0; i<extensions.length; i++) {
+        if (lang.equals(extensions[i].getAttribute("lang"))) {
+          return extensions[i];
+        }
+      }
+    }
+    return null;
+  }
+}

Propchange: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java?rev=240359&view=auto
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java Fri Aug 26 15:47:04 2005
@@ -0,0 +1,45 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.analysis;
+
+// JDK imports
+import java.io.Reader;
+
+// Lucene imports
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+
+
+/** 
+ * Extension point for analysis.
+ * {@link AnalyzerFactory} picks one implementation per language code, by
+ * matching the "lang" attribute declared by the plugin extension.
+ *
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public abstract class NutchAnalyzer extends Analyzer {
+
+  /** The id of the extension point: this class's fully qualified name. */
+  final static String X_POINT_ID = NutchAnalyzer.class.getName();
+
+  
+  /**
+   * Creates a TokenStream which tokenizes all the text in the provided Reader.
+   */
+  public abstract TokenStream tokenStream(String fieldName, Reader reader);
+
+  
+}

Propchange: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java?rev=240359&r1=240358&r2=240359&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java Fri Aug 26 15:47:04 2005
@@ -13,20 +13,25 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.analysis;
 
+// JDK imports
+import java.io.Reader;
+import java.io.IOException;
+
+// Lucene imports
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Token;
-import java.io.Reader;
-import java.io.IOException;
 
-/** The analyzer used for Nutch documents.  Uses the JavaCC-defined lexical
- * analyzer {@link NutchDocumentTokenizer}, with no stop list.  This keeps it
- * consistent with query parsing. */
-public class NutchDocumentAnalyzer extends Analyzer {
+
+/**
+ * The analyzer used for Nutch documents.
+ * Uses the JavaCC-defined lexical analyzer {@link NutchDocumentTokenizer},
+ * with no stop list.  This keeps it consistent with query parsing.
+ */
+public class NutchDocumentAnalyzer extends NutchAnalyzer {
 
   /** Analyzer used to index textual content. */
   private static class ContentAnalyzer extends Analyzer {

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java?rev=240359&r1=240358&r2=240359&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java Fri Aug 26 15:47:04 2005
@@ -16,25 +16,20 @@
 
 package org.apache.nutch.indexer;
 
-import org.apache.nutch.pagedb.*;
-import org.apache.nutch.linkdb.*;
 import org.apache.nutch.fetcher.*;
 import org.apache.nutch.parse.*;
 import org.apache.nutch.analysis.NutchDocumentAnalyzer;
-import org.apache.nutch.db.*;
-import org.apache.nutch.io.*;
 import org.apache.nutch.fs.*;
 import org.apache.nutch.segment.SegmentReader;
 import org.apache.nutch.util.*;
-
-import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 
 import java.util.logging.*;
-import java.util.*;
 import java.io.*;
+import org.apache.nutch.analysis.AnalyzerFactory;
+import org.apache.nutch.analysis.NutchAnalyzer;
 
 /** Creates an index for the output corresponding to a single fetcher run. */
 public class IndexSegment {
@@ -149,7 +144,11 @@
               doc = IndexingFilters.filter(doc, parse, fetcherOutput);
     
               // add the document to the index
-              writer.addDocument(doc);
+              NutchAnalyzer analyzer = AnalyzerFactory.get(doc.get("lang"));
+              LOG.info(" Indexing [" + doc.getField("url").stringValue() +
+                       "] with analyzer " + analyzer + " (" + doc.getField("lang").stringValue() + ")");
+              //LOG.info(" Doc is " + doc);
+              writer.addDocument(doc, analyzer);
               if (count > 0 && count % LOG_STEP == 0) {
                 curTime = System.currentTimeMillis();
                 LOG.info(" Processed " + count + " records (" +

Modified: lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml?rev=240359&r1=240358&r2=240359&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml Fri Aug 26 15:47:04 2005
@@ -40,5 +40,8 @@
       id="org.apache.nutch.net.URLFilter"
       name="Nutch URL Filter"/>
 
+<extension-point
+      id="org.apache.nutch.analysis.NutchAnalyzer"
+      name="Nutch Analysis"/>
 
 </plugin>



Re: [Nutch-cvs] svn commit: r240359 - in /lucene/nutch/trunk/src: java/org/apache/nutch/analysis/ java/org/apache/nutch/indexer/ plugin/nutch-extensionpoints/

Posted by Jérôme Charron <je...@gmail.com>.
> I see several instances of 'analySer' in comments/javadoc and some
> > variables. That should probably be changed to american version - 
> > analyzer, for consistency's sake.
> 
> Corrected/Committed
(http://svn.apache.org/viewcvs.cgi?rev=265020&view=rev)

Regards

Jérôme

-- 
http://motrech.free.fr/
http://www.frutch.org/

Re: [Nutch-cvs] svn commit: r240359 - in /lucene/nutch/trunk/src: java/org/apache/nutch/analysis/ java/org/apache/nutch/indexer/ plugin/nutch-extensionpoints/

Posted by Jérôme Charron <je...@gmail.com>.
> 
> I see several instances of 'analySer' in comments/javadoc and some
> variables. That should probably be changed to american version -
> analyzer, for consistency's sake.

Yes, that's right.
Thanks.
 Jérôme


-- 
http://motrech.free.fr/
http://www.frutch.org/

Re: [Nutch-cvs] svn commit: r240359 - in /lucene/nutch/trunk/src: java/org/apache/nutch/analysis/ java/org/apache/nutch/indexer/ plugin/nutch-extensionpoints/

Posted by og...@yahoo.com.
I see several instances of 'analySer' in comments/javadoc and some
variables.  That should probably be changed to american version -
analyzer, for consistency's sake.

Otis


--- jerome@apache.org wrote:

> Author: jerome
> Date: Fri Aug 26 15:47:04 2005
> New Revision: 240359
> 
> URL: http://svn.apache.org/viewcvs?rev=240359&view=rev
> Log:
> Add an analysis extension point
> 
> Added:
>    
>
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java
>   (with props)
>    
>
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java
>   (with props)
> Modified:
>    
>
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java
>    
>
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java
>     lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml
> 
> Added:
>
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java
> URL:
>
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java?rev=240359&view=auto
>
==============================================================================
> ---
>
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java
> (added)
> +++
>
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java
> Fri Aug 26 15:47:04 2005
> @@ -0,0 +1,107 @@
> +/**
> + * Copyright 2005 The Apache Software Foundation
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing,
> software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> implied.
> + * See the License for the specific language governing permissions
> and
> + * limitations under the License.
> + */
> +package org.apache.nutch.analysis;
> +
> +// JDK imports
> +import java.util.HashMap;
> +import java.util.Map;
> +import java.util.logging.Logger;
> +
> +// Nutch imports
> +import org.apache.nutch.plugin.Extension;
> +import org.apache.nutch.plugin.ExtensionPoint;
> +import org.apache.nutch.plugin.PluginRepository;
> +import org.apache.nutch.plugin.PluginRuntimeException;
> +import org.apache.nutch.util.LogFormatter;
> +
> +
> +/**
> + * Creates and caches {@link NutchAnalyzer} plugins.
> + *
> + * @author J&eacute;r&ocirc;me Charron
> + */
> +public class AnalyzerFactory {
> +
> +  public final static Logger LOG =
> +          LogFormatter.getLogger(AnalyzerFactory.class.getName());
> +
> +  private final static ExtensionPoint X_POINT = 
> +          PluginRepository.getInstance()
> +                         
> .getExtensionPoint(NutchAnalyzer.X_POINT_ID);
> +
> +  private final static Map CACHE = new HashMap();
> +
> +  private final static NutchAnalyzer DEFAULT_ANALYSER = 
> +                                            new
> NutchDocumentAnalyzer();
> +  
> +  
> +  static {
> +    if (X_POINT == null) {
> +      throw new RuntimeException("x point " +
> NutchAnalyzer.X_POINT_ID +
> +                                 " not found.");
> +    }
> +  }
> +
> +
> +  private AnalyzerFactory() {}
> +
> +  
> +  /**
> +   * Returns the appropriate {@link Analyser} implementation given a
> language
> +   * code.
> +   *
> +   * <p>NutchAnalyser extensions should define the attribute "lang".
> The first
> +   * plugin found whose "lang" attribute equals the specified lang
> parameter is
> +   * used. If none match, then the {@link NutchDocumentAnalyzer} is
> used.
> +   */
> +  public static NutchAnalyzer get(String lang) {
> +
> +    NutchAnalyzer analyzer = DEFAULT_ANALYSER;
> +    Extension extension = getExtension(lang);
> +    if (extension != null) {
> +        try {
> +            analyzer = (NutchAnalyzer)
> extension.getExtensionInstance();
> +        } catch (PluginRuntimeException pre) {
> +            analyzer = DEFAULT_ANALYSER;
> +        }
> +    }
> +    return analyzer;
> +  }
> +
> +  private static Extension getExtension(String lang) {
> +
> +    Extension extension = (Extension) CACHE.get(lang);
> +    if (extension == null) {
> +      extension = findExtension(lang);
> +      CACHE.put(lang, extension);
> +    }
> +    return extension;
> +  }
> +
> +  private static Extension findExtension(String lang) {
> +
> +    if (lang != null) {
> +      Extension[] extensions = X_POINT.getExtentens();
> +      for (int i=0; i<extensions.length; i++) {
> +        if (lang.equals(extensions[i].getAttribute("lang"))) {
> +          return extensions[i];
> +        }
> +      }
> +    }
> +    return null;
> +  }
> +
> +}
> 
> Propchange:
>
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java
>
------------------------------------------------------------------------------
>     svn:eol-style = native
> 
> Added:
>
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java
> URL:
>
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java?rev=240359&view=auto
>
==============================================================================
> ---
>
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java
> (added)
> +++
>
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java
> Fri Aug 26 15:47:04 2005
> @@ -0,0 +1,45 @@
> +/**
> + * Copyright 2005 The Apache Software Foundation
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing,
> software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> implied.
> + * See the License for the specific language governing permissions
> and
> + * limitations under the License.
> + */
> +package org.apache.nutch.analysis;
> +
> +// JDK imports
> +import java.io.Reader;
> +
> +// Lucene imports
> +import org.apache.lucene.analysis.Analyzer;
> +import org.apache.lucene.analysis.TokenStream;
> +
> +
> +/** 
> + * Extension point for analysis.
> + * All plugins found which implement this extension point are run
> + * sequentially on the parse.
> + *
> + * @author J&eacute;r&ocirc;me Charron
> + */
> +public abstract class NutchAnalyzer extends Analyzer {
> +
> +  /** The name of the extension point. */
> +  final static String X_POINT_ID = NutchAnalyzer.class.getName();
> +
> +  
> +  /**
> +   * Creates a TokenStream which tokenizes all the text in the
> provided Reader.
> +   */
> +  public abstract TokenStream tokenStream(String fieldName, Reader
> reader);
> +
> +  
> +}
> 
> Propchange:
>
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalyzer.java
>
------------------------------------------------------------------------------
>     svn:eol-style = native
> 
> Modified:
>
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java
> URL:
>
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java?rev=240359&r1=240358&r2=240359&view=diff
>
==============================================================================
> ---
>
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java
> (original)
> +++
>
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java
> Fri Aug 26 15:47:04 2005
> @@ -13,20 +13,25 @@
>   * See the License for the specific language governing permissions
> and
>   * limitations under the License.
>   */
> -
>  package org.apache.nutch.analysis;
>  
> +// JDK imports
> +import java.io.Reader;
> +import java.io.IOException;
> +
> +// Lucene imports
>  import org.apache.lucene.analysis.Analyzer;
>  import org.apache.lucene.analysis.TokenFilter;
>  import org.apache.lucene.analysis.TokenStream;
>  import org.apache.lucene.analysis.Token;
> -import java.io.Reader;
> -import java.io.IOException;
>  
> -/** The analyzer used for Nutch documents.  Uses the JavaCC-defined
> lexical
> - * analyzer {@link NutchDocumentTokenizer}, with no stop list.  This
> keeps it
> - * consistent with query parsing. */
> -public class NutchDocumentAnalyzer extends Analyzer {
> +
> +/**
> + * The analyzer used for Nutch documents.
> + * Uses the JavaCC-defined lexical analyzer {@link
> NutchDocumentTokenizer},
> + * with no stop list.  This keeps it consistent with query parsing.
> + */
> +public class NutchDocumentAnalyzer extends NutchAnalyzer {
>  
>    /** Analyzer used to index textual content. */
>    private static class ContentAnalyzer extends Analyzer {
> 
> Modified:
>
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java
> URL:
>
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java?rev=240359&r1=240358&r2=240359&view=diff
>
==============================================================================
> ---
>
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java
> (original)
> +++
>
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java
> Fri Aug 26 15:47:04 2005
> @@ -16,25 +16,20 @@
>  
>  package org.apache.nutch.indexer;
>  
> -import org.apache.nutch.pagedb.*;
> -import org.apache.nutch.linkdb.*;
>  import org.apache.nutch.fetcher.*;
>  import org.apache.nutch.parse.*;
>  import org.apache.nutch.analysis.NutchDocumentAnalyzer;
> -import org.apache.nutch.db.*;
> -import org.apache.nutch.io.*;
>  import org.apache.nutch.fs.*;
>  import org.apache.nutch.segment.SegmentReader;
>  import org.apache.nutch.util.*;
> -
> -import org.apache.lucene.index.IndexReader;
>  import org.apache.lucene.index.IndexWriter;
>  import org.apache.lucene.document.Document;
>  import org.apache.lucene.document.Field;
>  
>  import java.util.logging.*;
> -import java.util.*;
>  import java.io.*;
> +import org.apache.nutch.analysis.AnalyzerFactory;
> +import org.apache.nutch.analysis.NutchAnalyzer;
>  
>  /** Creates an index for the output corresponding to a single
> fetcher run. */
>  public class IndexSegment {
> @@ -149,7 +144,11 @@
>                doc = IndexingFilters.filter(doc, parse,
> fetcherOutput);
>      
>                // add the document to the index
> -              writer.addDocument(doc);
> +              NutchAnalyzer analyzer =
> AnalyzerFactory.get(doc.get("lang"));
> +              LOG.info(" Indexing [" +
> doc.getField("url").stringValue() +
> +                       "] with analyzer " + analyzer + " (" +
> doc.getField("lang").stringValue() + ")");
> +              //LOG.info(" Doc is " + doc);
> +              writer.addDocument(doc, analyzer);
>                if (count > 0 && count % LOG_STEP == 0) {
>                  curTime = System.currentTimeMillis();
>                  LOG.info(" Processed " + count + " records (" +
> 
> Modified:
> lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml
> URL:
>
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml?rev=240359&r1=240358&r2=240359&view=diff
>
==============================================================================
> --- lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml
> (original)
> +++ lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml
> Fri Aug 26 15:47:04 2005
> @@ -40,5 +40,8 @@
>        id="org.apache.nutch.net.URLFilter"
>        name="Nutch URL Filter"/>
>  
> +<extension-point
> +      id="org.apache.nutch.analysis.NutchAnalyzer"
> +      name="Nutch Analysis"/>
>  
>  </plugin>
> 
> 
> 
> 
> -------------------------------------------------------
> SF.Net email is Sponsored by the Better Software Conference & EXPO
> September 19-22, 2005 * San Francisco, CA * Development Lifecycle
> Practices
> Agile & Plan-Driven Development * Managing Projects & Teams * Testing
> & QA
> Security * Process Improvement & Measurement *
> http://www.sqe.com/bsce5sf
> _______________________________________________
> Nutch-cvs mailing list
> Nutch-cvs@lists.sourceforge.net
> https://lists.sourceforge.net/lists/listinfo/nutch-cvs
>