You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jackrabbit.apache.org by mr...@apache.org on 2007/09/07 12:09:26 UTC
svn commit: r573526 - in /jackrabbit/trunk/jackrabbit-core/src/main: java/org/apache/jackrabbit/core/query/lucene/ resources/org/apache/jackrabbit/core/query/lucene/

Author: mreutegg
Date: Fri Sep  7 03:09:24 2007
New Revision: 573526

URL: http://svn.apache.org/viewvc?rev=573526&view=rev
Log:
JCR-1079: Extend the IndexingConfiguration to allow configuration of reuseable analyzers

Added:
    jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/JackrabbitAnalyzer.java   (with props)
Modified:
    jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/IndexingConfiguration.java
    jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/IndexingConfigurationImpl.java
    jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/SearchIndex.java
    jackrabbit/trunk/jackrabbit-core/src/main/resources/org/apache/jackrabbit/core/query/lucene/indexing-configuration-1.0.dtd

Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/IndexingConfiguration.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/IndexingConfiguration.java?rev=573526&r1=573525&r2=573526&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/IndexingConfiguration.java (original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/IndexingConfiguration.java Fri Sep  7 03:09:24 2007
@@ -19,6 +19,7 @@
 import org.apache.jackrabbit.core.state.NodeState;
 import org.apache.jackrabbit.core.query.QueryHandlerContext;
 import org.apache.jackrabbit.name.QName;
+import org.apache.lucene.analysis.Analyzer;
 import org.w3c.dom.Element;
 
 /**
@@ -39,9 +40,10 @@
      *
      * @param config the document element of the configuration DOM.
      * @param context the context of the query handler.
+     * @param namespaceMappings the namespace mappings.
      * @throws Exception if initialization fails.
      */
-    public void init(Element config, QueryHandlerContext context) throws Exception;
+    public void init(Element config, QueryHandlerContext context, NamespaceMappings namespaceMappings) throws Exception;
 
     /**
      * Returns the configured indexing aggregate rules or <code>null</code> if
@@ -92,4 +94,19 @@
      * @return the boost for the node scope fulltext index field.
      */
     float getNodeBoost(NodeState state);
+    
+    /**
+     * Returns the analyzer configured for the property with this fieldName
+     * (the string representation, i.e. the JCR-style name, of the property's
+     * <code>QName</code> prefixed with <code>FieldNames.FULLTEXT_PREFIX</code>),
+     * or <code>null</code> if none is configured or the configured analyzer
+     * cannot be found. If <code>null</code> is returned, the default Analyzer
+     * is used.
+     *
+     * @param fieldName the string representation (JCR-style name) of the property's
+     * <code>QName</code>, prefixed with <code>FieldNames.FULLTEXT_PREFIX</code>
+     * @return the <code>analyzer</code> to use for indexing this property
+     */
+    Analyzer getPropertyAnalyzer(String fieldName);
+    
 }

Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/IndexingConfigurationImpl.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/IndexingConfigurationImpl.java?rev=573526&r1=573525&r2=573526&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/IndexingConfigurationImpl.java (original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/IndexingConfigurationImpl.java Fri Sep  7 03:09:24 2007
@@ -38,7 +38,10 @@
 import org.apache.jackrabbit.core.query.QueryHandlerContext;
 import org.apache.jackrabbit.core.value.InternalValue;
 import org.apache.jackrabbit.util.ISO9075;
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.commons.collections.iterators.AbstractIteratorDecorator;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 import org.w3c.dom.CharacterData;
 import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;
@@ -63,6 +66,11 @@
 public class IndexingConfigurationImpl implements IndexingConfiguration {
 
     /**
+     * The logger instance for this class
+     */
+    private static final Logger log = LoggerFactory.getLogger(IndexingConfigurationImpl.class);
+    
+    /**
      * A namespace resolver for parsing QNames in the configuration.
      */
     private NamespaceResolver nsResolver;
@@ -93,9 +101,14 @@
     private AggregateRule[] aggregateRules;
 
     /**
+     * The configured analyzers for indexing properties.
+     */
+    private Map analyzers = new HashMap();
+    
+    /**
      * {@inheritDoc}
      */
-    public void init(Element config, QueryHandlerContext context) throws Exception {
+    public void init(Element config, QueryHandlerContext context, NamespaceMappings nsMappings) throws Exception {
         ntReg = context.getNodeTypeRegistry();
         ism = context.getItemStateManager();
         NameResolver nameResolver = new ParsingNameResolver(
@@ -126,7 +139,49 @@
             } else if (configNode.getNodeName().equals("aggregate")) {
                 idxAggregates.add(new AggregateRuleImpl(
                         configNode, nsResolver, ism, hmgr));
+            } else if (configNode.getNodeName().equals("analyzers")) {
+                NodeList childNodes = configNode.getChildNodes();
+                for (int j = 0; j < childNodes.getLength(); j++) {
+                    Node analyzerNode = childNodes.item(j);
+                    if (analyzerNode.getNodeName().equals("analyzer")) {
+                        String analyzerClassName = analyzerNode.getAttributes().getNamedItem("class").getNodeValue();
+                        try {
+                        Class clazz = Class.forName(analyzerClassName);
+                            if(clazz == JackrabbitAnalyzer.class) {
+                                log.warn("Not allowed to configure " + JackrabbitAnalyzer.class.getName() +  " for a property. " +
+                                        "Using default analyzer for that property.");
+                            }
+                            else if(Analyzer.class.isAssignableFrom(clazz)){
+                                Analyzer analyzer = (Analyzer)clazz.newInstance();
+                                NodeList propertyChildNodes = analyzerNode.getChildNodes();
+                                for (int k = 0; k < propertyChildNodes.getLength(); k++) {
+                                    Node propertyNode = propertyChildNodes.item(k);
+                                    if (propertyNode.getNodeName().equals("property")) {
+                                        // get property name
+                                        QName propName = NameFormat.parse(getTextContent(propertyNode), nsResolver);
+                                        String fieldName = NameFormat.format(propName, nsMappings);
+                                        // set analyzer for the fulltext property fieldname
+                                        int idx = fieldName.indexOf(':');
+                                        fieldName = fieldName.substring(0, idx + 1)
+                                                    + FieldNames.FULLTEXT_PREFIX + fieldName.substring(idx + 1);;
+                                        Object prevAnalyzer = analyzers.put(fieldName, analyzer);
+                                        if(prevAnalyzer!=null){
+                                            log.warn("Property " + propName.getLocalName() + " has been configured for multiple analyzers. " +
+                                                    " Last configured analyzer is used");
+                                        }
+                                    }
+                                }
+                            } else {
+                                log.warn("org.apache.lucene.analysis.Analyzer is not a superclass of " 
+                                        + analyzerClassName +". Ignoring this configure analyzer" );
+                            }
+                        } catch (ClassNotFoundException e) {
+                            log.warn("Analyzer class not found: " + analyzerClassName, e);
+                        }
+                    }
+                }
             }
+            
         }
         aggregateRules = (AggregateRule[]) idxAggregates.toArray(
                 new AggregateRule[idxAggregates.size()]);
@@ -211,6 +266,25 @@
         return true;
     }
 
+    
+    /**
+     * Returns the analyzer configured for the property with this fieldName
+     * (the string representation, i.e. the JCR-style name, of the property's
+     * <code>QName</code> prefixed with <code>FieldNames.FULLTEXT_PREFIX</code>),
+     * or <code>null</code> if none is configured or the configured analyzer
+     * cannot be found. If <code>null</code> is returned, the default Analyzer
+     * is used.
+     *
+     * @param fieldName the string representation (JCR-style name) of the property's
+     * <code>QName</code>, prefixed with <code>FieldNames.FULLTEXT_PREFIX</code>
+     * @return the <code>analyzer</code> to use for indexing this property
+     */
+    public Analyzer getPropertyAnalyzer(String fieldName) {
+        if(analyzers.containsKey(fieldName)){
+            return (Analyzer)analyzers.get(fieldName);
+        }
+        return null;
+    }
     //---------------------------------< internal >-----------------------------
 
     /**
@@ -732,4 +806,4 @@
             return false;
         }
     }
-}
\ No newline at end of file
+}

Added: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/JackrabbitAnalyzer.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/JackrabbitAnalyzer.java?rev=573526&view=auto
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/JackrabbitAnalyzer.java (added)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/JackrabbitAnalyzer.java Fri Sep  7 03:09:24 2007
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.core.query.lucene;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+
+import java.io.Reader;
+
+/**
+ * This is the global Jackrabbit Lucene analyzer. By default, all
+ * properties are indexed with the <code>StandardAnalyzer(new String[]{})</code>,
+ * unless a global analyzer is defined in the &lt;SearchIndex&gt; configuration.
+ *
+ * In the indexing configuration, properties can be configured to be
+ * indexed with a specific analyzer. If configured, this analyzer is used to
+ * index the text of the property and to parse search text for this property.
+ */
+
+public class JackrabbitAnalyzer  extends Analyzer {
+    
+    /**
+     * The default Jackrabbit analyzer, used if none is configured in the
+     * <code>&lt;SearchIndex&gt;</code> configuration.
+     */
+    private Analyzer defaultAnalyzer =  new StandardAnalyzer(new String[]{});
+    
+    /**
+     * The indexing configuration; may be <code>null</code>.
+     */
+    private IndexingConfiguration indexingConfig; 
+  
+    /**
+     * @param indexingConfig the indexing configuration.
+     */
+    protected void setIndexingConfig(IndexingConfiguration indexingConfig) {
+        this.indexingConfig = indexingConfig;
+    }
+
+    /**
+     * @param analyzer the default Jackrabbit analyzer.
+     */
+    protected void setDefaultAnalyzer(Analyzer analyzer){
+        defaultAnalyzer = analyzer;
+    }
+    
+    /** 
+     * Creates a TokenStream which tokenizes all the text in the provided
+     * Reader. If the indexing configuration returns an analyzer for the fieldName
+     * (property), that analyzer is used; otherwise the default analyzer is used.
+     */
+    public TokenStream tokenStream(String fieldName, Reader reader) {
+        if(indexingConfig!=null){
+            Analyzer propertyAnalyzer = indexingConfig.getPropertyAnalyzer(fieldName); 
+            if(propertyAnalyzer!=null){
+                return propertyAnalyzer.tokenStream(fieldName, reader);
+            }
+        }
+        return defaultAnalyzer.tokenStream(fieldName, reader);
+    }
+
+}

Propchange: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/JackrabbitAnalyzer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/SearchIndex.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/SearchIndex.java?rev=573526&r1=573525&r2=573526&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/SearchIndex.java (original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/SearchIndex.java Fri Sep  7 03:09:24 2007
@@ -39,7 +39,6 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.MultiReader;
 import org.apache.lucene.index.Term;
@@ -152,7 +151,7 @@
     /**
      * The analyzer we use for indexing.
      */
-    private Analyzer analyzer;
+    private JackrabbitAnalyzer analyzer;
 
     /**
      * List of text extractor and text filter class names. The configured
@@ -330,7 +329,7 @@
      * Default constructor.
      */
     public SearchIndex() {
-        this.analyzer = new StandardAnalyzer(new String[]{});
+        this.analyzer = new JackrabbitAnalyzer();
     }
 
     /**
@@ -352,7 +351,6 @@
         }
 
         extractor = createTextExtractor();
-        indexingConfig = createIndexingConfiguration();
         synProvider = createSynonymProvider();
 
         File indexDir = new File(path);
@@ -375,7 +373,10 @@
                         context.getNamespaceRegistry());
             }
         }
-
+        
+        indexingConfig = createIndexingConfiguration(nsMappings);
+        analyzer.setIndexingConfig(indexingConfig);
+        
         index = new MultiIndex(indexDir, this, excludedIDs, nsMappings);
         if (index.numDocs() == 0) {
             index.createInitialIndex(
@@ -793,10 +794,11 @@
     }
 
     /**
+     * @param namespaceMappings The namespace mappings 
      * @return the fulltext indexing configuration or <code>null</code> if there
      *         is no configuration.
      */
-    protected IndexingConfiguration createIndexingConfiguration() {
+    protected IndexingConfiguration createIndexingConfiguration(NamespaceMappings namespaceMappings) {
         Element docElement = getIndexingConfigurationDOM();
         if (docElement == null) {
             return null;
@@ -804,7 +806,7 @@
         try {
             IndexingConfiguration idxCfg = (IndexingConfiguration)
                     indexingConfigurationClass.newInstance();
-            idxCfg.init(docElement, getContext());
+            idxCfg.init(docElement, getContext(), namespaceMappings);
             return idxCfg;
         } catch (Exception e) {
             log.warn("Exception initializing indexing configuration from: " +
@@ -1121,7 +1123,7 @@
     public void setAnalyzer(String analyzerClassName) {
         try {
             Class analyzerClass = Class.forName(analyzerClassName);
-            analyzer = (Analyzer) analyzerClass.newInstance();
+            analyzer.setDefaultAnalyzer((Analyzer) analyzerClass.newInstance());
         } catch (Exception e) {
             log.warn("Invalid Analyzer class: " + analyzerClassName, e);
         }

Modified: jackrabbit/trunk/jackrabbit-core/src/main/resources/org/apache/jackrabbit/core/query/lucene/indexing-configuration-1.0.dtd
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/resources/org/apache/jackrabbit/core/query/lucene/indexing-configuration-1.0.dtd?rev=573526&r1=573525&r2=573526&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/resources/org/apache/jackrabbit/core/query/lucene/indexing-configuration-1.0.dtd (original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/resources/org/apache/jackrabbit/core/query/lucene/indexing-configuration-1.0.dtd Fri Sep  7 03:09:24 2007
@@ -63,4 +63,15 @@
 -->
 <!ELEMENT property (#PCDATA)>
 <!ATTLIST property boost CDATA "1.0"
-                   nodeScopeIndex CDATA "true">
\ No newline at end of file
+                   nodeScopeIndex CDATA "true">
+
+<!--
+    An analyzer element with property elements in it defines which analyzer is to 
+    be used for indexing and parsing the full text of this property. If the analyzer
+    class can not be found, the default analyzer is used. The node scope is always 
+    indexed with the default analyzer, so it might return different results for search 
+    queries in some rare cases. 
+-->
+<!ELEMENT analyzers (analyzer*)>   
+<!ELEMENT analyzer (property*)>
+<!ATTLIST analyzer class CDATA #REQUIRED>