You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lenya.apache.org by mi...@apache.org on 2003/11/13 23:55:18 UTC

cvs commit: cocoon-lenya/src/java/org/apache/lenya/lucene/index AbstractIndexer.java ConfigurableDocumentCreator.java ConfigurableIndexer.java DefaultIndexer.java Index.java Indexer.java configuration2xslt.xsl

michi       2003/11/13 14:55:18

  Modified:    src/java/org/apache/lenya/lucene/index AbstractIndexer.java
                        ConfigurableDocumentCreator.java
                        ConfigurableIndexer.java DefaultIndexer.java
                        Index.java Indexer.java configuration2xslt.xsl
  Log:
  configurable document creator fixed
  
  Revision  Changes    Path
  1.8       +28 -13    cocoon-lenya/src/java/org/apache/lenya/lucene/index/AbstractIndexer.java
  
  Index: AbstractIndexer.java
  ===================================================================
  RCS file: /home/cvs/cocoon-lenya/src/java/org/apache/lenya/lucene/index/AbstractIndexer.java,v
  retrieving revision 1.7
  retrieving revision 1.8
  diff -u -r1.7 -r1.8
  --- AbstractIndexer.java	23 Jul 2003 13:21:27 -0000	1.7
  +++ AbstractIndexer.java	13 Nov 2003 22:55:17 -0000	1.8
  @@ -1,5 +1,4 @@
   /*
  -$Id$
   <License>
   
    ============================================================================
  @@ -77,7 +76,9 @@
    * The factory method {@link #getDocumentCreator(String[])} is used to create a
    * DocumentCreator from the command-line arguments.
    *
  - * @author  hrt
  + * @author Andreas Hartmann
  + * @author Michael Wechner
  + * @version $Id$
    */
   public abstract class AbstractIndexer implements Indexer {
       private CommandLineLogger logger = new CommandLineLogger(getClass());
  @@ -101,8 +102,8 @@
       /**
        * Initializes this indexer with command-line parameters.
        */
  -    public void configure(Element element) throws Exception {
  -        documentCreator = createDocumentCreator(element);
  +    public void configure(Element indexer, String configFileName) throws Exception {
  +        documentCreator = createDocumentCreator(indexer, configFileName);
       }
   
       /**
  @@ -114,8 +115,7 @@
        *
        * @throws Exception DOCUMENT ME!
        */
  -    public abstract DocumentCreator createDocumentCreator(Element element)
  -        throws Exception;
  +    public abstract DocumentCreator createDocumentCreator(Element indexer, String configFileName) throws Exception;
   
       /**
        * Updates the index incrementally.
  @@ -191,21 +191,36 @@
        * Returns the filter used to receive the indexable files.
        */
       public FileFilter getFilter() {
  -        return new AbstractIndexer.DefaultIndexFilter();
  +        String[] indexableExtensions = { "html", "htm", "txt" };
  +        return new AbstractIndexer.DefaultIndexFilter(indexableExtensions);
       }
   
       /**
        * FileFilter used to obtain the files to index.
        */
       public class DefaultIndexFilter implements FileFilter {
  -        protected final String[] indexableExtensions = { "html", "htm", "txt" };
  +        protected String[] indexableExtensions;
  +
  +        /**
  +         * Default indexable extensions: html, htm, txt
  +         */
  +        public DefaultIndexFilter() {
  +            String[] iE = { "html", "htm", "txt" };
  +            indexableExtensions = iE;
  +        }
  +
  +        /**
  +         *
  +         */
  +        public DefaultIndexFilter(String[] indexableExtensions) {
  +            this.indexableExtensions = indexableExtensions;
  +        }
   
           /** Tests whether or not the specified abstract pathname should be
            * included in a pathname list.
            *
            * @param  pathname  The abstract pathname to be tested
  -         * @return  <code>true</code> if and only if <code>pathname</code>
  -         *          should be included
  +         * @return  <code>true</code> if and only if <code>pathname</code> should be included
            *
            */
           public boolean accept(File file) {
  
  
  
  1.7       +36 -16    cocoon-lenya/src/java/org/apache/lenya/lucene/index/ConfigurableDocumentCreator.java
  
  Index: ConfigurableDocumentCreator.java
  ===================================================================
  RCS file: /home/cvs/cocoon-lenya/src/java/org/apache/lenya/lucene/index/ConfigurableDocumentCreator.java,v
  retrieving revision 1.6
  retrieving revision 1.7
  diff -u -r1.6 -r1.7
  --- ConfigurableDocumentCreator.java	23 Jul 2003 13:21:27 -0000	1.6
  +++ ConfigurableDocumentCreator.java	13 Nov 2003 22:55:17 -0000	1.7
  @@ -1,5 +1,4 @@
   /*
  -$Id$
   <License>
   
    ============================================================================
  @@ -64,6 +63,8 @@
   import org.apache.lucene.document.Document;
   import org.apache.lucene.document.Field;
   
  +import org.apache.log4j.Category;
  +
   import org.w3c.dom.Element;
   import org.w3c.dom.Node;
   import org.w3c.dom.NodeList;
  @@ -81,6 +82,7 @@
   import java.lang.reflect.Method;
   
   import javax.xml.parsers.DocumentBuilder;
  +import javax.xml.parsers.DocumentBuilderFactory;
   import javax.xml.transform.OutputKeys;
   import javax.xml.transform.Transformer;
   import javax.xml.transform.TransformerFactory;
  @@ -90,10 +92,13 @@
   
   
   /**
  - *
  - * @author  hrt
  + * @author Andreas Hartmann
  + * @author Michael Wechner
  + * @version $Id$
    */
   public class ConfigurableDocumentCreator extends AbstractDocumentCreator {
  +    Category log = Category.getInstance(ConfigurableDocumentCreator.class);
  +  
       public static final String LUCENE_NAMESPACE = "http://www.wyona.org/2003/lucene";
       public static final String XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml";
   
  @@ -118,7 +123,7 @@
       }
   
       /**
  -     * DOCUMENT ME!
  +     * Transform source document into lucene document and generate a Lucene Document instance
        *
        * @param file DOCUMENT ME!
        * @param htdocsDumpDir DOCUMENT ME!
  @@ -127,25 +132,39 @@
        *
        * @throws Exception DOCUMENT ME!
        */
  -    public Document getDocument(File file, File htdocsDumpDir)
  -        throws Exception {
  -        // System.out.println(getClass().getName() + ": indexing " + file.getAbsolutePath());
  +    public Document getDocument(File file, File htdocsDumpDir) throws Exception {
  +        log.debug(".getDocument() : indexing " + file.getAbsolutePath());
           try {
  -            // transform source document into lucene document
  +
  +            org.w3c.dom.Document sourceDocument = null;
  +            DocumentBuilderFactory parserFactory = DocumentBuilderFactory.newInstance();
  +            parserFactory.setValidating(false);
  +            parserFactory.setNamespaceAware(true);
  +            parserFactory.setIgnoringElementContentWhitespace(true);
  +            DocumentBuilder mybuilder = parserFactory.newDocumentBuilder();
  +            sourceDocument = mybuilder.parse(file.getAbsolutePath());
  +
  +
  +// FIXME: What is this good for: <?xml version="1.0"?><body>...</body>
  +/*
               NamespaceHelper documentHelper = new NamespaceHelper(XHTML_NAMESPACE, "xhtml", "html");
               org.w3c.dom.Document sourceDocument = documentHelper.getDocument();
  +
               Element rootNode = sourceDocument.getDocumentElement();
   
               String bodyText = getBodyText(file);
               Element bodyElement = documentHelper.createElement("body", bodyText);
               rootNode.appendChild(bodyElement);
  +*/
  +
  +
  +
   
               DOMSource documentSource = new DOMSource(sourceDocument);
               Writer documentWriter = new StringWriter();
   
               TransformerFactory tFactory = TransformerFactory.newInstance();
  -            Transformer documentTransformer = tFactory.newTransformer(new StreamSource(
  -                        new StringReader(getStylesheet())));
  +            Transformer documentTransformer = tFactory.newTransformer(new StreamSource(new StringReader(getStylesheet())));
               documentTransformer.setOutputProperty(OutputKeys.INDENT, "yes");
               documentTransformer.setOutputProperty(OutputKeys.ENCODING, "ISO-8859-1");
   
  @@ -158,11 +177,11 @@
               documentTransformer.setParameter("filename", fileName);
               documentTransformer.transform(documentSource, new StreamResult(documentWriter));
   
  -            dumpLuceneDocument(file, documentWriter);
  +            // DEBUG: debug lucene documents
  +            //dumpLuceneDocument(file, documentWriter);
   
               DocumentBuilder builder = DocumentHelper.createBuilder();
  -            org.w3c.dom.Document luceneDocument = builder.parse(new InputSource(
  -                        new StringReader(documentWriter.toString())));
  +            org.w3c.dom.Document luceneDocument = builder.parse(new InputSource(new StringReader(documentWriter.toString())));
   
               NamespaceHelper helper = new NamespaceHelper(LUCENE_NAMESPACE, "luc", luceneDocument);
               Element root = luceneDocument.getDocumentElement();
  @@ -196,9 +215,10 @@
       /**
        * Writes the lucene XML document to a file.
        */
  -    protected static void dumpLuceneDocument(File file, Writer writer)
  -        throws IOException {
  -        File luceneDocumentFile = new File(file.getAbsolutePath() + ".xml");
  +    protected void dumpLuceneDocument(File file, Writer writer) throws IOException {
  +        log.debug(".dumpLuceneDocument(): Dump document: " + file.getAbsolutePath());
  +
  +        File luceneDocumentFile = new File(file.getAbsolutePath() + ".xluc");
           luceneDocumentFile.createNewFile();
   
           FileWriter fileWriter = new FileWriter(luceneDocumentFile);
  
  
  
  1.9       +36 -76    cocoon-lenya/src/java/org/apache/lenya/lucene/index/ConfigurableIndexer.java
  
  Index: ConfigurableIndexer.java
  ===================================================================
  RCS file: /home/cvs/cocoon-lenya/src/java/org/apache/lenya/lucene/index/ConfigurableIndexer.java,v
  retrieving revision 1.8
  retrieving revision 1.9
  diff -u -r1.8 -r1.9
  --- ConfigurableIndexer.java	23 Jul 2003 13:21:27 -0000	1.8
  +++ ConfigurableIndexer.java	13 Nov 2003 22:55:17 -0000	1.9
  @@ -1,5 +1,4 @@
   /*
  -$Id$
   <License>
   
    ============================================================================
  @@ -57,62 +56,12 @@
   
   import org.apache.lenya.xml.DocumentHelper;
   
  +import org.apache.log4j.Category;
  +
   import org.w3c.dom.Element;
   
  -/* ====================================================================
  - * The Apache Software License, Version 1.1
  - *
  - * Copyright (c) 2001 The Apache Software Foundation.  All rights
  - * reserved.
  - *
  - * Redistribution and use in source and binary forms, with or without
  - * modification, are permitted provided that the following conditions
  - * are met:
  - *
  - * 1. Redistributions of source code must retain the above copyright
  - *    notice, this list of conditions and the following disclaimer.
  - *
  - * 2. Redistributions in binary form must reproduce the above copyright
  - *    notice, this list of conditions and the following disclaimer in
  - *    the documentation and/or other materials provided with the
  - *    distribution.
  - *
  - * 3. The end-user documentation included with the redistribution,
  - *    if any, must include the following acknowledgment:
  - *       "This product includes software developed by the
  - *        Apache Software Foundation (http://www.apache.org/)."
  - *    Alternately, this acknowledgment may appear in the software itself,
  - *    if and wherever such third-party acknowledgments normally appear.
  - *
  - * 4. The names "Apache" and "Apache Software Foundation" and
  - *    "Apache Lucene" must not be used to endorse or promote products
  - *    derived from this software without prior written permission. For
  - *    written permission, please contact apache@apache.org.
  - *
  - * 5. Products derived from this software may not be called "Apache",
  - *    "Apache Lucene", nor may "Apache" appear in their name, without
  - *    prior written permission of the Apache Software Foundation.
  - *
  - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  - * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  - * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  - * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  - * SUCH DAMAGE.
  - * ====================================================================
  - *
  - * This software consists of voluntary contributions made by many
  - * individuals on behalf of the Apache Software Foundation.  For more
  - * information on the Apache Software Foundation, please see
  - * <http://www.apache.org/>.
  - */
   import java.io.File;
  +import java.io.FileFilter;
   import java.io.StringWriter;
   import java.io.Writer;
   
  @@ -127,32 +76,33 @@
   
   
   /**
  - * DOCUMENT ME!
  - *
  - * @author $author$
  - * @version $Revision$
  + * @author Andreas Hartmann
  + * @author Michael Wechner
  + * @version $Id$
    */
   public class ConfigurableIndexer extends AbstractIndexer {
  +    Category log = Category.getInstance(ConfigurableIndexer.class);
  +
       /**
  -     * DOCUMENT ME!
  +     * Instantiate a Document Creator for creating Lucene Documents
        *
  -     * @param element DOCUMENT ME!
  +     * @param element <code>indexer</code> node
        *
  -     * @return DOCUMENT ME!
  +     * @return DocumentCreator
        *
        * @throws Exception DOCUMENT ME!
        */
  -    public DocumentCreator createDocumentCreator(Element element)
  -        throws Exception {
  -        // FIXME: ANT has a problem with Avalon, that's why we replaced Configuration by Element
  -
  -        /*
  -                String configurationFileName = configuration.getChild("configuration").getAttribute("src");
  -                File configurationFile = new File(configurationFileName);
  -                String stylesheet = getStylesheet(configurationFile);
  -                return new ConfigurableDocumentCreator(stylesheet);
  -        */
  -        return null;
  +    public DocumentCreator createDocumentCreator(Element indexer, String configFileName) throws Exception {
  +        log.error(".createDocumentCreatort(): Element name: " + indexer.getNodeName());
  +
  +        String luceneDocConfigFileName = "cmfs-luceneDoc.xconf"; // indexer/configuration/@src
  +
  +        String configurationFileName = new File(configFileName).getParent() + File.separator + luceneDocConfigFileName; // configuration.getChild("configuration").getAttribute("src");
  +        File configurationFile = new File(configurationFileName);
  +        String stylesheet = getStylesheet(configurationFile);
  +        return new ConfigurableDocumentCreator(stylesheet);
  +
  +        //return null;
       }
   
       public static final String CONFIGURATION_CREATOR_STYLESHEET = "org/apache/lenya/lucene/index/configuration2xslt.xsl";
  @@ -160,20 +110,30 @@
       /**
        * Converts the configuration file to an XSLT stylesheet and returns a reader that reads this stylesheet.
        */
  -    protected static String getStylesheet(File configurationFile)
  -        throws Exception {
  +    protected String getStylesheet(File configurationFile) throws Exception {
  +        log.error(".getStylesheet(): Configuration file: " + configurationFile.getAbsolutePath());
  +
           URL configurationCreatorURL = ConfigurableIndexer.class.getClassLoader().getResource(CONFIGURATION_CREATOR_STYLESHEET);
           File configurationStylesheetFile = new File(new URI(configurationCreatorURL.toString()));
           org.w3c.dom.Document configurationDocument = DocumentHelper.readDocument(configurationFile);
   
           TransformerFactory tFactory = TransformerFactory.newInstance();
  -        Transformer configurationTransformer = tFactory.newTransformer(new StreamSource(
  -                    configurationStylesheetFile));
  +        Transformer configurationTransformer = tFactory.newTransformer(new StreamSource(configurationStylesheetFile));
   
           DOMSource source = new DOMSource(configurationDocument);
           Writer stylesheetWriter = new StringWriter();
           configurationTransformer.transform(source, new StreamResult(stylesheetWriter));
   
  +        log.debug(".getStylesheet(): Meta Stylesheet: " + stylesheetWriter.toString());
  +
           return stylesheetWriter.toString();
  +    }
  +
  +    /**
  +     * Returns the filter used to receive the indexable files.
  +     */
  +    public FileFilter getFilter() {
  +        String[] indexableExtensions = { "xml" };
  +        return new AbstractIndexer.DefaultIndexFilter(indexableExtensions);
       }
   }
  
  
  
  1.6       +2 -3      cocoon-lenya/src/java/org/apache/lenya/lucene/index/DefaultIndexer.java
  
  Index: DefaultIndexer.java
  ===================================================================
  RCS file: /home/cvs/cocoon-lenya/src/java/org/apache/lenya/lucene/index/DefaultIndexer.java,v
  retrieving revision 1.5
  retrieving revision 1.6
  diff -u -r1.5 -r1.6
  --- DefaultIndexer.java	23 Jul 2003 13:21:27 -0000	1.5
  +++ DefaultIndexer.java	13 Nov 2003 22:55:17 -0000	1.6
  @@ -76,8 +76,7 @@
        *
        * @throws Exception DOCUMENT ME!
        */
  -    public DocumentCreator createDocumentCreator(Element element)
  -        throws Exception {
  +    public DocumentCreator createDocumentCreator(Element indexer, String configFileName) throws Exception {
           return new DefaultDocumentCreator();
       }
   }
  
  
  
  1.9       +4 -4      cocoon-lenya/src/java/org/apache/lenya/lucene/index/Index.java
  
  Index: Index.java
  ===================================================================
  RCS file: /home/cvs/cocoon-lenya/src/java/org/apache/lenya/lucene/index/Index.java,v
  retrieving revision 1.8
  retrieving revision 1.9
  diff -u -r1.8 -r1.9
  --- Index.java	23 Jul 2003 13:21:27 -0000	1.8
  +++ Index.java	13 Nov 2003 22:55:17 -0000	1.9
  @@ -110,8 +110,8 @@
   
               if (debug.equalsIgnoreCase("false") || debug.equalsIgnoreCase("no")) {
                   DebugConfiguration.setDebug(false);
  -            } else if (debug.equalsIgnoreCase("false") || debug.equalsIgnoreCase("no")) {
  -                DebugConfiguration.setDebug(false);
  +            } else if (debug.equalsIgnoreCase("true") || debug.equalsIgnoreCase("yes")) {
  +                DebugConfiguration.setDebug(true);
               } else {
                   System.err.println("ERROR: <debug> must be one of 'yes', 'true', 'no', or 'false'");
               }
  @@ -135,7 +135,7 @@
   
               DOMUtil du = new DOMUtil();
               Document config = new DOMParserFactory().getDocument(argv[0]);
  -            indexer.configure(du.getElement(config.getDocumentElement(), new XPath("indexer")));
  +            indexer.configure(du.getElement(config.getDocumentElement(), new XPath("indexer")), argv[0]);
   
               if (create) {
                   indexer.createIndex(root, index);
  
  
  
  1.7       +4 -4      cocoon-lenya/src/java/org/apache/lenya/lucene/index/Indexer.java
  
  Index: Indexer.java
  ===================================================================
  RCS file: /home/cvs/cocoon-lenya/src/java/org/apache/lenya/lucene/index/Indexer.java,v
  retrieving revision 1.6
  retrieving revision 1.7
  diff -u -r1.6 -r1.7
  --- Indexer.java	23 Jul 2003 13:21:27 -0000	1.6
  +++ Indexer.java	13 Nov 2003 22:55:17 -0000	1.7
  @@ -1,5 +1,4 @@
   /*
  -$Id$
   <License>
   
    ============================================================================
  @@ -61,14 +60,15 @@
   
   
   /**
  - *
  - * @author  hrt
  + * @author Andreas Hartmann
  + * @author Michael Wechner
  + * @version $Id$
    */
   public interface Indexer {
       /**
        * Configures this indexer.
        */
  -    void configure(Element element) throws Exception;
  +    void configure(Element indexer, String configFileName) throws Exception;
   
       /**
        * Indexes the contents of a directory.
  
  
  
  1.2       +4 -4      cocoon-lenya/src/java/org/apache/lenya/lucene/index/configuration2xslt.xsl
  
  Index: configuration2xslt.xsl
  ===================================================================
  RCS file: /home/cvs/cocoon-lenya/src/java/org/apache/lenya/lucene/index/configuration2xslt.xsl,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- configuration2xslt.xsl	22 Mar 2003 21:52:26 -0000	1.1
  +++ configuration2xslt.xsl	13 Nov 2003 22:55:17 -0000	1.2
  @@ -2,10 +2,10 @@
   
   <!--
       Document   : configuration2xslt.xsl
  -    Created on : 17. März 2003, 15:03
  -    Author     : hrt
  -    Description:
  -        Purpose of transformation follows.
  +    Created on : March 17, 2003, 15:03
  +    Author     : Andreas Hartmann
  +    Author     : Michael Wechner
  +    Description: Generates meta stylesheet
   -->
   
   <xsl:stylesheet version="1.0"
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: lenya-cvs-unsubscribe@cocoon.apache.org
For additional commands, e-mail: lenya-cvs-help@cocoon.apache.org