You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@lucene.apache.org by ot...@apache.org on 2002/06/21 17:02:51 UTC

cvs commit: jakarta-lucene-sandbox/contributions/XML-Indexing-Demo/src/java/org/apache/lucenesandbox/xmlindexingdemo IndexFiles.java SearchFiles.java XMLDocumentHandlerDOM.java XMLDocumentHandlerSAX.java

otis        2002/06/21 08:02:51

  Added:       contributions/XML-Indexing-Demo IndexingRequest.xml
               contributions/XML-Indexing-Demo/src/java/org/apache/lucenesandbox/xmlindexingdemo
                        IndexFiles.java SearchFiles.java
                        XMLDocumentHandlerDOM.java
                        XMLDocumentHandlerSAX.java
  Log:
  - Unpacked the .zip file, for easier access to the source code.
  
  Revision  Changes    Path
  1.1                  jakarta-lucene-sandbox/contributions/XML-Indexing-Demo/IndexingRequest.xml
  
  Index: IndexingRequest.xml
  ===================================================================
  <customerInfo>
      <name><![CDATA[Aruna A. Raghavan]]></name>
      <profession><![CDATA[Software Developer]]></profession>
      <addressLine1><![CDATA[6801 West 106th Street]]></addressLine1>
      <addressLine2><![CDATA[#205]]></addressLine2>
      <city><![CDATA[Eagan]]></city>
      <state><![CDATA[MN]]></state>
      <zip><![CDATA[55121]]></zip>
      <country><![CDATA[USA]]></country>
  </customerInfo>
  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/XML-Indexing-Demo/src/java/org/apache/lucenesandbox/xmlindexingdemo/IndexFiles.java
  
  Index: IndexFiles.java
  ===================================================================
  package org.apache.lucenesandbox.xmlindexingdemo;
  
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  import org.apache.lucene.analysis.standard.StandardAnalyzer;
  import org.apache.lucene.index.IndexWriter;
  
  import java.io.File;
  import java.util.Date;
  
  /**
   * Command-line demo that builds a Lucene index at the path "index" from a
   * single XML file, or from every file found beneath a directory.
   *
   * Each file is parsed with {@link XMLDocumentHandlerSAX}; the resulting
   * Lucene document is added to the index.
   */
  class IndexFiles
  {
      /**
       * Indexes the file or directory named by the first argument and prints
       * the elapsed time in milliseconds.
       *
       * @param args args[0] is the XML file or directory to index
       * @throws Exception any failure, after a short description of it has been
       *                   printed to standard output
       */
      public static void main(String[] args)
          throws Exception
      {
          try
          {
              if (args.length == 0)
              {
                  // Report misuse explicitly instead of failing with an
                  // ArrayIndexOutOfBoundsException on args[0].
                  throw new IllegalArgumentException(
                      "usage: IndexFiles <xml-file-or-directory>");
              }

              long start = System.currentTimeMillis();

              IndexWriter writer = new IndexWriter("index", new StandardAnalyzer(), true);
              try
              {
                  indexDocs(writer, new File(args[0]));
                  writer.optimize();
              }
              finally
              {
                  // Close the writer even when indexing fails so that it does
                  // not stay open for the rest of the process lifetime.
                  writer.close();
              }

              long end = System.currentTimeMillis();

              System.out.print(end - start);
              System.out.println(" total milliseconds");
          }
          catch (Exception e)
          {
              System.out.println(" caught a " + e.getClass() +
                  "\n with message: " + e.getMessage());
              throw e;
          }
      }

      /**
       * Adds <code>file</code> to the index. Directories are walked
       * recursively; every other file is parsed as XML and added as one
       * document.
       *
       * @param writer the index writer that receives the documents
       * @param file   a file, or a directory whose contents are indexed
       * @throws Exception if a directory cannot be listed, or if parsing or
       *                   indexing a file fails
       */
      public static void indexDocs(IndexWriter writer, File file)
          throws Exception
      {
          if (file.isDirectory())
          {
              String[] files = file.list();
              if (files == null)
              {
                  // File.list() returns null when the directory cannot be read.
                  throw new java.io.IOException("cannot list directory " + file);
              }
              for (int i = 0; i < files.length; i++)
              {
                  indexDocs(writer, new File(file, files[i]));
              }
          }
          else
          {
              System.out.println("adding " + file);
              XMLDocumentHandlerSAX hdlr = new XMLDocumentHandlerSAX(file);
              writer.addDocument(hdlr.getDocument());
              // For DOM, use
              // XMLDocumentHandlerDOM hdlr = new XMLDocumentHandlerDOM();
              // writer.addDocument(hdlr.createXMLDocument(file));
          }
      }
  }
  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/XML-Indexing-Demo/src/java/org/apache/lucenesandbox/xmlindexingdemo/SearchFiles.java
  
  Index: SearchFiles.java
  ===================================================================
  package org.apache.lucenesandbox.xmlindexingdemo;
  
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  import java.io.IOException;
  import java.io.BufferedReader;
  import java.io.InputStreamReader;
  
  import org.apache.lucene.analysis.Analyzer;
  import org.apache.lucene.analysis.standard.StandardAnalyzer;
  import org.apache.lucene.document.Document;
  import org.apache.lucene.search.Searcher;
  import org.apache.lucene.search.IndexSearcher;
  import org.apache.lucene.search.Query;
  import org.apache.lucene.search.Hits;
  import org.apache.lucene.queryParser.QueryParser;
  
  /**
   * Interactive command-line demo that reads queries from standard input,
   * runs them against the index at the path "index" (default field "name")
   * and prints the customer fields of each hit, ten hits per page.
   *
   * The loop ends at end of input or when a blank query line is entered.
   */
  class SearchFiles {
    /** Number of hits printed before the user is asked whether to continue. */
    private static final int HITS_PER_PAGE = 10;

    /**
     * Runs the query loop. Any exception ends the loop and is reported on
     * standard output.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
      try {
        Searcher searcher = new IndexSearcher("index");
        try {
          Analyzer analyzer = new StandardAnalyzer();

          BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
          while (true) {
            System.out.print("Query: ");
            String line = in.readLine();

            // readLine() returns null at end of input; the previous test
            // (length() == -1) could never be true, so the loop had no
            // regular way to terminate.
            if (line == null || line.trim().length() == 0) {
              break;
            }

            Query query = QueryParser.parse(line, "name", analyzer);
            System.out.println("Searching for: " + query.toString("name"));

            Hits hits = searcher.search(query);
            System.out.println(hits.length() + " total matching documents");

            for (int start = 0; start < hits.length(); start += HITS_PER_PAGE) {
              int end = Math.min(hits.length(), start + HITS_PER_PAGE);
              for (int i = start; i < end; i++) {
                Document doc = hits.doc(i);
                System.out.println(doc.get("name"));
                System.out.println(doc.get("profession"));
                System.out.println(doc.get("addressLine1"));
                System.out.println(doc.get("addressLine2"));
                System.out.print(doc.get("city"));
                System.out.print(" ");
                System.out.print(doc.get("state"));
                System.out.print(" ");
                System.out.print(doc.get("zip"));
                // Separate zip and country like the other fields on this line.
                System.out.print(" ");
                System.out.println(doc.get("country"));
              }

              if (hits.length() > end) {
                System.out.print("more (y/n) ? ");
                line = in.readLine();
                // Stop paging on end of input, an empty answer or "n";
                // this only leaves the paging loop, not the query loop.
                if (line == null || line.length() == 0 || line.charAt(0) == 'n') {
                  break;
                }
              }
            }
          }
        } finally {
          // Close the searcher on every exit path, including failures.
          searcher.close();
        }
      } catch (Exception e) {
        System.out.println(" caught a " + e.getClass() +
            "\n with message: " + e.getMessage());
      }
    }
  }
  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/XML-Indexing-Demo/src/java/org/apache/lucenesandbox/xmlindexingdemo/XMLDocumentHandlerDOM.java
  
  Index: XMLDocumentHandlerDOM.java
  ===================================================================
  package org.apache.lucenesandbox.xmlindexingdemo;
  
  import org.w3c.dom.*;
  import org.w3c.dom.Node;
  import javax.xml.parsers.*;
  import org.apache.lucene.document.Field;
  
  import java.io.File;
  
  /**
   *
   */
  /**
   * Builds a Lucene document from an XML file using a DOM parser.
   *
   * Every CDATA section whose parent is an element becomes one text field,
   * named after that parent element and holding the CDATA content. Plain
   * (non-CDATA) character data is not indexed.
   */
  public class XMLDocumentHandlerDOM
  {
      /**
       * Parses <code>f</code> and returns a Lucene document holding one field
       * per CDATA section found in it.
       *
       * @param f the XML file to parse
       * @return the populated document; if parsing fails the error is printed
       *         and the document contains whatever was added before the failure
       */
      public org.apache.lucene.document.Document createXMLDocument(File f)
      {
          org.apache.lucene.document.Document document = new org.apache.lucene.document.Document();
          DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
          try
          {
              DocumentBuilder df = dbf.newDocumentBuilder();
              org.w3c.dom.Document d = df.parse(f);
              Node root = d.getDocumentElement();
              traverseTree(root, document);
          }
          catch (Exception e)
          {
              System.out.println("error: " + e);
              e.printStackTrace();
          }
          return document;
      }

      /**
       * Walks the DOM subtree rooted at <code>node</code> depth-first and adds
       * a field to <code>document</code> for each CDATA section encountered.
       *
       * The field name is the name of the enclosing element rather than a
       * fixed "name", and CDATA sections are handled directly, so they are
       * found whether or not a text node precedes them.
       */
      static private void traverseTree(Node node, org.apache.lucene.document.Document document)
      {
          if (node.getNodeType() == Node.CDATA_SECTION_NODE)
          {
              Node parentNode = node.getParentNode();
              if (parentNode != null && parentNode.getNodeType() == Node.ELEMENT_NODE)
              {
                  document.add(Field.Text(parentNode.getNodeName(), node.getNodeValue()));
              }
              return;
          }

          NodeList children = node.getChildNodes();
          for (int i = 0; i < children.getLength(); i++)
          {
              traverseTree(children.item(i), document);
          }
      }
  }
  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/XML-Indexing-Demo/src/java/org/apache/lucenesandbox/xmlindexingdemo/XMLDocumentHandlerSAX.java
  
  Index: XMLDocumentHandlerSAX.java
  ===================================================================
  package org.apache.lucenesandbox.xmlindexingdemo;
  
  import org.xml.sax.*;
  import org.xml.sax.helpers.*;
  import org.xml.sax.AttributeList;
  import javax.xml.parsers.*;
  
  import org.apache.lucene.document.Document;
  import org.apache.lucene.document.Field;
  
  import java.io.File;
  import java.io.IOException;
  
  /**
   * SAX handler that converts one XML file into a Lucene document.
   *
   * The file is parsed in the constructor. Each time an element ends, a text
   * field named after that element is added, containing the character data
   * collected since the most recent element start or end.
   */
  public class XMLDocumentHandlerSAX
      extends HandlerBase
  {
      /** Character data collected for the element currently being read. */
      private StringBuffer elementBuffer = new StringBuffer();

      /** The document being built; created when parsing starts. */
      private Document mDocument;

      /**
       * Parses <code>xmlFile</code> immediately, using this object as the
       * handler.
       *
       * @param xmlFile the XML file to read
       * @throws ParserConfigurationException if no SAX parser can be created
       * @throws SAXException if the file is not well-formed XML
       * @throws IOException if the file cannot be read
       */
      public XMLDocumentHandlerSAX(File xmlFile)
          throws ParserConfigurationException, SAXException, IOException
      {
          SAXParserFactory spf = SAXParserFactory.newInstance();

          SAXParser parser = spf.newSAXParser();
          parser.parse(xmlFile, this);
      }

      /** Starts a fresh Lucene document at the beginning of the XML document. */
      public void startDocument()
      {
          mDocument = new Document();
      }

      /** Discards any buffered text when a new element starts. */
      public void startElement(String localName, AttributeList atts)
          throws SAXException
      {
          elementBuffer.setLength(0);
      }

      /** Appends character data (including CDATA content) to the buffer. */
      public void characters(char[] text, int start, int length)
      {
          elementBuffer.append(text, start, length);
      }

      /**
       * Adds the buffered text as a field named after the element that just
       * ended, then clears the buffer.
       */
      public void endElement(String localName)
          throws SAXException
      {
          mDocument.add(Field.Text(localName, elementBuffer.toString()));
          // Clear the buffer so this element's text is not repeated in the
          // field of the enclosing element when that element ends.
          elementBuffer.setLength(0);
      }

      /**
       * Returns the document built while parsing.
       *
       * @return the Lucene document holding one field per XML element
       */
      public Document getDocument()
      {
          return mDocument;
      }
  }
  
  
  

--
To unsubscribe, e-mail:   <ma...@jakarta.apache.org>
For additional commands, e-mail: <ma...@jakarta.apache.org>