You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@lucene.apache.org by cm...@apache.org on 2002/06/18 02:44:22 UTC

cvs commit: jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage LuceneStorage.java LogStorage.java SQLServerStorage.java StoragePipeline.java

cmarschner    2002/06/17 17:44:22

  Modified:    contributions/webcrawler-LARM/src/de/lanlab/larm/storage
                        LogStorage.java SQLServerStorage.java
                        StoragePipeline.java
  Added:       contributions/webcrawler-LARM/src/de/lanlab/larm/storage
                        LuceneStorage.java
  Log:
  added experimental version of a LuceneStorage
  
  Revision  Changes    Path
  1.4       +2 -2      jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage/LogStorage.java
  
  Index: LogStorage.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage/LogStorage.java,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- LogStorage.java	1 Jun 2002 18:55:16 -0000	1.3
  +++ LogStorage.java	18 Jun 2002 00:44:22 -0000	1.4
  @@ -201,9 +201,9 @@
       public WebDocument store(WebDocument doc)
       {
           String docInfo = doc.getInfo();
  -        if (logContents && isValid && doc.getDocumentBytes() != null)
  +        if (logContents && isValid && doc.getField("content") != null)
           {
  -            int offset = writeToPageFile(doc.getDocumentBytes());
  +            int offset = writeToPageFile((byte[])doc.getField("content"));
               docInfo = docInfo + "\t" + pageFileCount + "\t" + offset;
           }
           log.logThreadSafe(docInfo);
  
  
  
  1.4       +2 -2      jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage/SQLServerStorage.java
  
  Index: SQLServerStorage.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage/SQLServerStorage.java,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- SQLServerStorage.java	1 Jun 2002 18:55:16 -0000	1.3
  +++ SQLServerStorage.java	18 Jun 2002 00:44:22 -0000	1.4
  @@ -172,7 +172,7 @@
               conn = getConnection();
               Statement delDoc = conn.createStatement();
   
  -            // bisherige Daten löschen, indem die Tabelle neu angelegt wird (geht schneller)
  +            // recreate table (faster than delete from table)
   
               delDoc.executeUpdate("if exists (select * from sysobjects where id = object_id(N'[dbo].[Document]') and OBJECTPROPERTY(id, N'IsUserTable') = 1)drop table [dbo].[Document]");
               delDoc.executeUpdate("CREATE TABLE [dbo].[Document] ([DO_ID] [int] IDENTITY (1, 1) NOT NULL ,	[DA_CrawlPass] [int] NULL ,	[DO_URL] [varchar] (255) NULL ,	[DO_ContentType] [varchar] (50) NULL ,	[DO_Data] [text] NULL ,	[DO_Hashcode] [int] NULL ,	[DO_ContentLength] [int] NULL ,	[DO_ContentEncoding] [varchar] (20) NULL ,	[DO_Data2] [image] NULL, [DO_MimeType] [varchar] (255) NULL) ON [PRIMARY] TEXTIMAGE_ON [PRIMARY]");       // l�schen
  @@ -206,7 +206,7 @@
               addDoc = getStatement();
               addDoc.setString(1, document.getURLString());
               addDoc.setString(2, document.getMimeType());
  -            addDoc.setBytes(3,  document.getDocumentBytes());
  +            addDoc.setBytes(3,  (byte[])document.getField("content"));
               addDoc.execute();
           }
           catch(SQLException e)
  
  
  
  1.2       +1 -0      jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage/StoragePipeline.java
  
  Index: StoragePipeline.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage/StoragePipeline.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- StoragePipeline.java	1 Jun 2002 18:55:16 -0000	1.1
  +++ StoragePipeline.java	18 Jun 2002 00:44:22 -0000	1.2
  @@ -95,6 +95,7 @@
       {
           for (Iterator it = docStorages.iterator(); it.hasNext(); )
           {
  +            System.out.println("opening...");
               ((DocumentStorage) it.next()).open();
           }
           isOpen = true;
  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage/LuceneStorage.java
  
  Index: LuceneStorage.java
  ===================================================================
  package de.lanlab.larm.storage;
  
  import de.lanlab.larm.util.WebDocument;
  
  /**
   * Title: LARM Lanlab Retrieval Machine Description: Copyright: Copyright (c)
   * Company:
   *
   * @author
   * @version   1.0
   */
  import org.apache.lucene.index.*;
  import org.apache.lucene.document.*;
  import org.apache.lucene.analysis.*;
  import java.util.*;
  import java.io.*;
  
  /**
   * Document storage that adds each stored {@link WebDocument} to a Lucene
   * index through an {@link IndexWriter}.
   *
   * <p>Typical use: configure with {@link #setIndexName(String)},
   * {@link #setAnalyzer(Analyzer)} and {@link #setCreate(boolean)}, optionally
   * override per-field flags with {@link #setFieldInfo(String, int)}, call
   * {@link #open()}, then call {@link #store(WebDocument)} for every document.</p>
   *
   * <p>NOTE(review): nothing in this class closes the IndexWriter. Confirm
   * whether the DocumentStorage life cycle offers a hook where the writer
   * should be closed so that the index is written out completely.</p>
   *
   * @author    Administrator
   * @created   14. Juni 2002
   */
  public class LuceneStorage implements DocumentStorage
  {

      /** Flag bit: passed as the "index" argument of the Lucene Field constructor. */
      public final static int INDEX = 1;

      /** Flag bit: passed as the "store" argument of the Lucene Field constructor. */
      public final static int STORE = 2;

      /** Flag bit: passed as the "token" argument of the Lucene Field constructor. */
      public final static int TOKEN = 4;

      /** Per-field flag overrides: field name (String) -> flags (Integer). */
      HashMap fieldInfos = new HashMap();

      /** Writer for the index; stays null until open() has succeeded. */
      IndexWriter writer;

      /** Analyzer handed to the IndexWriter in open(). */
      Analyzer analyzer;

      /** Index name handed to the IndexWriter in open(). */
      String indexName;

      /** "create" argument handed to the IndexWriter in open(). */
      boolean create;


      /**
       * Creates an unconfigured storage. Index name and analyzer have to be
       * set before {@link #open()} is called.
       */
      public LuceneStorage() { }


      /**
       * Sets the analyzer that is passed to the IndexWriter when the storage
       * is opened.
       *
       * @param a  the analyzer to use
       */
      public void setAnalyzer(Analyzer a)
      {
          this.analyzer = a;
      }


      /**
       * Sets the index name that is passed to the IndexWriter when the storage
       * is opened.
       *
       * @param name  the index name
       */
      public void setIndexName(String name)
      {
          this.indexName = name;
      }


      /**
       * Overrides the flags used for a given field name. A value of 0 causes
       * the field to be left out of the Lucene document entirely.
       *
       * @param fieldName  name of the field
       * @param value      bitwise OR of {@link #INDEX}, {@link #STORE} and
       *                   {@link #TOKEN}, or 0 to suppress the field
       */
      public void setFieldInfo(String fieldName, int value)
      {
          fieldInfos.put(fieldName, new Integer(value));
      }


      /**
       * Sets the "create" flag that is passed to the IndexWriter when the
       * storage is opened.
       *
       * @param create  value for the IndexWriter's create argument
       */
      public void setCreate(boolean create)
      {
          this.create = create;
      }


      /**
       * Opens the index by constructing the IndexWriter from the configured
       * index name, analyzer and create flag. An IOException is reported on
       * System.err and not rethrown; in that case no new writer is installed.
       */
      public void open()
      {
          System.out.println("opening Lucene storage with index name " + indexName + ")");
          try
          {
              writer = new IndexWriter(indexName, analyzer, create);
              // Only report success when this very call produced the writer.
              System.out.println("lucene storage opened successfully");
          }
          catch(IOException e)
          {
              System.err.println("IOException occured when opening Lucene Index with index name '" + indexName + "'");
              e.printStackTrace();
          }
      }


      /**
       * Looks up the flags configured for a field.
       *
       * @param fieldName  name of the field
       * @param def        value to return when no override was set
       * @return           the configured flags, or def if there are none
       */
      protected int getFieldInfo(String fieldName, int def)
      {
          Integer info = (Integer) fieldInfos.get(fieldName);
          return (info != null) ? info.intValue() : def;
      }


      /**
       * Adds a field to the Lucene document, using the flags configured for
       * the field name or, if there are none, the given defaults. Nothing is
       * added when the resulting flags are 0 or when the value is null.
       *
       * @param doc                Lucene document that receives the field
       * @param name               field name
       * @param value              field value; null values are skipped
       * @param defaultIndexFlags  flags used when no override is configured
       */
      protected void addField(Document doc, String name, String value, int defaultIndexFlags)
      {
          if (value == null)
          {
              // A Field cannot be built without a value (e.g. a document
              // without a mime type); leave the field out instead of failing.
              return;
          }
          int flags = getFieldInfo(name, defaultIndexFlags);
          if (flags != 0)
          {
              boolean stored = (flags & STORE) != 0;
              boolean indexed = (flags & INDEX) != 0;
              boolean tokenized = (flags & TOKEN) != 0;
              doc.add(new Field(name, value, stored, indexed, tokenized));
          }
      }


      /**
       * Builds a Lucene document from the web document and adds it to the
       * index. The fields "url" and "mimetype" are always attempted; in
       * addition every named field of the web document whose value is a
       * char[] or a String is added. Fields of other types are ignored.
       * An IOException from the writer is reported on System.err and not
       * rethrown. If the index has not been opened successfully, the document
       * is not indexed and an error is printed.
       *
       * @param webDoc  the document to index
       * @return        the same webDoc instance, unchanged
       */
      public WebDocument store(WebDocument webDoc)
      {
          if (writer == null)
          {
              // open() swallows its IOException, so a failed open would
              // otherwise surface here as a NullPointerException per document.
              System.err.println("Lucene index is not open; document was not added to the index");
              return webDoc;
          }

          Document doc = new Document();

          addField(doc, "url", webDoc.getUrl().toExternalForm(), STORE | INDEX);
          addField(doc, "mimetype", webDoc.getMimeType(), STORE | INDEX);
          // todo: other fields, e.g. the normalized URL string

          Set fields = webDoc.getFieldNames();
          for (Iterator it = fields.iterator(); it.hasNext(); )
          {
              String fieldName = (String) it.next();
              Object field = webDoc.getField(fieldName);

              if (field instanceof char[])
              {
                  addField(doc, fieldName, new String((char[]) field), STORE | INDEX);
              }
              else if (field instanceof String)
              {
                  addField(doc, fieldName, (String) field, STORE | INDEX);
              }
              // values of any other type (e.g. byte[]) are not indexed
          }

          try
          {
              writer.addDocument(doc);
          }
          catch(IOException e)
          {
              System.err.println("IOException occured when adding document to Lucene index");
              e.printStackTrace();
          }
          return webDoc;
      }
  }
  
  
  

--
To unsubscribe, e-mail:   <ma...@jakarta.apache.org>
For additional commands, e-mail: <ma...@jakarta.apache.org>