You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-user@lucene.apache.org by sivalingam T <th...@rediffmail.com> on 2004/08/24 15:55:09 UTC

PDF indexing

  Hi

I have written a file for PDF indexing. The code is as follows:

This is my IndexPDF file.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;

import org.pdfbox.searchengine.lucene.LucenePDFDocument;

import java.io.File;
import java.util.Date;
import java.util.Arrays;

/**
 * Command-line tool that builds or updates a Lucene index from the PDF files
 * found under a root file or directory.
 *
 * Usage: IndexPDF [-create] [-index &lt;index&gt;] &lt;root_directory&gt;
 *
 * With -create a new index is written. Without it, main() first runs a
 * deletion pass over the existing index and then an add pass.
 */
class IndexPDF {
  private static boolean deleting = false;       // true during deletion pass
  private static IndexReader reader;            // existing index
  private static IndexWriter writer;            // new index being built
  private static TermEnum uidIter;            // document id iterator

  /**
   * Parses the command line, runs the optional deletion pass, then adds the
   * PDF documents, optimizes the index and prints the elapsed time.
   *
   * @param argv command-line arguments: [-create] [-index index] root
   */
  public static void main(String[] argv) {
    try {
      String index = "index";
      boolean create = false;
      File root = null;

      String usage = "IndexPDF [-create] [-index <index>] <root_directory>";

      if (argv.length == 0) {
        System.err.println("Usage: " + usage);
        return;
      }

      for (int i = 0; i < argv.length; i++) {
        if (argv[i].equals("-index")) {            // parse -index option
          if (i + 1 >= argv.length) {              // -index needs a value
            System.err.println("Usage: " + usage);
            return;
          }
          index = argv[++i];
        } else if (argv[i].equals("-create")) {    // parse -create option
          create = true;
        } else if (i != argv.length - 1) {         // root must be the last argument
          System.err.println("Usage: " + usage);
          return;
        } else {
          root = new File(argv[i]);
        }
      }

      if (root == null) {                          // no root argument was given
        System.err.println("Usage: " + usage);
        return;
      }

      Date start = new Date();

      if (!create) {                      // delete stale docs
        deleting = true;
        indexDocs(root, index, create);
      }

      writer = new IndexWriter(index, new StandardAnalyzer(), create);
      try {
        writer.maxFieldLength = 1000000;

        indexDocs(root, index, create);            // add new docs

        System.out.println("Optimizing index...");
        writer.optimize();
      } finally {
        writer.close();                            // close the writer even on failure
      }

      Date end = new Date();

      System.out.print(end.getTime() - start.getTime());
      System.out.println(" total milliseconds");

    } catch (Exception e) {
      System.out.println(" caught a " + e.getClass() +
                "\n with message: " + e.getMessage());
    }
  }

  /**
   * Runs one pass over the file tree rooted at <code>file</code>.
   *
   * When <code>create</code> is false the existing index is opened and an
   * iterator over terms starting at field "uid" is positioned. After the
   * tree walk, if this is the deletion pass, every remaining "uid" term the
   * iterator yields is deleted through the reader. Note that
   * indexDocs(File) never advances the iterator, so no per-file comparison
   * against existing uids takes place in this class.
   *
   * NOTE(review): HTMLDocument is not imported here; presumably it is
   * resolved from the same package -- confirm.
   *
   * @param file   root file or directory to walk
   * @param index  location of the index
   * @param create true when a fresh index is being built
   * @throws Exception if the index cannot be read or a walk step fails
   */
  private static void indexDocs(File file, String index, boolean create)
      throws Exception {
    if (!create) {                      // incrementally update

      reader = IndexReader.open(index);            // open existing index
      try {
        uidIter = reader.terms(new Term("uid", "")); // init uid iterator
        try {
          indexDocs(file);

          if (deleting) {                      // delete rest of stale docs
            while (uidIter.term() != null
                && "uid".equals(uidIter.term().field())) {
              System.out.println("deleting " +
                          HTMLDocument.uid2url(uidIter.term().text()));
              reader.delete(uidIter.term());
              uidIter.next();
            }
            deleting = false;
          }
        } finally {
          uidIter.close();                      // close uid iterator
        }
      } finally {
        reader.close();                      // close existing index
      }

    } else {                           // don't have existing
      indexDocs(file);
    }
  }

  /**
   * Recursively walks <code>file</code>. Directories are listed, sorted by
   * name and descended into; regular files whose path ends in ".pdf"
   * (any letter case) are converted with LucenePDFDocument and added to the
   * writer. During the deletion pass nothing is added, because main() opens
   * the writer only after that pass has finished.
   *
   * @param file file or directory to process
   * @throws Exception if a recursive step fails
   */
  private static void indexDocs(File file) throws Exception {
    if (file.isDirectory()) {                 // if a directory
      String[] files = file.list();            // list its files
      if (files == null) {                     // list() returns null on I/O error
        System.err.println("Cannot list directory: " + file);
        return;
      }
      Arrays.sort(files);                 // sort the files
      for (int i = 0; i < files.length; i++) {  // recursively index them
        indexDocs(new File(file, files[i]));
      }
    } else if (isPdf(file)) {
      if (deleting) {
        // Deletion pass: the writer is not open yet, so there is nothing to add.
        return;
      }
      System.out.println( "Indexing PDF document: " + file );
      try {
        writer.addDocument(LucenePDFDocument.getDocument( file));
      } catch (Exception e) {
        // Best effort: report the failure and continue with the next file.
        System.err.println("Failed to index " + file + ": " + e);
      }
    }
  }

  /** Returns true when the path of <code>file</code> ends in ".pdf", ignoring case. */
  private static boolean isPdf(File file) {
    String path = file.getPath();
    // regionMatches returns false when the offset is negative (path too short).
    return path.regionMatches(true, path.length() - 4, ".pdf", 0, 4);
  }

}

When I run the following command, the exception below is thrown. If anybody knows the cause, please let me know.


C:\>java org.apache.lucene.demo.IndexPDF -create -index c:\lucene\pdf c:\pdfs\Words.pdf

Indexing PDF document: c:\pdfs\Words.pdf
Exception in thread "main" java.lang.NoClassDefFoundError: org/apache/log4j/Cate
gory
        at org.pdfbox.searchengine.lucene.LucenePDFDocument.addContent(LucenePDF
Document.java:197)
        at org.pdfbox.searchengine.lucene.LucenePDFDocument.getDocument(LucenePD
FDocument.java:118)
        at org.apache.lucene.demo.IndexPDF.indexDocs(Unknown Source)
        at org.apache.lucene.demo.IndexPDF.indexDocs(Unknown Source)
        at org.apache.lucene.demo.IndexPDF.main(Unknown Source)



Thanks.






With Warm Regards,
Sivalingam.T

Sai Eswar Innovations (P) Ltd,
Chennai-92

Re: PDF indexing

Posted by Ben Litchfield <be...@csh.rit.edu>.

You need to add the log4j.jar to your classpath.



On Tue, 24 Aug 2004, sivalingam T wrote:

>  Hi

I have written one files for PDF Indexing. Here I have written as follows ..

This is my IndexPDF file.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;

import org.pdfbox.searchengine.lucene.LucenePDFDocument;

import java.io.File;
import java.util.Date;
import java.util.Arrays;

/**
 * Command-line tool that builds or updates a Lucene index from the PDF files
 * found under a root file or directory.
 *
 * Usage: IndexPDF [-create] [-index &lt;index&gt;] &lt;root_directory&gt;
 *
 * With -create a new index is written. Without it, main() first runs a
 * deletion pass over the existing index and then an add pass.
 */
class IndexPDF {
  private static boolean deleting = false;       // true during deletion pass
  private static IndexReader reader;            // existing index
  private static IndexWriter writer;            // new index being built
  private static TermEnum uidIter;            // document id iterator

  /**
   * Parses the command line, runs the optional deletion pass, then adds the
   * PDF documents, optimizes the index and prints the elapsed time.
   *
   * @param argv command-line arguments: [-create] [-index index] root
   */
  public static void main(String[] argv) {
    try {
      String index = "index";
      boolean create = false;
      File root = null;

      String usage = "IndexPDF [-create] [-index <index>] <root_directory>";

      if (argv.length == 0) {
        System.err.println("Usage: " + usage);
        return;
      }

      for (int i = 0; i < argv.length; i++) {
        if (argv[i].equals("-index")) {            // parse -index option
          if (i + 1 >= argv.length) {              // -index needs a value
            System.err.println("Usage: " + usage);
            return;
          }
          index = argv[++i];
        } else if (argv[i].equals("-create")) {    // parse -create option
          create = true;
        } else if (i != argv.length - 1) {         // root must be the last argument
          System.err.println("Usage: " + usage);
          return;
        } else {
          root = new File(argv[i]);
        }
      }

      if (root == null) {                          // no root argument was given
        System.err.println("Usage: " + usage);
        return;
      }

      Date start = new Date();

      if (!create) {                      // delete stale docs
        deleting = true;
        indexDocs(root, index, create);
      }

      writer = new IndexWriter(index, new StandardAnalyzer(), create);
      try {
        writer.maxFieldLength = 1000000;

        indexDocs(root, index, create);            // add new docs

        System.out.println("Optimizing index...");
        writer.optimize();
      } finally {
        writer.close();                            // close the writer even on failure
      }

      Date end = new Date();

      System.out.print(end.getTime() - start.getTime());
      System.out.println(" total milliseconds");

    } catch (Exception e) {
      System.out.println(" caught a " + e.getClass() +
                "\n with message: " + e.getMessage());
    }
  }

  /**
   * Runs one pass over the file tree rooted at <code>file</code>.
   *
   * When <code>create</code> is false the existing index is opened and an
   * iterator over terms starting at field "uid" is positioned. After the
   * tree walk, if this is the deletion pass, every remaining "uid" term the
   * iterator yields is deleted through the reader. Note that
   * indexDocs(File) never advances the iterator, so no per-file comparison
   * against existing uids takes place in this class.
   *
   * NOTE(review): HTMLDocument is not imported here; presumably it is
   * resolved from the same package -- confirm.
   *
   * @param file   root file or directory to walk
   * @param index  location of the index
   * @param create true when a fresh index is being built
   * @throws Exception if the index cannot be read or a walk step fails
   */
  private static void indexDocs(File file, String index, boolean create)
      throws Exception {
    if (!create) {                      // incrementally update

      reader = IndexReader.open(index);            // open existing index
      try {
        uidIter = reader.terms(new Term("uid", "")); // init uid iterator
        try {
          indexDocs(file);

          if (deleting) {                      // delete rest of stale docs
            while (uidIter.term() != null
                && "uid".equals(uidIter.term().field())) {
              System.out.println("deleting " +
                          HTMLDocument.uid2url(uidIter.term().text()));
              reader.delete(uidIter.term());
              uidIter.next();
            }
            deleting = false;
          }
        } finally {
          uidIter.close();                      // close uid iterator
        }
      } finally {
        reader.close();                      // close existing index
      }

    } else {                           // don't have existing
      indexDocs(file);
    }
  }

  /**
   * Recursively walks <code>file</code>. Directories are listed, sorted by
   * name and descended into; regular files whose path ends in ".pdf"
   * (any letter case) are converted with LucenePDFDocument and added to the
   * writer. During the deletion pass nothing is added, because main() opens
   * the writer only after that pass has finished.
   *
   * @param file file or directory to process
   * @throws Exception if a recursive step fails
   */
  private static void indexDocs(File file) throws Exception {
    if (file.isDirectory()) {                 // if a directory
      String[] files = file.list();            // list its files
      if (files == null) {                     // list() returns null on I/O error
        System.err.println("Cannot list directory: " + file);
        return;
      }
      Arrays.sort(files);                 // sort the files
      for (int i = 0; i < files.length; i++) {  // recursively index them
        indexDocs(new File(file, files[i]));
      }
    } else if (isPdf(file)) {
      if (deleting) {
        // Deletion pass: the writer is not open yet, so there is nothing to add.
        return;
      }
      System.out.println( "Indexing PDF document: " + file );
      try {
        writer.addDocument(LucenePDFDocument.getDocument( file));
      } catch (Exception e) {
        // Best effort: report the failure and continue with the next file.
        System.err.println("Failed to index " + file + ": " + e);
      }
    }
  }

  /** Returns true when the path of <code>file</code> ends in ".pdf", ignoring case. */
  private static boolean isPdf(File file) {
    String path = file.getPath();
    // regionMatches returns false when the offset is negative (path too short).
    return path.regionMatches(true, path.length() - 4, ".pdf", 0, 4);
  }

}

when i use the following commands, the exceptions are thrown if anybody know please inform me.


C:\>java org.apache.lucene.demo.IndexPDF -create -index c:\lucene\pdf c:\pdfs\Words.pdf

Indexing PDF document: c:\pdfs\Words.pdf
Exception in thread "main" java.lang.NoClassDefFoundError: org/apache/log4j/Cate
gory
        at org.pdfbox.searchengine.lucene.LucenePDFDocument.addContent(LucenePDF
Document.java:197)
        at org.pdfbox.searchengine.lucene.LucenePDFDocument.getDocument(LucenePD
FDocument.java:118)
        at org.apache.lucene.demo.IndexPDF.indexDocs(Unknown Source)
        at org.apache.lucene.demo.IndexPDF.indexDocs(Unknown Source)
        at org.apache.lucene.demo.IndexPDF.main(Unknown Source)



Thanks.






With Warm Regards,
> Sivalingam.T
>
> Sai Eswar Innovations (P) Ltd,
> Chennai-92

---------------------------------------------------------------------
To unsubscribe, e-mail: lucene-user-unsubscribe@jakarta.apache.org
For additional commands, e-mail: lucene-user-help@jakarta.apache.org