You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-user@lucene.apache.org by sivalingam T <th...@rediffmail.com> on 2004/08/24 15:55:09 UTC
PDF indexing
Hi
I have written a class for indexing PDF files with Lucene. The code is as follows.
This is my IndexPDF file.
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.pdfbox.searchengine.lucene.LucenePDFDocument;
import java.io.File;
import java.util.Date;
import java.util.Arrays;
/**
 * Command-line tool that builds or refreshes a Lucene index of the PDF files
 * found at or below a root path.
 *
 * <p>Usage: {@code IndexPDF [-create] [-index <index>] <root_directory>}
 *
 * <p>With {@code -create} a new index is written from scratch. Without it the
 * tool runs two passes over the existing index: a deletion pass that removes
 * every document carrying a "uid" term, followed by an add pass that indexes
 * every PDF again. Unlike the HTML demo this class was derived from, the
 * directory walker here does not compare files against the uid iterator, so
 * an incremental run is effectively a full re-index.
 *
 * <p>NOTE(review): this relies on LucenePDFDocument storing a "uid" field in
 * the documents it produces -- confirm against the PDFBox version in use.
 */
class IndexPDF {
    /** Usage line printed whenever the command line cannot be understood. */
    private static final String USAGE =
        "IndexPDF [-create] [-index <index>] <root_directory>";

    /** File-name suffix (matched case-insensitively) that marks a PDF file. */
    private static final String PDF_SUFFIX = ".pdf";

    private static boolean deleting = false; // true during deletion pass
    private static IndexReader reader;       // existing index
    private static IndexWriter writer;       // new index being built
    private static TermEnum uidIter;         // document id iterator

    /**
     * Parses the command line, runs the deletion pass when updating an
     * existing index, then adds all PDF documents and optimizes the index.
     *
     * @param argv {@code [-create] [-index <index>] <root_directory>}; the
     *             root must be the last argument
     */
    public static void main(String[] argv) {
        try {
            String index = "index";
            boolean create = false;
            File root = null;

            for (int i = 0; i < argv.length; i++) {
                if (argv[i].equals("-index")) {            // parse -index option
                    if (i + 1 >= argv.length) {            // option value is missing
                        printUsage();
                        return;
                    }
                    index = argv[++i];
                } else if (argv[i].equals("-create")) {    // parse -create option
                    create = true;
                } else if (i != argv.length - 1) {         // root must come last
                    printUsage();
                    return;
                } else {
                    root = new File(argv[i]);
                }
            }
            // Covers both an empty command line and one made of options only.
            if (root == null) {
                printUsage();
                return;
            }

            Date start = new Date();

            if (!create) {                                 // delete stale docs
                deleting = true;
                indexDocs(root, index, create);
            }

            writer = new IndexWriter(index, new StandardAnalyzer(), create);
            try {
                writer.maxFieldLength = 1000000;
                indexDocs(root, index, create);            // add new docs
                System.out.println("Optimizing index...");
                writer.optimize();
            } finally {
                // Close the writer even when indexing fails part-way through.
                writer.close();
            }

            Date end = new Date();
            System.out.print(end.getTime() - start.getTime());
            System.out.println(" total milliseconds");
        } catch (Exception e) {
            System.out.println(" caught a " + e.getClass() +
                "\n with message: " + e.getMessage());
        }
    }

    /** Prints the usage line to standard error. */
    private static void printUsage() {
        System.err.println("Usage: " + USAGE);
    }

    /**
     * Runs one pass over the tree rooted at {@code file}.
     *
     * <p>When {@code create} is true the files are simply added through the
     * shared writer. Otherwise the existing index is opened first; if the
     * {@code deleting} flag is set, every remaining "uid" term is deleted
     * from it and the flag is cleared, and if not, the files are added.
     *
     * @param file   root file or directory to process
     * @param index  path of the Lucene index
     * @param create true when a brand-new index is being built
     * @throws Exception if the index cannot be opened, read or modified
     */
    private static void indexDocs(File file, String index, boolean create)
            throws Exception {
        if (create) {                                      // no existing index to consult
            indexDocs(file);
            return;
        }

        reader = IndexReader.open(index);                  // open existing index
        try {
            uidIter = reader.terms(new Term("uid", ""));   // init uid iterator
            try {
                if (deleting) {                            // delete stale docs
                    // The writer is not open during this pass, so the file walk
                    // is skipped; it could not add anything and would only fail.
                    while (uidIter.term() != null
                            && "uid".equals(uidIter.term().field())) {
                        System.out.println("deleting " +
                            HTMLDocument.uid2url(uidIter.term().text()));
                        reader.delete(uidIter.term());
                        uidIter.next();
                    }
                } else {
                    indexDocs(file);
                }
            } finally {
                uidIter.close();                           // close uid iterator
            }
        } finally {
            deleting = false;
            reader.close();                                // close existing index
        }
    }

    /**
     * Recursively walks {@code file} in sorted name order and adds every PDF
     * file to the shared writer. A failure on one file is reported and the
     * walk continues with the next file.
     *
     * @param file file or directory to index
     * @throws Exception declared for parity with the other overload
     */
    private static void indexDocs(File file) throws Exception {
        if (file.isDirectory()) {
            String[] files = file.list();                  // list its files
            if (files == null) {                           // unreadable directory or I/O error
                System.err.println("Cannot list directory: " + file);
                return;
            }
            Arrays.sort(files);                            // sort the files
            for (int i = 0; i < files.length; i++) {       // recursively index them
                indexDocs(new File(file, files[i]));
            }
            return;                                        // a directory is never itself a PDF
        }

        String path = file.getPath();
        int suffixStart = path.length() - PDF_SUFFIX.length();
        // regionMatches returns false for a negative offset, so short names are safe.
        if (!path.regionMatches(true, suffixStart, PDF_SUFFIX, 0, PDF_SUFFIX.length())) {
            return;
        }

        System.out.println("Indexing PDF document: " + file);
        try {
            writer.addDocument(LucenePDFDocument.getDocument(file));
        } catch (Exception e) {
            // Best effort: report the broken file instead of silently dropping it.
            System.err.println("Failed to index " + file + ": " + e);
        }
    }
}
When I run the following command, the exception below is thrown. If anybody knows the cause, please let me know.
C:\>java org.apache.lucene.demo.IndexPDF -create -index c:\lucene\pdf c:\pdfs\Words.pdf
Indexing PDF document: c:\pdfs\Words.pdf
Exception in thread "main" java.lang.NoClassDefFoundError: org/apache/log4j/Category
at org.pdfbox.searchengine.lucene.LucenePDFDocument.addContent(LucenePDFDocument.java:197)
at org.pdfbox.searchengine.lucene.LucenePDFDocument.getDocument(LucenePDFDocument.java:118)
at org.apache.lucene.demo.IndexPDF.indexDocs(Unknown Source)
at org.apache.lucene.demo.IndexPDF.indexDocs(Unknown Source)
at org.apache.lucene.demo.IndexPDF.main(Unknown Source)
Thanks.
With Warm Regards,
Sivalingam.T
Sai Eswar Innovations (P) Ltd,
Chennai-92
Re: PDF indexing
Posted by Ben Litchfield <be...@csh.rit.edu>.
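You need to add log4j.jar to your classpath. The NoClassDefFoundError for org/apache/log4j/Category means the log4j classes that PDFBox's LucenePDFDocument depends on could not be found at runtime.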
On Tue, 24 Aug 2004, sivalingam T wrote:
> Hi
I have written a class for indexing PDF files with Lucene. The code is as follows.
This is my IndexPDF file.
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.pdfbox.searchengine.lucene.LucenePDFDocument;
import java.io.File;
import java.util.Date;
import java.util.Arrays;
/**
 * Command-line tool that builds or refreshes a Lucene index of the PDF files
 * found at or below a root path.
 *
 * <p>Usage: {@code IndexPDF [-create] [-index <index>] <root_directory>}
 *
 * <p>With {@code -create} a new index is written from scratch. Without it the
 * tool runs two passes over the existing index: a deletion pass that removes
 * every document carrying a "uid" term, followed by an add pass that indexes
 * every PDF again. Unlike the HTML demo this class was derived from, the
 * directory walker here does not compare files against the uid iterator, so
 * an incremental run is effectively a full re-index.
 *
 * <p>NOTE(review): this relies on LucenePDFDocument storing a "uid" field in
 * the documents it produces -- confirm against the PDFBox version in use.
 */
class IndexPDF {
    /** Usage line printed whenever the command line cannot be understood. */
    private static final String USAGE =
        "IndexPDF [-create] [-index <index>] <root_directory>";

    /** File-name suffix (matched case-insensitively) that marks a PDF file. */
    private static final String PDF_SUFFIX = ".pdf";

    private static boolean deleting = false; // true during deletion pass
    private static IndexReader reader;       // existing index
    private static IndexWriter writer;       // new index being built
    private static TermEnum uidIter;         // document id iterator

    /**
     * Parses the command line, runs the deletion pass when updating an
     * existing index, then adds all PDF documents and optimizes the index.
     *
     * @param argv {@code [-create] [-index <index>] <root_directory>}; the
     *             root must be the last argument
     */
    public static void main(String[] argv) {
        try {
            String index = "index";
            boolean create = false;
            File root = null;

            for (int i = 0; i < argv.length; i++) {
                if (argv[i].equals("-index")) {            // parse -index option
                    if (i + 1 >= argv.length) {            // option value is missing
                        printUsage();
                        return;
                    }
                    index = argv[++i];
                } else if (argv[i].equals("-create")) {    // parse -create option
                    create = true;
                } else if (i != argv.length - 1) {         // root must come last
                    printUsage();
                    return;
                } else {
                    root = new File(argv[i]);
                }
            }
            // Covers both an empty command line and one made of options only.
            if (root == null) {
                printUsage();
                return;
            }

            Date start = new Date();

            if (!create) {                                 // delete stale docs
                deleting = true;
                indexDocs(root, index, create);
            }

            writer = new IndexWriter(index, new StandardAnalyzer(), create);
            try {
                writer.maxFieldLength = 1000000;
                indexDocs(root, index, create);            // add new docs
                System.out.println("Optimizing index...");
                writer.optimize();
            } finally {
                // Close the writer even when indexing fails part-way through.
                writer.close();
            }

            Date end = new Date();
            System.out.print(end.getTime() - start.getTime());
            System.out.println(" total milliseconds");
        } catch (Exception e) {
            System.out.println(" caught a " + e.getClass() +
                "\n with message: " + e.getMessage());
        }
    }

    /** Prints the usage line to standard error. */
    private static void printUsage() {
        System.err.println("Usage: " + USAGE);
    }

    /**
     * Runs one pass over the tree rooted at {@code file}.
     *
     * <p>When {@code create} is true the files are simply added through the
     * shared writer. Otherwise the existing index is opened first; if the
     * {@code deleting} flag is set, every remaining "uid" term is deleted
     * from it and the flag is cleared, and if not, the files are added.
     *
     * @param file   root file or directory to process
     * @param index  path of the Lucene index
     * @param create true when a brand-new index is being built
     * @throws Exception if the index cannot be opened, read or modified
     */
    private static void indexDocs(File file, String index, boolean create)
            throws Exception {
        if (create) {                                      // no existing index to consult
            indexDocs(file);
            return;
        }

        reader = IndexReader.open(index);                  // open existing index
        try {
            uidIter = reader.terms(new Term("uid", ""));   // init uid iterator
            try {
                if (deleting) {                            // delete stale docs
                    // The writer is not open during this pass, so the file walk
                    // is skipped; it could not add anything and would only fail.
                    while (uidIter.term() != null
                            && "uid".equals(uidIter.term().field())) {
                        System.out.println("deleting " +
                            HTMLDocument.uid2url(uidIter.term().text()));
                        reader.delete(uidIter.term());
                        uidIter.next();
                    }
                } else {
                    indexDocs(file);
                }
            } finally {
                uidIter.close();                           // close uid iterator
            }
        } finally {
            deleting = false;
            reader.close();                                // close existing index
        }
    }

    /**
     * Recursively walks {@code file} in sorted name order and adds every PDF
     * file to the shared writer. A failure on one file is reported and the
     * walk continues with the next file.
     *
     * @param file file or directory to index
     * @throws Exception declared for parity with the other overload
     */
    private static void indexDocs(File file) throws Exception {
        if (file.isDirectory()) {
            String[] files = file.list();                  // list its files
            if (files == null) {                           // unreadable directory or I/O error
                System.err.println("Cannot list directory: " + file);
                return;
            }
            Arrays.sort(files);                            // sort the files
            for (int i = 0; i < files.length; i++) {       // recursively index them
                indexDocs(new File(file, files[i]));
            }
            return;                                        // a directory is never itself a PDF
        }

        String path = file.getPath();
        int suffixStart = path.length() - PDF_SUFFIX.length();
        // regionMatches returns false for a negative offset, so short names are safe.
        if (!path.regionMatches(true, suffixStart, PDF_SUFFIX, 0, PDF_SUFFIX.length())) {
            return;
        }

        System.out.println("Indexing PDF document: " + file);
        try {
            writer.addDocument(LucenePDFDocument.getDocument(file));
        } catch (Exception e) {
            // Best effort: report the broken file instead of silently dropping it.
            System.err.println("Failed to index " + file + ": " + e);
        }
    }
}
When I run the following command, the exception below is thrown. If anybody knows the cause, please let me know.
C:\>java org.apache.lucene.demo.IndexPDF -create -index c:\lucene\pdf c:\pdfs\Words.pdf
Indexing PDF document: c:\pdfs\Words.pdf
Exception in thread "main" java.lang.NoClassDefFoundError: org/apache/log4j/Category
at org.pdfbox.searchengine.lucene.LucenePDFDocument.addContent(LucenePDFDocument.java:197)
at org.pdfbox.searchengine.lucene.LucenePDFDocument.getDocument(LucenePDFDocument.java:118)
at org.apache.lucene.demo.IndexPDF.indexDocs(Unknown Source)
at org.apache.lucene.demo.IndexPDF.indexDocs(Unknown Source)
at org.apache.lucene.demo.IndexPDF.main(Unknown Source)
Thanks.
With Warm Regards,
> Sivalingam.T
>
> Sai Eswar Innovations (P) Ltd,
> Chennai-92
---------------------------------------------------------------------
To unsubscribe, e-mail: lucene-user-unsubscribe@jakarta.apache.org
For additional commands, e-mail: lucene-user-help@jakarta.apache.org