You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by le...@apache.org on 2013/10/08 19:15:10 UTC
svn commit: r1530357 - in /pdfbox/trunk: ./ examples/
examples/src/main/java/org/apache/pdfbox/examples/lucene/ lucene/ lucene/src/
Author: lehmi
Date: Tue Oct 8 17:15:09 2013
New Revision: 1530357
URL: http://svn.apache.org/r1530357
Log:
PDFBOX-1356: moved the lucene stuff to the examples subproject, remove the lucene subproject
Added:
pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/
pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/IndexPDFFiles.java (with props)
pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/LucenePDFDocument.java
- copied, changed from r1502161, pdfbox/trunk/lucene/src/main/java/org/apache/pdfbox/lucene/LucenePDFDocument.java
pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/package.html
- copied, changed from r1502161, pdfbox/trunk/lucene/src/main/java/org/apache/pdfbox/lucene/package.html
Removed:
pdfbox/trunk/lucene/pom.xml
pdfbox/trunk/lucene/src/
Modified:
pdfbox/trunk/examples/pom.xml
pdfbox/trunk/pom.xml
Modified: pdfbox/trunk/examples/pom.xml
URL: http://svn.apache.org/viewvc/pdfbox/trunk/examples/pom.xml?rev=1530357&r1=1530356&r2=1530357&view=diff
==============================================================================
--- pdfbox/trunk/examples/pom.xml (original)
+++ pdfbox/trunk/examples/pom.xml Tue Oct 8 17:15:09 2013
@@ -37,6 +37,10 @@
</description>
<inceptionYear>2002</inceptionYear>
+ <properties>
+ <lucene.version>4.3.1</lucene.version>
+ </properties>
+
<dependencies>
<dependency>
<groupId>org.apache.pdfbox</groupId>
@@ -49,6 +53,21 @@
<version>1.48</version>
<optional>true</optional>
</dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>pdfbox</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-core</artifactId>
+ <version>${lucene.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-analyzers-common</artifactId>
+ <version>${lucene.version}</version>
+ </dependency>
</dependencies>
<build>
Added: pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/IndexPDFFiles.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/IndexPDFFiles.java?rev=1530357&view=auto
==============================================================================
--- pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/IndexPDFFiles.java (added)
+++ pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/IndexPDFFiles.java Tue Oct 8 17:15:09 2013
@@ -0,0 +1,234 @@
+package org.apache.pdfbox.examples.lucene;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.Date;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.IndexWriterConfig.OpenMode;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.util.Version;
+
+/**
+ * Index all pdf files under a directory.
+ * <p>
+ * This is a command-line application demonstrating simple Lucene indexing. Run it with no command-line arguments for
+ * usage information.
+ * <p>
+ * It's based on a demo provided by the lucene project.
+ */
+public class IndexPDFFiles
+{
+
+ private IndexPDFFiles()
+ {
+ }
+
+ /**
+ * Index all PDF files under a directory.
+ *
+ * @param args command line arguments
+ *
+ */
+ public static void main(String[] args)
+ {
+ String usage = "java org.apache.pdfbox.lucene.IndexPDFFiles"
+ + " [-index INDEX_PATH] [-docs DOCS_PATH] [-update]\n\n"
+ + "This indexes all PDF documents in DOCS_PATH, creating a Lucene index"
+ + "in INDEX_PATH that can be searched with SearchFiles";
+ String indexPath = "index";
+ String docsPath = null;
+ boolean create = true;
+ for (int i = 0; i < args.length; i++)
+ {
+ if ("-index".equals(args[i]))
+ {
+ indexPath = args[i + 1];
+ i++;
+ }
+ else if ("-docs".equals(args[i]))
+ {
+ docsPath = args[i + 1];
+ i++;
+ }
+ else if ("-update".equals(args[i]))
+ {
+ create = false;
+ }
+ }
+
+ if (docsPath == null)
+ {
+ System.err.println("Usage: " + usage);
+ System.exit(1);
+ }
+
+ final File docDir = new File(docsPath);
+ if (!docDir.exists() || !docDir.canRead())
+ {
+ System.out.println("Document directory '" + docDir.getAbsolutePath()
+ + "' does not exist or is not readable, please check the path");
+ System.exit(1);
+ }
+
+ Date start = new Date();
+ try
+ {
+ System.out.println("Indexing to directory '" + indexPath + "'...");
+
+ Directory dir = FSDirectory.open(new File(indexPath));
+ Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
+ IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_43, analyzer);
+
+ if (create)
+ {
+ // Create a new index in the directory, removing any
+ // previously indexed documents:
+ iwc.setOpenMode(OpenMode.CREATE);
+ }
+ else
+ {
+ // Add new documents to an existing index:
+ iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
+ }
+
+ // Optional: for better indexing performance, if you
+ // are indexing many documents, increase the RAM
+ // buffer. But if you do this, increase the max heap
+ // size to the JVM (eg add -Xmx512m or -Xmx1g):
+ //
+ // iwc.setRAMBufferSizeMB(256.0);
+
+ IndexWriter writer = new IndexWriter(dir, iwc);
+ indexDocs(writer, docDir);
+
+ // NOTE: if you want to maximize search performance,
+ // you can optionally call forceMerge here. This can be
+ // a terribly costly operation, so generally it's only
+ // worth it when your index is relatively static (ie
+ // you're done adding documents to it):
+ //
+ // writer.forceMerge(1);
+
+ writer.close();
+
+ Date end = new Date();
+ System.out.println(end.getTime() - start.getTime() + " total milliseconds");
+
+ }
+ catch (IOException e)
+ {
+ System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
+ }
+ }
+
+ /**
+ * Indexes the given file using the given writer, or if a directory is given, recurses over files and directories
+ * found under the given directory.
+ *
+ * NOTE: This method indexes one document per input file. This is slow. For good throughput, put multiple documents
+ * into your input file(s). An example of this is in the benchmark module, which can create "line doc" files, one
+ * document per line, using the <a
+ * href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
+ * >WriteLineDocTask</a>.
+ *
+ * @param writer Writer to the index where the given file/dir info will be stored
+ * @param file The file to index, or the directory to recurse into to find files to index
+ * @throws IOException If there is a low-level I/O error
+ */
+ static void indexDocs(IndexWriter writer, File file) throws IOException
+ {
+ // do not try to index files that cannot be read
+ if (file.canRead())
+ {
+ if (file.isDirectory())
+ {
+ String[] files = file.list();
+ // an IO error could occur
+ if (files != null)
+ {
+ for (int i = 0; i < files.length; i++)
+ {
+ indexDocs(writer, new File(file, files[i]));
+ }
+ }
+ }
+ else
+ {
+
+ FileInputStream fis;
+ try
+ {
+ fis = new FileInputStream(file);
+ }
+ catch (FileNotFoundException fnfe)
+ {
+ // at least on windows, some temporary files raise this exception with an "access denied" message
+ // checking if the file can be read doesn't help
+ return;
+ }
+
+ try
+ {
+
+ String path = file.getName().toUpperCase();
+ Document doc = null;
+ if (path.endsWith(".PDF"))
+ {
+ System.out.println("Indexing PDF document: " + file);
+ doc = LucenePDFDocument.getDocument(file);
+ }
+ else
+ {
+ System.out.println("Skipping " + file);
+ return;
+ }
+
+ if (writer.getConfig().getOpenMode() == OpenMode.CREATE)
+ {
+ // New index, so we just add the document (no old document can be there):
+ System.out.println("adding " + file);
+ writer.addDocument(doc);
+ }
+ else
+ {
+ // Existing index (an old copy of this document may have been indexed) so
+ // we use updateDocument instead to replace the old one matching the exact
+ // path, if present:
+ System.out.println("updating " + file);
+ writer.updateDocument(new Term("uid", LucenePDFDocument.createUID(file)), doc);
+ }
+ }
+ finally
+ {
+ fis.close();
+ }
+ }
+ }
+ }
+}
Propchange: pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/IndexPDFFiles.java
------------------------------------------------------------------------------
svn:eol-style = native
Copied: pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/LucenePDFDocument.java (from r1502161, pdfbox/trunk/lucene/src/main/java/org/apache/pdfbox/lucene/LucenePDFDocument.java)
URL: http://svn.apache.org/viewvc/pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/LucenePDFDocument.java?p2=pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/LucenePDFDocument.java&p1=pdfbox/trunk/lucene/src/main/java/org/apache/pdfbox/lucene/LucenePDFDocument.java&r1=1502161&r2=1530357&rev=1530357&view=diff
==============================================================================
--- pdfbox/trunk/lucene/src/main/java/org/apache/pdfbox/lucene/LucenePDFDocument.java (original)
+++ pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/LucenePDFDocument.java Tue Oct 8 17:15:09 2013
@@ -14,272 +14,268 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.pdfbox.lucene;
+package org.apache.pdfbox.examples.lucene;
import java.io.File;
import java.io.FileInputStream;
-import java.io.InputStream;
import java.io.IOException;
+import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;
import java.io.StringWriter;
-import java.util.Calendar;
-
import java.net.URL;
import java.net.URLConnection;
-
+import java.util.Calendar;
import java.util.Date;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
-
-import org.apache.pdfbox.pdmodel.PDDocument;
-import org.apache.pdfbox.pdmodel.PDDocumentInformation;
-
+import org.apache.lucene.document.FieldType;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.document.TextField;
import org.apache.pdfbox.exceptions.CryptographyException;
import org.apache.pdfbox.exceptions.InvalidPasswordException;
-
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.util.PDFTextStripper;
/**
- * This class is used to create a document for the lucene search engine.
- * This should easily plug into the IndexHTML or IndexFiles that comes with
- * the lucene project. This class will populate the following fields.
+ * This class is used to create a document for the lucene search engine. This should easily plug into the IndexPDFFiles
+ * example in this package. This class will populate the following fields.
* <table>
- * <tr>
- * <th>Lucene Field Name</th>
- * <th>Description</th>
- * </tr>
- * <tr>
- * <td>path</td>
- * <td>File system path if loaded from a file</td>
- * </tr>
- * <tr>
- * <td>url</td>
- * <td>URL to PDF document</td>
- * </tr>
- * <tr>
- * <td>contents</td>
- * <td>Entire contents of PDF document, indexed but not stored</td>
- * </tr>
- * <tr>
- * <td>summary</td>
- * <td>First 500 characters of content</td>
- * </tr>
- * <tr>
- * <td>modified</td>
- * <td>The modified date/time according to the url or path</td>
- * </tr>
- * <tr>
- * <td>uid</td>
- * <td>A unique identifier for the Lucene document.</td>
- * </tr>
- * <tr>
- * <td>CreationDate</td>
- * <td>From PDF meta-data if available</td>
- * </tr>
- * <tr>
- * <td>Creator</td>
- * <td>From PDF meta-data if available</td>
- * </tr>
- * <tr>
- * <td>Keywords</td>
- * <td>From PDF meta-data if available</td>
- * </tr>
- * <tr>
- * <td>ModificationDate</td>
- * <td>From PDF meta-data if available</td>
- * </tr>
- * <tr>
- * <td>Producer</td>
- * <td>From PDF meta-data if available</td>
- * </tr>
- * <tr>
- * <td>Subject</td>
- * <td>From PDF meta-data if available</td>
- * </tr>
- * <tr>
- * <td>Trapped</td>
- * <td>From PDF meta-data if available</td>
- * </tr>
+ * <tr>
+ * <th>Lucene Field Name</th>
+ * <th>Description</th>
+ * </tr>
+ * <tr>
+ * <td>path</td>
+ * <td>File system path if loaded from a file</td>
+ * </tr>
+ * <tr>
+ * <td>url</td>
+ * <td>URL to PDF document</td>
+ * </tr>
+ * <tr>
+ * <td>contents</td>
+ * <td>Entire contents of PDF document, indexed but not stored</td>
+ * </tr>
+ * <tr>
+ * <td>summary</td>
+ * <td>First 500 characters of content</td>
+ * </tr>
+ * <tr>
+ * <td>modified</td>
+ * <td>The modified date/time according to the url or path</td>
+ * </tr>
+ * <tr>
+ * <td>uid</td>
+ * <td>A unique identifier for the Lucene document.</td>
+ * </tr>
+ * <tr>
+ * <td>CreationDate</td>
+ * <td>From PDF meta-data if available</td>
+ * </tr>
+ * <tr>
+ * <td>Creator</td>
+ * <td>From PDF meta-data if available</td>
+ * </tr>
+ * <tr>
+ * <td>Keywords</td>
+ * <td>From PDF meta-data if available</td>
+ * </tr>
+ * <tr>
+ * <td>ModificationDate</td>
+ * <td>From PDF meta-data if available</td>
+ * </tr>
+ * <tr>
+ * <td>Producer</td>
+ * <td>From PDF meta-data if available</td>
+ * </tr>
+ * <tr>
+ * <td>Subject</td>
+ * <td>From PDF meta-data if available</td>
+ * </tr>
+ * <tr>
+ * <td>Trapped</td>
+ * <td>From PDF meta-data if available</td>
+ * </tr>
* </table>
- *
+ *
* @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
- * @version $Revision: 1.23 $
+ *
*/
public class LucenePDFDocument
{
private static final char FILE_SEPARATOR = System.getProperty("file.separator").charAt(0);
// given caveat of increased search times when using
- //MICROSECOND, only use SECOND by default
- private DateTools.Resolution dateTimeResolution = DateTools.Resolution.SECOND;
+ // MILLISECOND, only use SECOND by default
+ private static final DateTools.Resolution DATE_TIME_RES = DateTools.Resolution.SECOND;
private PDFTextStripper stripper = null;
- /**
- * Constructor.
- */
- public LucenePDFDocument()
+ private boolean useNonSeqParser;
+
+ /** not Indexed, tokenized, stored. */
+ public static final FieldType TYPE_STORED_NOT_INDEXED = new FieldType();
+
+ static
{
+ TYPE_STORED_NOT_INDEXED.setIndexed(false);
+ TYPE_STORED_NOT_INDEXED.setStored(true);
+ TYPE_STORED_NOT_INDEXED.setTokenized(true);
+ TYPE_STORED_NOT_INDEXED.freeze();
}
/**
- * Set the text stripper that will be used during extraction.
- *
- * @param aStripper The new pdf text stripper.
+ * Constructor.
*/
- public void setTextStripper( PDFTextStripper aStripper )
+ public LucenePDFDocument()
{
- stripper = aStripper;
+ this(false);
}
/**
- * Get the Lucene data time resolution.
- *
- * @return current date/time resolution
+ * Constructor.
+ *
+ * @param nonSequentialParser indicates if the non-sequential parser should be used
+ *
*/
- public DateTools.Resolution getDateTimeResolution()
+ public LucenePDFDocument(boolean nonSequentialParser)
{
- return dateTimeResolution;
+ useNonSeqParser = nonSequentialParser;
}
/**
- * Set the Lucene data time resolution.
- *
- * @param resolution set new date/time resolution
+ * Set the text stripper that will be used during extraction.
+ *
+ * @param aStripper The new pdf text stripper.
*/
- public void setDateTimeResolution( DateTools.Resolution resolution )
+ public void setTextStripper(PDFTextStripper aStripper)
{
- dateTimeResolution = resolution;
+ stripper = aStripper;
}
- //
- // compatibility methods for lucene-1.9+
- //
- private String timeToString( long time )
+ private static String timeToString(long time)
{
- return DateTools.timeToString( time, dateTimeResolution );
+ return DateTools.timeToString(time, DATE_TIME_RES);
}
- private void addKeywordField( Document document, String name, String value )
+ private void addKeywordField(Document document, String name, String value)
{
- if ( value != null )
+ if (value != null)
{
- document.add( new Field( name, value, Field.Store.YES, Field.Index.NOT_ANALYZED ) );
+ document.add(new StringField(name, value, Field.Store.YES));
}
}
- private void addTextField( Document document, String name, Reader value )
+ private void addTextField(Document document, String name, Reader value)
{
- if ( value != null )
+ if (value != null)
{
- document.add( new Field( name, value ) );
+ document.add(new TextField(name, value));
}
}
- private void addTextField( Document document, String name, String value )
+ private void addTextField(Document document, String name, String value)
{
- if ( value != null )
+ if (value != null)
{
- document.add( new Field( name, value, Field.Store.YES, Field.Index.ANALYZED ) );
+ document.add(new TextField(name, value, Field.Store.YES));
}
}
- private void addTextField( Document document, String name, Date value )
+ private void addTextField(Document document, String name, Date value)
{
- if ( value != null )
+ if (value != null)
{
- addTextField( document, name, DateTools.dateToString( value, dateTimeResolution ) );
+ addTextField(document, name, DateTools.dateToString(value, DATE_TIME_RES));
}
}
- private void addTextField( Document document, String name, Calendar value )
+ private void addTextField(Document document, String name, Calendar value)
{
- if ( value != null )
+ if (value != null)
{
- addTextField( document, name, value.getTime() );
+ addTextField(document, name, value.getTime());
}
}
- private static void addUnindexedField( Document document, String name, String value )
+ private static void addUnindexedField(Document document, String name, String value)
{
- if ( value != null )
+ if (value != null)
{
- document.add( new Field( name, value, Field.Store.YES, Field.Index.NO ) );
+ document.add(new Field(name, value, TYPE_STORED_NOT_INDEXED));
}
}
- private void addUnstoredKeywordField( Document document, String name, String value )
+ private void addUnstoredKeywordField(Document document, String name, String value)
{
- if ( value != null )
+ if (value != null)
{
- document.add( new Field( name, value, Field.Store.NO, Field.Index.NOT_ANALYZED ) );
+ document.add(new Field(name, value, TextField.TYPE_NOT_STORED));
}
}
/**
* Convert the PDF stream to a lucene document.
- *
+ *
* @param is The input stream.
* @return The input stream converted to a lucene document.
* @throws IOException If there is an error converting the PDF.
*/
- public Document convertDocument( InputStream is ) throws IOException
+ public Document convertDocument(InputStream is) throws IOException
{
Document document = new Document();
- addContent( document, is, "<inputstream>" );
+ addContent(document, is, "<inputstream>");
return document;
}
/**
* This will take a reference to a PDF document and create a lucene document.
- *
+ *
* @param file A reference to a PDF document.
* @return The converted lucene document.
- *
+ *
* @throws IOException If there is an exception while converting the document.
*/
- public Document convertDocument( File file ) throws IOException
+ public Document convertDocument(File file) throws IOException
{
Document document = new Document();
- // Add the url as a field named "url". Use an UnIndexed field, so
+ // Add the url as a field named "url". Use an UnIndexed field, so
// that the url is just stored with the document, but is not searchable.
- addUnindexedField( document, "path", file.getPath() );
- addUnindexedField( document, "url", file.getPath().replace(FILE_SEPARATOR, '/') );
+ addUnindexedField(document, "path", file.getPath());
+ addUnindexedField(document, "url", file.getPath().replace(FILE_SEPARATOR, '/'));
- // Add the last modified date of the file a field named "modified". Use a
+ // Add the last modified date of the file as a field named "modified". Use a
// Keyword field, so that it's searchable, but so that no attempt is made
// to tokenize the field into words.
- addKeywordField( document, "modified", timeToString( file.lastModified() ) );
+ addKeywordField(document, "modified", timeToString(file.lastModified()));
- String uid = file.getPath().replace(FILE_SEPARATOR,'\u0000')
- + "\u0000"
- + timeToString( file.lastModified() );
+ String uid = createUID(file);
// Add the uid as a field, so that index can be incrementally maintained.
// This field is not stored with document, it is indexed, but it is not
// tokenized prior to indexing.
- addUnstoredKeywordField( document, "uid", uid );
+ addUnstoredKeywordField(document, "uid", uid);
FileInputStream input = null;
try
{
- input = new FileInputStream( file );
- addContent( document, input, file.getPath() );
+ input = new FileInputStream(file);
+ addContent(document, input, file.getPath());
}
finally
{
- if( input != null )
+ if (input != null)
{
input.close();
}
}
-
// return the document
return document;
@@ -287,43 +283,41 @@ public class LucenePDFDocument
/**
* Convert the document from a PDF to a lucene document.
- *
+ *
* @param url A url to a PDF document.
* @return The PDF converted to a lucene document.
* @throws IOException If there is an error while converting the document.
*/
- public Document convertDocument( URL url ) throws IOException
+ public Document convertDocument(URL url) throws IOException
{
Document document = new Document();
URLConnection connection = url.openConnection();
connection.connect();
- // Add the url as a field named "url". Use an UnIndexed field, so
+ // Add the url as a field named "url". Use an UnIndexed field, so
// that the url is just stored with the document, but is not searchable.
- addUnindexedField( document, "url", url.toExternalForm() );
+ addUnindexedField(document, "url", url.toExternalForm());
- // Add the last modified date of the file a field named "modified". Use a
+ // Add the last modified date of the file as a field named "modified". Use a
// Keyword field, so that it's searchable, but so that no attempt is made
// to tokenize the field into words.
- addKeywordField( document, "modified", timeToString(connection.getLastModified() ) );
+ addKeywordField(document, "modified", timeToString(connection.getLastModified()));
- String uid = url.toExternalForm().replace(FILE_SEPARATOR, '\u0000')
- + "\u0000"
- + timeToString( connection.getLastModified() );
+ String uid = createUID(url, connection.getLastModified());
// Add the uid as a field, so that index can be incrementally maintained.
// This field is not stored with document, it is indexed, but it is not
// tokenized prior to indexing.
- addUnstoredKeywordField( document, "uid", uid );
+ addUnstoredKeywordField(document, "uid", uid);
InputStream input = null;
try
{
input = connection.getInputStream();
- addContent( document, input,url.toExternalForm() );
+ addContent(document, input, url.toExternalForm());
}
finally
{
- if( input != null )
+ if (input != null)
{
input.close();
}
@@ -335,74 +329,126 @@ public class LucenePDFDocument
/**
* This will get a lucene document from a PDF file.
- *
+ *
* @param is The stream to read the PDF from.
- *
+ *
* @return The lucene document.
- *
+ *
* @throws IOException If there is an error parsing or indexing the document.
*/
- public static Document getDocument( InputStream is ) throws IOException
+ public static Document getDocument(InputStream is) throws IOException
{
- LucenePDFDocument converter = new LucenePDFDocument();
- return converter.convertDocument( is );
+ return getDocument(is, false);
}
/**
* This will get a lucene document from a PDF file.
- *
+ *
+ * @param is The stream to read the PDF from.
+ * @param nonSeqParser indicates if the non-sequential parser should be used
+ *
+ * @return The lucene document.
+ *
+ * @throws IOException If there is an error parsing or indexing the document.
+ */
+ public static Document getDocument(InputStream is, boolean nonSeqParser) throws IOException
+ {
+ LucenePDFDocument converter = new LucenePDFDocument(nonSeqParser);
+ return converter.convertDocument(is);
+ }
+
+ /**
+ * This will get a lucene document from a PDF file.
+ *
+ * @param file The file to get the document for.
+ *
+ * @return The lucene document.
+ *
+ * @throws IOException If there is an error parsing or indexing the document.
+ */
+ public static Document getDocument(File file) throws IOException
+ {
+ return getDocument(file, false);
+ }
+
+ /**
+ * This will get a lucene document from a PDF file.
+ *
* @param file The file to get the document for.
- *
+ * @param nonSeqParser indicates if the non-sequential parser should be used
+ *
* @return The lucene document.
- *
+ *
* @throws IOException If there is an error parsing or indexing the document.
*/
- public static Document getDocument( File file ) throws IOException
+ public static Document getDocument(File file, boolean nonSeqParser) throws IOException
{
- LucenePDFDocument converter = new LucenePDFDocument();
- return converter.convertDocument( file );
+ LucenePDFDocument converter = new LucenePDFDocument(nonSeqParser);
+ return converter.convertDocument(file);
}
/**
* This will get a lucene document from a PDF file.
- *
+ *
* @param url The file to get the document for.
- *
+ *
* @return The lucene document.
- *
+ *
* @throws IOException If there is an error parsing or indexing the document.
*/
- public static Document getDocument( URL url ) throws IOException
+ public static Document getDocument(URL url) throws IOException
{
- LucenePDFDocument converter = new LucenePDFDocument();
- return converter.convertDocument( url );
+ return getDocument(url, false);
+ }
+
+ /**
+ * This will get a lucene document from a PDF file.
+ *
+ * @param url The file to get the document for.
+ * @param nonSeqParser indicates if the non-sequential parser should be used
+ *
+ * @return The lucene document.
+ *
+ * @throws IOException If there is an error parsing or indexing the document.
+ */
+ public static Document getDocument(URL url, boolean nonSeqParser) throws IOException
+ {
+ LucenePDFDocument converter = new LucenePDFDocument(nonSeqParser);
+ return converter.convertDocument(url);
}
/**
* This will add the contents to the lucene document.
- *
+ *
* @param document The document to add the contents to.
* @param is The stream to get the contents from.
* @param documentLocation The location of the document, used just for debug messages.
- *
+ *
* @throws IOException If there is an error parsing the document.
*/
- private void addContent( Document document, InputStream is, String documentLocation ) throws IOException
+ private void addContent(Document document, InputStream is, String documentLocation) throws IOException
{
PDDocument pdfDocument = null;
try
{
- pdfDocument = PDDocument.load( is );
-
- if( pdfDocument.isEncrypted() )
+ if (useNonSeqParser)
{
- //Just try using the default password and move on
- pdfDocument.decrypt( "" );
+ pdfDocument = PDDocument.loadNonSeq(is, null, "");
+ }
+ else
+ {
+ pdfDocument = PDDocument.load(is);
+
+ if (pdfDocument.isEncrypted())
+ {
+ // Just try using the default password and move on
+ pdfDocument.decrypt("");
+ }
}
- //create a writer where to append the text content.
+ // create a writer where to append the text content.
StringWriter writer = new StringWriter();
- if( stripper == null )
+ if (stripper == null)
{
stripper = new PDFTextStripper();
}
@@ -410,7 +456,7 @@ public class LucenePDFDocument
{
stripper.resetEngine();
}
- stripper.writeText( pdfDocument, writer );
+ stripper.writeText(pdfDocument, writer);
// Note: the buffer to string operation is costless;
// the char array value of the writer buffer and the content string
@@ -418,59 +464,57 @@ public class LucenePDFDocument
// not occur here.
String contents = writer.getBuffer().toString();
- StringReader reader = new StringReader( contents );
+ StringReader reader = new StringReader(contents);
// Add the tag-stripped contents as a Reader-valued Text field so it will
// get tokenized and indexed.
- addTextField( document, "contents", reader );
+ addTextField(document, "contents", reader);
PDDocumentInformation info = pdfDocument.getDocumentInformation();
- if( info != null )
+ if (info != null)
{
- addTextField( document, "Author", info.getAuthor() );
+ addTextField(document, "Author", info.getAuthor());
try
{
- addTextField( document, "CreationDate", info.getCreationDate() );
+ addTextField(document, "CreationDate", info.getCreationDate());
}
- catch( IOException io )
+ catch (IOException io)
{
- //ignore, bad date but continue with indexing
+ // ignore, bad date but continue with indexing
}
- addTextField( document, "Creator", info.getCreator() );
- addTextField( document, "Keywords", info.getKeywords() );
+ addTextField(document, "Creator", info.getCreator());
+ addTextField(document, "Keywords", info.getKeywords());
try
{
- addTextField( document, "ModificationDate", info.getModificationDate() );
+ addTextField(document, "ModificationDate", info.getModificationDate());
}
- catch( IOException io )
+ catch (IOException io)
{
- //ignore, bad date but continue with indexing
+ // ignore, bad date but continue with indexing
}
- addTextField( document, "Producer", info.getProducer() );
- addTextField( document, "Subject", info.getSubject() );
- addTextField( document, "Title", info.getTitle() );
- addTextField( document, "Trapped", info.getTrapped() );
+ addTextField(document, "Producer", info.getProducer());
+ addTextField(document, "Subject", info.getSubject());
+ addTextField(document, "Title", info.getTitle());
+ addTextField(document, "Trapped", info.getTrapped());
}
- int summarySize = Math.min( contents.length(), 500 );
- String summary = contents.substring( 0, summarySize );
+ int summarySize = Math.min(contents.length(), 500);
+ String summary = contents.substring(0, summarySize);
// Add the summary as an UnIndexed field, so that it is stored and returned
// with hit documents for display.
- addUnindexedField( document, "summary", summary );
+ addUnindexedField(document, "summary", summary);
}
- catch( CryptographyException e )
+ catch (CryptographyException e)
{
- throw new IOException( "Error decrypting document(" + documentLocation + "): " + e );
+ throw new IOException("Error decrypting document(" + documentLocation + "): " + e);
}
- catch( InvalidPasswordException e )
+ catch (InvalidPasswordException e)
{
- //they didn't suppply a password and the default of "" was wrong.
- throw new IOException(
- "Error: The document(" + documentLocation +
- ") is encrypted and will not be indexed." );
+ // they didn't supply a password and the default of "" was wrong.
+ throw new IOException("Error: The document(" + documentLocation + ") is encrypted and will not be indexed.");
}
finally
{
- if( pdfDocument != null )
+ if (pdfDocument != null)
{
pdfDocument.close();
}
@@ -478,22 +522,27 @@ public class LucenePDFDocument
}
/**
- * This will test creating a document.
- *
- * usage: java pdfparser.searchengine.lucene.LucenePDFDocument <pdf-document>
- *
- * @param args command line arguments.
- *
- * @throws IOException If there is an error.
- */
- public static void main( String[] args ) throws IOException
- {
- if( args.length != 1 )
- {
- String us = LucenePDFDocument.class.getName();
- System.err.println( "usage: java " + us + " <pdf-document>" );
- System.exit( 1 );
- }
- System.out.println( "Document=" + getDocument( new File( args[0] ) ) );
+ * Create a UID for the given URL using the given time.
+ *
+ * @param url the URL we have to create a UID for
+ * @param time the time to use for the UID
+ *
+ * @return the created UID
+ */
+ public static String createUID(URL url, long time)
+ {
+ return url.toExternalForm().replace(FILE_SEPARATOR, '\u0000') + "\u0000" + timeToString(time);
+ }
+
+ /**
+ * Create a UID for the given file.
+ *
+ * @param file the file we have to create a UID for
+ *
+ * @return the created UID
+ */
+ public static String createUID(File file)
+ {
+ return file.getPath().replace(FILE_SEPARATOR, '\u0000') + "\u0000" + timeToString(file.lastModified());
}
}
Copied: pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/package.html (from r1502161, pdfbox/trunk/lucene/src/main/java/org/apache/pdfbox/lucene/package.html)
URL: http://svn.apache.org/viewvc/pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/package.html?p2=pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/package.html&p1=pdfbox/trunk/lucene/src/main/java/org/apache/pdfbox/lucene/package.html&r1=1502161&r2=1530357&rev=1530357&view=diff
==============================================================================
--- pdfbox/trunk/lucene/src/main/java/org/apache/pdfbox/lucene/package.html (original)
+++ pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/package.html Tue Oct 8 17:15:09 2013
@@ -20,6 +20,6 @@
</head>
<body>
-This package holds classes that are used to integrate the PDFBox project with lucene.
+This example shows how to integrate the PDFBox project with lucene.
</body>
</html>
Modified: pdfbox/trunk/pom.xml
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pom.xml?rev=1530357&r1=1530356&r2=1530357&view=diff
==============================================================================
--- pdfbox/trunk/pom.xml (original)
+++ pdfbox/trunk/pom.xml Tue Oct 8 17:15:09 2013
@@ -50,7 +50,6 @@
<module>pdfbox</module>
<module>preflight</module>
<module>preflight-app</module>
- <module>lucene</module>
<module>ant</module>
<module>war</module>
<module>app</module>