You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/02/14 00:08:48 UTC

svn commit: r744277 - in /lucene/tika/trunk: CHANGES.txt src/main/java/org/apache/tika/parser/ParsingReader.java src/test/java/org/apache/tika/parser/ParsingReaderTest.java

Author: jukka
Date: Fri Feb 13 23:08:45 2009
New Revision: 744277

URL: http://svn.apache.org/viewvc?rev=744277&view=rev
Log:
TIKA-203: Earlier metadata extraction in ParsingReader

Use buffering to force the parser to extract at least one character of text before returning from the ParsingReader constructor. This way most metadata will be immediately available to the client application, as many document formats put metadata at the beginning before normal document content.

Modified:
    lucene/tika/trunk/CHANGES.txt
    lucene/tika/trunk/src/main/java/org/apache/tika/parser/ParsingReader.java
    lucene/tika/trunk/src/test/java/org/apache/tika/parser/ParsingReaderTest.java

Modified: lucene/tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/CHANGES.txt?rev=744277&r1=744276&r2=744277&view=diff
==============================================================================
--- lucene/tika/trunk/CHANGES.txt (original)
+++ lucene/tika/trunk/CHANGES.txt Fri Feb 13 23:08:45 2009
@@ -28,6 +28,11 @@
   * The text content of Microsoft Outlook message files no longer appears as
     multiple copies in the extracted text. (TIKA-197)
 
+  * The ParsingReader class now makes most document metadata available
+    before any of the extracted text is consumed. This makes it easier,
+    for example, to construct Lucene Document instances that contain
+    both extracted text and metadata. (TIKA-203)
+
 See http://tinyurl.com/tika-0-3-changes for a list of all changes in Tika 0.3.
 
 The following people have contributed to Tika 0.3 by submitting or commenting

Modified: lucene/tika/trunk/src/main/java/org/apache/tika/parser/ParsingReader.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/ParsingReader.java?rev=744277&r1=744276&r2=744277&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/ParsingReader.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/ParsingReader.java Fri Feb 13 23:08:45 2009
@@ -16,6 +16,7 @@
  */
 package org.apache.tika.parser;
 
+import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileNotFoundException;
@@ -24,6 +25,7 @@
 import java.io.PipedReader;
 import java.io.PipedWriter;
 import java.io.Reader;
+import java.io.Writer;
 
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.sax.BodyContentHandler;
@@ -47,14 +49,14 @@
     private final Parser parser;
 
     /**
-     * Read end of the pipe.
+     * Buffered read end of the pipe.
      */
-    private final PipedReader reader;
+    private final Reader reader;
 
     /**
      * Write end of the pipe.
      */
-    private final PipedWriter writer;
+    private final Writer writer;
 
     /**
      * The binary stream being parsed.
@@ -90,8 +92,9 @@
      * Creates a reader for the text content of the given binary stream.
      *
      * @param stream binary stream
+     * @throws IOException if the document can not be parsed
      */
-    public ParsingReader(InputStream stream) {
+    public ParsingReader(InputStream stream) throws IOException {
         this(new AutoDetectParser(), stream, new Metadata());
     }
 
@@ -101,8 +104,9 @@
      *
      * @param stream binary stream
      * @param name document name
+     * @throws IOException if the document can not be parsed
      */
-    public ParsingReader(InputStream stream, String name) {
+    public ParsingReader(InputStream stream, String name) throws IOException {
         this(new AutoDetectParser(), stream, getMetadata(name));
     }
 
@@ -110,8 +114,10 @@
      * Creates a reader for the text content of the given file.
      *
      * @param file file
+     * @throws FileNotFoundException if the given file does not exist
+     * @throws IOException if the document can not be parsed
      */
-    public ParsingReader(File file) throws FileNotFoundException {
+    public ParsingReader(File file) throws FileNotFoundException, IOException {
         this(new FileInputStream(file), file.getName());
     }
 
@@ -123,12 +129,15 @@
      * @param parser parser instance
      * @param stream binary stream
      * @param metadata document metadata
+     * @throws IOException if the document can not be parsed
      */
-    public ParsingReader(Parser parser, InputStream stream, Metadata metadata) {
+    public ParsingReader(Parser parser, InputStream stream, Metadata metadata)
+            throws IOException {
         this.parser = parser;
-        this.reader = new PipedReader();
+        PipedReader pipedReader = new PipedReader();
+        this.reader = new BufferedReader(pipedReader);
         try {
-            this.writer = new PipedWriter(reader);
+            this.writer = new PipedWriter(pipedReader);
         } catch (IOException e) {
             throw new IllegalStateException(e); // Should never happen
         }
@@ -142,6 +151,11 @@
             name = "Apache Tika";
         }
         new Thread(new ParsingThread(), name).start();
+
+        // TIKA-203: Buffer first character to force metadata extraction
+        reader.mark(1);
+        reader.read();
+        reader.reset();
     }
 
     /**

Modified: lucene/tika/trunk/src/test/java/org/apache/tika/parser/ParsingReaderTest.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/test/java/org/apache/tika/parser/ParsingReaderTest.java?rev=744277&r1=744276&r2=744277&view=diff
==============================================================================
--- lucene/tika/trunk/src/test/java/org/apache/tika/parser/ParsingReaderTest.java (original)
+++ lucene/tika/trunk/src/test/java/org/apache/tika/parser/ParsingReaderTest.java Fri Feb 13 23:08:45 2009
@@ -20,6 +20,8 @@
 import java.io.InputStream;
 import java.io.Reader;
 
+import org.apache.tika.metadata.Metadata;
+
 import junit.framework.TestCase;
 
 public class ParsingReaderTest extends TestCase {
@@ -68,4 +70,30 @@
         assertEquals(-1, stream.read());
     }
 
+    /**
+     * Test case for TIKA-203
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-203">TIKA-203</a>
+     */
+    public void testMetadata() throws Exception {
+        Metadata metadata = new Metadata();
+        InputStream stream = ParsingReaderTest.class.getResourceAsStream(
+                "/test-documents/testEXCEL.xls");
+        Reader reader =
+            new ParsingReader(new AutoDetectParser(), stream, metadata);
+        try {
+            // Metadata should already be available
+            assertEquals("Simple Excel document", metadata.get(Metadata.TITLE));
+            // Check that the internal buffering isn't broken
+            assertEquals('F', (char) reader.read());
+            assertEquals('e', (char) reader.read());
+            assertEquals('u', (char) reader.read());
+            assertEquals('i', (char) reader.read());
+            assertEquals('l', (char) reader.read());
+            assertEquals('1', (char) reader.read());
+        } finally {
+            reader.close();
+        }
+    }
+
 }