You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2008/12/01 00:15:11 UTC

svn commit: r721926 [2/5] - in /lucene/tika/trunk/src: main/java/org/apache/tika/cli/ main/java/org/apache/tika/gui/ main/java/org/apache/tika/mime/ main/java/org/apache/tika/parser/ main/java/org/apache/tika/parser/audio/ main/java/org/apache/tika/par...

Modified: lucene/tika/trunk/src/main/java/org/apache/tika/parser/ParsingReader.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/ParsingReader.java?rev=721926&r1=721925&r2=721926&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/ParsingReader.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/ParsingReader.java Sun Nov 30 15:15:09 2008
@@ -1,219 +1,219 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser;
-
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.PipedReader;
-import java.io.PipedWriter;
-import java.io.Reader;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.sax.BodyContentHandler;
-import org.xml.sax.ContentHandler;
-
-/**
- * Reader for the text content from a given binary stream. This class
- * starts a background thread and uses a {@link Parser}
- * ({@link AutoDetectParser) by default) to parse the text content from
- * a given input stream. The {@link BodyContentHandler} class and a pipe
- * is used to convert the push-based SAX event stream to the pull-based
- * character stream defined by the {@link Reader} interface.
- *
- * @since Apache Tika 0.2
- */
-public class ParsingReader extends Reader {
-
-    /**
-     * Parser instance used for parsing the given binary stream.
-     */
-    private final Parser parser;
-
-    /**
-     * Read end of the pipe.
-     */
-    private final PipedReader reader;
-
-    /**
-     * Write end of the pipe.
-     */
-    private final PipedWriter writer;
-
-    /**
-     * The binary stream being parsed.
-     */
-    private final InputStream stream;
-
-    /**
-     * Metadata associated with the document being parsed.
-     */
-    private final Metadata metadata;
-
-    /**
-     * An exception (if any) thrown by the parsing thread.
-     */
-    private Throwable throwable;
-
-    /**
-     * Utility method that returns a {@link Metadata} instance
-     * for a document with the given name.
-     *
-     * @param name resource name (or <code>null</code>)
-     * @return metadata instance
-     */
-    private static Metadata getMetadata(String name) {
-        Metadata metadata = new Metadata();
-        if (name != null && name.length() > 0) {
-            metadata.set(Metadata.RESOURCE_NAME_KEY, name);
-        }
-        return metadata;
-    }
-
-    /**
-     * Creates a reader for the text content of the given binary stream.
-     *
-     * @param stream binary stream
-     */
-    public ParsingReader(InputStream stream) {
-        this(new AutoDetectParser(), stream, new Metadata());
-    }
-
-    /**
-     * Creates a reader for the text content of the given binary stream
-     * with the given name.
-     *
-     * @param stream binary stream
-     * @param name document name
-     */
-    public ParsingReader(InputStream stream, String name) {
-        this(new AutoDetectParser(), stream, getMetadata(name));
-    }
-
-    /**
-     * Creates a reader for the text content of the given file.
-     *
-     * @param file file
-     */
-    public ParsingReader(File file) throws FileNotFoundException {
-        this(new FileInputStream(file), file.getName());
-    }
-
-    /**
-     * Creates a reader for the text content of the given binary stream
-     * with the given document metadata. The given parser is used for
-     * parsing.
-     *
-     * @param parser parser instance
-     * @param stream binary stream
-     * @param metadata document metadata
-     */
-    public ParsingReader(Parser parser, InputStream stream, Metadata metadata) {
-        this.parser = parser;
-        this.reader = new PipedReader();
-        try {
-            this.writer = new PipedWriter(reader);
-        } catch (IOException e) {
-            throw new IllegalStateException(e); // Should never happen
-        }
-        this.stream = stream;
-        this.metadata = metadata;
-
-        String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
-        if (name != null) {
-            name = "Apache Tika: " + name;
-        } else {
-            name = "Apache Tika";
-        }
-        new Thread(new ParsingThread(), name).start();
-    }
-
-    /**
-     * The background parsing thread.
-     */
-    private class ParsingThread implements Runnable {
-
-        /**
-         * Parses the given binary stream and writes the text content
-         * to the write end of the pipe. Potential exceptions (including
-         * the one caused if the read end is closed unexpectedly) are
-         * stored before the input stream is closed and processing is stopped.
-         */
-        public void run() {
-            try {
-                ContentHandler handler = new BodyContentHandler(writer);
-                parser.parse(stream, handler, metadata);
-            } catch (Throwable t) {
-                throwable = t;
-            }
-
-            try {
-                stream.close();
-            } catch (Throwable t) {
-                if (throwable == null) {
-                    throwable = t;
-                }
-            }
-
-            try {
-                writer.close();
-            } catch (Throwable t) {
-                if (throwable == null) {
-                    throwable = t;
-                }
-            }
-        }
-
-    }
-
-    /**
-     * Reads parsed text from the pipe connected to the parsing thread.
-     * Fails if the parsing thread has thrown an exception.
-     *
-     * @param cbuff character buffer
-     * @param off start offset within the buffer
-     * @param len maximum number of characters to read
-     * @throws IOException if the parsing thread has failed or
-     *                     if for some reason the pipe does not work properly
-     */
-    @Override
-    public int read(char[] cbuf, int off, int len) throws IOException {
-        if (throwable instanceof IOException) {
-            throw (IOException) throwable;
-        } else if (throwable != null) {
-            IOException exception = new IOException("");
-            exception.initCause(throwable);
-            throw exception;
-        }
-        return reader.read(cbuf, off, len);
-    }
-
-    /**
-     * Closes the read end of the pipe. If the parsing thread is still
-     * running, next write to the pipe will fail and cause the thread
-     * to stop. Thus there is no need to explicitly terminate the thread.
-     *
-     * @throws IOException if the pipe can not be closed
-     */
-    @Override
-    public void close() throws IOException {
-        reader.close();
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PipedReader;
+import java.io.PipedWriter;
+import java.io.Reader;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Reader for the text content from a given binary stream. This class
+ * starts a background thread and uses a {@link Parser}
+ * ({@link AutoDetectParser} by default) to parse the text content from
+ * a given input stream. The {@link BodyContentHandler} class and a pipe
+ * are used to convert the push-based SAX event stream to the pull-based
+ * character stream defined by the {@link Reader} interface.
+ *
+ * @since Apache Tika 0.2
+ */
+public class ParsingReader extends Reader {
+
+    /**
+     * Parser instance used for parsing the given binary stream.
+     */
+    private final Parser parser;
+
+    /**
+     * Read end of the pipe.
+     */
+    private final PipedReader reader;
+
+    /**
+     * Write end of the pipe.
+     */
+    private final PipedWriter writer;
+
+    /**
+     * The binary stream being parsed.
+     */
+    private final InputStream stream;
+
+    /**
+     * Metadata associated with the document being parsed.
+     */
+    private final Metadata metadata;
+
+    /**
+     * An exception (if any) thrown by the parsing thread. The field is
+     * assigned by the background parsing thread and inspected by the
+     * thread that calls {@link #read(char[], int, int)}; it is volatile
+     * so that a stored failure is visible to the reading thread.
+     */
+    private volatile Throwable throwable;
+
+    /**
+     * Utility method that returns a {@link Metadata} instance
+     * for a document with the given name. The name is recorded under
+     * {@link Metadata#RESOURCE_NAME_KEY} unless it is <code>null</code>
+     * or empty.
+     *
+     * @param name resource name (or <code>null</code>)
+     * @return metadata instance
+     */
+    private static Metadata getMetadata(String name) {
+        Metadata metadata = new Metadata();
+        if (name != null && name.length() > 0) {
+            metadata.set(Metadata.RESOURCE_NAME_KEY, name);
+        }
+        return metadata;
+    }
+
+    /**
+     * Creates a reader for the text content of the given binary stream.
+     *
+     * @param stream binary stream
+     */
+    public ParsingReader(InputStream stream) {
+        this(new AutoDetectParser(), stream, new Metadata());
+    }
+
+    /**
+     * Creates a reader for the text content of the given binary stream
+     * with the given name.
+     *
+     * @param stream binary stream
+     * @param name document name
+     */
+    public ParsingReader(InputStream stream, String name) {
+        this(new AutoDetectParser(), stream, getMetadata(name));
+    }
+
+    /**
+     * Creates a reader for the text content of the given file.
+     *
+     * @param file file
+     * @throws FileNotFoundException if the given file can not be opened
+     *                               for reading
+     */
+    public ParsingReader(File file) throws FileNotFoundException {
+        this(new FileInputStream(file), file.getName());
+    }
+
+    /**
+     * Creates a reader for the text content of the given binary stream
+     * with the given document metadata. The given parser is used for
+     * parsing. The background parsing thread is started by this
+     * constructor; it closes the given stream once parsing has ended.
+     *
+     * @param parser parser instance
+     * @param stream binary stream
+     * @param metadata document metadata
+     */
+    public ParsingReader(Parser parser, InputStream stream, Metadata metadata) {
+        this.parser = parser;
+        this.reader = new PipedReader();
+        try {
+            this.writer = new PipedWriter(reader);
+        } catch (IOException e) {
+            throw new IllegalStateException(e); // Should never happen
+        }
+        this.stream = stream;
+        this.metadata = metadata;
+
+        // Name the thread after the document to make thread dumps readable
+        String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
+        if (name != null) {
+            name = "Apache Tika: " + name;
+        } else {
+            name = "Apache Tika";
+        }
+        new Thread(new ParsingThread(), name).start();
+    }
+
+    /**
+     * The background parsing thread.
+     */
+    private class ParsingThread implements Runnable {
+
+        /**
+         * Parses the given binary stream and writes the text content
+         * to the write end of the pipe. Potential exceptions (including
+         * the one caused if the read end is closed unexpectedly) are
+         * stored before the input stream is closed and processing is stopped.
+         * Only the first failure is kept; failures from closing the
+         * stream or the pipe never replace an earlier one.
+         */
+        public void run() {
+            try {
+                ContentHandler handler = new BodyContentHandler(writer);
+                parser.parse(stream, handler, metadata);
+            } catch (Throwable t) {
+                throwable = t;
+            }
+
+            try {
+                stream.close();
+            } catch (Throwable t) {
+                if (throwable == null) {
+                    throwable = t;
+                }
+            }
+
+            try {
+                writer.close();
+            } catch (Throwable t) {
+                if (throwable == null) {
+                    throwable = t;
+                }
+            }
+        }
+
+    }
+
+    /**
+     * Reads parsed text from the pipe connected to the parsing thread.
+     * Fails if the parsing thread has thrown an exception. A stored
+     * {@link IOException} is rethrown as such; any other failure is
+     * wrapped as the cause of a new {@link IOException}.
+     *
+     * @param cbuf character buffer
+     * @param off start offset within the buffer
+     * @param len maximum number of characters to read
+     * @return the value returned by the read end of the pipe
+     * @throws IOException if the parsing thread has failed or
+     *                     if for some reason the pipe does not work properly
+     */
+    @Override
+    public int read(char[] cbuf, int off, int len) throws IOException {
+        // Take a single snapshot so the checks below see one consistent value
+        Throwable failure = throwable;
+        if (failure instanceof IOException) {
+            throw (IOException) failure;
+        } else if (failure != null) {
+            IOException exception = new IOException("");
+            exception.initCause(failure);
+            throw exception;
+        }
+        return reader.read(cbuf, off, len);
+    }
+
+    /**
+     * Closes the read end of the pipe. If the parsing thread is still
+     * running, next write to the pipe will fail and cause the thread
+     * to stop. Thus there is no need to explicitly terminate the thread.
+     *
+     * @throws IOException if the pipe can not be closed
+     */
+    @Override
+    public void close() throws IOException {
+        reader.close();
+    }
+
+}

Propchange: lucene/tika/trunk/src/main/java/org/apache/tika/parser/ParsingReader.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/tika/trunk/src/main/java/org/apache/tika/parser/audio/AudioParser.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/tika/trunk/src/main/java/org/apache/tika/parser/audio/MidiParser.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/tika/trunk/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java?rev=721926&r1=721925&r2=721926&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java Sun Nov 30 15:15:09 2008
@@ -1,77 +1,77 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.html;
-
-import javax.xml.XMLConstants;
-
-import org.apache.tika.sax.ContentHandlerDecorator;
-import org.xml.sax.Attributes;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.AttributesImpl;
-
-/**
- * Content handler decorator that downgrades XHTML elements to
- * old-style HTML elements before passing them on to the decorated
- * content handler. This downgrading consists of dropping all namespaces
- * (and namespaced attributes) and uppercasing all element names.
- * Used by the {@link HtmlParser} to make all incoming HTML look the same.
- */
-class XHTMLDowngradeHandler extends ContentHandlerDecorator {
-
-    public XHTMLDowngradeHandler(ContentHandler handler) {
-        super(handler);
-    }
-
-    @Override
-    public void startElement(
-            String uri, String localName, String name, Attributes atts)
-            throws SAXException {
-        String upper = localName.toUpperCase();
-
-        AttributesImpl attributes = new AttributesImpl();
-        for (int i = 0; i < atts.getLength(); i++) {
-            String local = atts.getLocalName(i);
-            String qname = atts.getQName(i);
-            if (!XMLConstants.NULL_NS_URI.equals(atts.getURI(i).length())
-                    && !local.equals(XMLConstants.XMLNS_ATTRIBUTE)
-                    && !qname.startsWith(XMLConstants.XMLNS_ATTRIBUTE + ":")) {
-                attributes.addAttribute(
-                        atts.getURI(i), local, qname,
-                        atts.getType(i), atts.getValue(i));
-            }
-        }
-
-        super.startElement(XMLConstants.NULL_NS_URI, upper, upper, attributes);
-    }
-
-    @Override
-    public void endElement(String uri, String localName, String name)
-            throws SAXException {
-        String upper = localName.toUpperCase();
-        super.endElement(XMLConstants.NULL_NS_URI, upper, upper);
-    }
-
-    @Override
-    public void startPrefixMapping(String prefix, String uri) {
-    }
-
-    @Override
-    public void endPrefixMapping(String prefix) {
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import javax.xml.XMLConstants;
+
+import org.apache.tika.sax.ContentHandlerDecorator;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Content handler decorator that downgrades XHTML elements to
+ * old-style HTML elements before passing them on to the decorated
+ * content handler. This downgrading consists of dropping all namespaces
+ * (and namespaced attributes) and uppercasing all element names.
+ * Used by the {@link HtmlParser} to make all incoming HTML look the same.
+ */
+class XHTMLDowngradeHandler extends ContentHandlerDecorator {
+
+    /**
+     * Creates a downgrading decorator for the given content handler.
+     *
+     * @param handler content handler that receives the downgraded events
+     */
+    public XHTMLDowngradeHandler(ContentHandler handler) {
+        super(handler);
+    }
+
+    /**
+     * Forwards the start of an element with an uppercased local name in
+     * the null namespace. Only attributes that are in the null namespace
+     * and that are not namespace declarations are passed on.
+     *
+     * @param uri namespace URI of the element (ignored)
+     * @param localName local name of the element
+     * @param name qualified name of the element (ignored)
+     * @param atts attributes of the element
+     * @throws SAXException if the decorated handler fails
+     */
+    @Override
+    public void startElement(
+            String uri, String localName, String name, Attributes atts)
+            throws SAXException {
+        // Fixed locale: the default locale must not affect element names
+        // (a Turkish default locale would turn "title" into "T\u0130TLE")
+        String upper = localName.toUpperCase(java.util.Locale.ENGLISH);
+
+        AttributesImpl attributes = new AttributesImpl();
+        for (int i = 0; i < atts.getLength(); i++) {
+            String auri = atts.getURI(i);
+            String local = atts.getLocalName(i);
+            String qname = atts.getQName(i);
+            // Keep the attribute only if it has no namespace and is not an
+            // xmlns or xmlns:prefix namespace declaration
+            if (XMLConstants.NULL_NS_URI.equals(auri)
+                    && !local.equals(XMLConstants.XMLNS_ATTRIBUTE)
+                    && !qname.startsWith(XMLConstants.XMLNS_ATTRIBUTE + ":")) {
+                attributes.addAttribute(
+                        auri, local, qname,
+                        atts.getType(i), atts.getValue(i));
+            }
+        }
+
+        super.startElement(XMLConstants.NULL_NS_URI, upper, upper, attributes);
+    }
+
+    /**
+     * Forwards the end of an element with an uppercased local name in
+     * the null namespace, mirroring {@link #startElement}.
+     *
+     * @param uri namespace URI of the element (ignored)
+     * @param localName local name of the element
+     * @param name qualified name of the element (ignored)
+     * @throws SAXException if the decorated handler fails
+     */
+    @Override
+    public void endElement(String uri, String localName, String name)
+            throws SAXException {
+        // Must use the same locale as startElement so that the names match
+        String upper = localName.toUpperCase(java.util.Locale.ENGLISH);
+        super.endElement(XMLConstants.NULL_NS_URI, upper, upper);
+    }
+
+    /**
+     * Drops the prefix mapping; nothing is passed to the decorated handler.
+     */
+    @Override
+    public void startPrefixMapping(String prefix, String uri) {
+        // Intentionally empty: namespaces are dropped by this handler
+    }
+
+    /**
+     * Drops the prefix mapping; nothing is passed to the decorated handler.
+     */
+    @Override
+    public void endPrefixMapping(String prefix) {
+        // Intentionally empty: namespaces are dropped by this handler
+    }
+
+}

Propchange: lucene/tika/trunk/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/tika/trunk/src/main/java/org/apache/tika/parser/image/ImageParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/image/ImageParser.java?rev=721926&r1=721925&r2=721926&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/image/ImageParser.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/image/ImageParser.java Sun Nov 30 15:15:09 2008
@@ -1,58 +1,58 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.image;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Iterator;
-
-import javax.imageio.ImageIO;
-import javax.imageio.ImageReader;
-
-import org.apache.commons.io.input.CloseShieldInputStream;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-public class ImageParser implements Parser {
-
-    public void parse(
-            InputStream stream, ContentHandler handler, Metadata metadata)
-            throws IOException, SAXException, TikaException {
-        String type = metadata.get(Metadata.CONTENT_TYPE);
-        if (type != null) {
-            Iterator<ImageReader> iterator =
-                ImageIO.getImageReadersByMIMEType(type);
-            if (iterator.hasNext()) {
-                ImageReader reader = iterator.next();
-                reader.setInput(ImageIO.createImageInputStream(
-                        new CloseShieldInputStream(stream)));
-                metadata.set("height", Integer.toString(reader.getHeight(0)));
-                metadata.set("width", Integer.toString(reader.getWidth(0)));
-                reader.dispose();
-            }
-        }
-
-        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
-        xhtml.startDocument();
-        xhtml.endDocument();
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.image;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Iterator;
+
+import javax.imageio.ImageIO;
+import javax.imageio.ImageReader;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser that records the dimensions of an image as metadata. When the
+ * metadata names a content type for which an {@link ImageReader} is
+ * available, the height and width of the first image in the stream are
+ * stored under the "height" and "width" metadata keys. The XHTML output
+ * is always an empty document.
+ */
+public class ImageParser implements Parser {
+
+    /**
+     * Extracts the image dimensions (if possible) and emits an empty
+     * XHTML document to the given handler.
+     *
+     * @param stream image stream; it is shielded from being closed here
+     * @param handler handler that receives the XHTML SAX events
+     * @param metadata document metadata; its content type selects the
+     *                 image reader, and it receives the image dimensions
+     * @throws IOException if the image can not be read
+     * @throws SAXException if the XHTML events can not be processed
+     * @throws TikaException declared by the {@link Parser} interface
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler, Metadata metadata)
+            throws IOException, SAXException, TikaException {
+        String type = metadata.get(Metadata.CONTENT_TYPE);
+        if (type != null) {
+            Iterator<ImageReader> iterator =
+                ImageIO.getImageReadersByMIMEType(type);
+            if (iterator.hasNext()) {
+                ImageReader reader = iterator.next();
+                try {
+                    javax.imageio.stream.ImageInputStream input =
+                        ImageIO.createImageInputStream(
+                                new CloseShieldInputStream(stream));
+                    try {
+                        reader.setInput(input);
+                        metadata.set(
+                                "height", Integer.toString(reader.getHeight(0)));
+                        metadata.set(
+                                "width", Integer.toString(reader.getWidth(0)));
+                    } finally {
+                        // Release the image stream and any cache it holds;
+                        // the close shield keeps the caller's stream open
+                        if (input != null) {
+                            input.close();
+                        }
+                    }
+                } finally {
+                    // Always release the reader, even if reading failed
+                    reader.dispose();
+                }
+            }
+        }
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        xhtml.endDocument();
+    }
+
+}

Propchange: lucene/tika/trunk/src/main/java/org/apache/tika/parser/image/ImageParser.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Cell.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Cell.java?rev=721926&r1=721925&r2=721926&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Cell.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Cell.java Sun Nov 30 15:15:09 2008
@@ -1,38 +1,38 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Cell of content. Classes that implement this interface are used by
- * Tika parsers (currently just the MS Excel parser) to keep track of
- * individual pieces of content before they are rendered to the XHTML
- * SAX event stream.
- */
-public interface Cell {
-
-    /**
-     * Renders the content to the given XHTML SAX event stream.
-     *
-     * @param handler
-     * @throws SAXException
-     */
-    void render(XHTMLContentHandler handler) throws SAXException;
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Cell of content. Classes that implement this interface are used by
+ * Tika parsers (currently just the MS Excel parser) to keep track of
+ * individual pieces of content before they are rendered to the XHTML
+ * SAX event stream.
+ */
+public interface Cell {
+
+    /**
+     * Renders the content to the given XHTML SAX event stream.
+     *
+     * @param handler XHTML SAX event stream that the content is rendered to
+     * @throws SAXException if rendering to the event stream fails
+     */
+    void render(XHTMLContentHandler handler) throws SAXException;
+
+}

Propchange: lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Cell.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java?rev=721926&r1=721925&r2=721926&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java Sun Nov 30 15:15:09 2008
@@ -1,37 +1,37 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Cell decorator.
- */
-public class CellDecorator implements Cell {
-
-    private final Cell cell;
-
-    public CellDecorator(Cell cell) {
-        this.cell = cell;
-    }
-
-    public void render(XHTMLContentHandler handler) throws SAXException {
-        cell.render(handler);
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Cell decorator.
+ */
+public class CellDecorator implements Cell {
+
+    private final Cell cell;
+
+    public CellDecorator(Cell cell) {
+        this.cell = cell;
+    }
+
+    public void render(XHTMLContentHandler handler) throws SAXException {
+        cell.render(handler);
+    }
+
+}

Propchange: lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java?rev=721926&r1=721925&r2=721926&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java Sun Nov 30 15:15:09 2008
@@ -1,397 +1,397 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-import java.awt.Point;
-import java.io.IOException;
-import java.io.Serializable;
-import java.util.ArrayList;
-import java.util.Comparator;
-import java.util.List;
-import java.util.Map;
-import java.util.SortedMap;
-import java.util.TreeMap;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
-import org.apache.poi.hssf.eventusermodel.HSSFListener;
-import org.apache.poi.hssf.eventusermodel.HSSFRequest;
-import org.apache.poi.hssf.record.BOFRecord;
-import org.apache.poi.hssf.record.BoundSheetRecord;
-import org.apache.poi.hssf.record.CellValueRecordInterface;
-import org.apache.poi.hssf.record.CountryRecord;
-import org.apache.poi.hssf.record.DateWindow1904Record;
-import org.apache.poi.hssf.record.EOFRecord;
-import org.apache.poi.hssf.record.ExtendedFormatRecord;
-import org.apache.poi.hssf.record.FormatRecord;
-import org.apache.poi.hssf.record.FormulaRecord;
-import org.apache.poi.hssf.record.HyperlinkRecord;
-import org.apache.poi.hssf.record.UnicodeString;
-//import org.apache.poi.hssf.record.HyperlinkRecord;  // FIXME - requires POI release
-import org.apache.poi.hssf.record.LabelRecord;
-import org.apache.poi.hssf.record.LabelSSTRecord;
-import org.apache.poi.hssf.record.NumberRecord;
-import org.apache.poi.hssf.record.RKRecord;
-import org.apache.poi.hssf.record.Record;
-import org.apache.poi.hssf.record.SSTRecord;
-import org.apache.poi.poifs.filesystem.DocumentInputStream;
-import org.apache.poi.poifs.filesystem.POIFSFileSystem;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Excel parser implementation which uses POI's Event API
- * to handle the contents of a Workbook.
- * <p>
- * The Event API uses a much smaller memory footprint than
- * <code>HSSFWorkbook</code> when processing excel files
- * but at the cost of more complexity.
- * <p>
- * With the Event API a <i>listener</i> is registered for
- * specific record types and those records are created,
- * fired off to the listener and then discarded as the stream
- * is being processed.
- *
- * @see org.apache.poi.hssf.eventusermodel.HSSFListener
- * @see <a href="http://poi.apache.org/hssf/how-to.html#event_api">
- * POI Event API How To</a>
- */
-public class ExcelExtractor {
-
-    /** Logging instance */
-    private static final Log log = LogFactory.getLog(ExcelExtractor.class);
-
-    /**
-     * <code>true</code> if the HSSFListener should be registered
-     * to listen for all records or <code>false</code> (the default)
-     * if the listener should be configured to only receive specified
-     * records.
-     */
-    private boolean listenForAllRecords = false;
-
-    /**
-     * Returns <code>true</code> if this parser is configured to listen
-     * for all records instead of just the specified few.
-     */
-    public boolean isListenForAllRecords() {
-        return listenForAllRecords;
-    }
-
-    /**
-     * Specifies whether this parser should to listen for all
-     * records or just for the specified few.
-     * <p>
-     * <strong>Note:</strong> Under normal operation this setting should
-     * be <code>false</code> (the default), but you can experiment with
-     * this setting for testing and debugging purposes.
-     *
-     * @param listenForAllRecords <code>true</code> if the HSSFListener
-     * should be registered to listen for all records or <code>false</code>
-     * if the listener should be configured to only receive specified records.
-     */
-    public void setListenForAllRecords(boolean listenForAllRecords) {
-        this.listenForAllRecords = listenForAllRecords;
-    }
-
-    /**
-     * Extracts text from an Excel Workbook writing the extracted content
-     * to the specified {@link Appendable}.
-     *
-     * @param filesystem POI file system
-     * @throws IOException if an error occurs processing the workbook
-     * or writing the extracted content
-     */
-    protected void parse(POIFSFileSystem filesystem, XHTMLContentHandler xhtml)
-            throws IOException, SAXException {
-        log.debug("Starting listenForAllRecords=" + listenForAllRecords);
-
-        // Set up listener and register the records we want to process
-        TikaHSSFListener listener = new TikaHSSFListener(xhtml);
-        HSSFRequest hssfRequest = new HSSFRequest();
-        if (listenForAllRecords) {
-            hssfRequest.addListenerForAllRecords(listener);
-        } else {
-            hssfRequest.addListener(listener, BOFRecord.sid);
-            hssfRequest.addListener(listener, EOFRecord.sid);
-            hssfRequest.addListener(listener, DateWindow1904Record.sid);
-            hssfRequest.addListener(listener, CountryRecord.sid);
-            hssfRequest.addListener(listener, BoundSheetRecord.sid);
-            hssfRequest.addListener(listener, FormatRecord.sid);
-            hssfRequest.addListener(listener, ExtendedFormatRecord.sid);
-            hssfRequest.addListener(listener, SSTRecord.sid);
-            hssfRequest.addListener(listener, FormulaRecord.sid);
-            hssfRequest.addListener(listener, LabelRecord.sid);
-            hssfRequest.addListener(listener, LabelSSTRecord.sid);
-            hssfRequest.addListener(listener, NumberRecord.sid);
-            hssfRequest.addListener(listener, RKRecord.sid);
-            hssfRequest.addListener(listener, HyperlinkRecord.sid);
-        }
-
-        // Create event factory and process Workbook (fire events)
-        DocumentInputStream documentInputStream = filesystem.createDocumentInputStream("Workbook");
-        HSSFEventFactory eventFactory = new HSSFEventFactory();
-
-        eventFactory.processEvents(hssfRequest, documentInputStream);
-        listener.throwStoredException();
-    }
-
-    // ======================================================================
-
-    /**
-     * HSSF Listener implementation which processes the HSSF records.
-     */
-    private static class TikaHSSFListener implements HSSFListener, Serializable {
-
-        /**
-         * XHTML content handler to which the document content is rendered.
-         */
-        private final XHTMLContentHandler handler;
-
-        /**
-         * Potential exception thrown by the content handler. When set to
-         * non-<code>null</code>, causes all subsequent HSSF records to be
-         * ignored and the stored exception to be thrown when
-         * {@link #throwStoredException()} is invoked.
-         */
-        private SAXException exception = null;
-
-        private SSTRecord sstRecord;
-
-        /**
-         * List of worksheet names.
-         */
-        private List<String> sheetNames = new ArrayList<String>();
-
-        /**
-         * Index of the current worksheet within the workbook.
-         * Used to find the worksheet name in the {@link #sheetNames} list.
-         */
-        private short currentSheetIndex;
-
-        /**
-         * Content of the current worksheet, or <code>null</code> if no
-         * worksheet is currently active.
-         */
-        private SortedMap<Point, Cell> currentSheet = null;
-
-        /**
-         * Contstruct a new listener instance outputting parsed data to
-         * the specified XHTML content handler.
-         *
-         * @param handler Destination to write the parsed output to
-         */
-        private TikaHSSFListener(XHTMLContentHandler handler) {
-            this.handler = handler;
-        }
-
-        /**
-         * Process a HSSF record.
-         *
-         * @param record HSSF Record
-         */
-        public void processRecord(Record record) {
-            if (exception == null) {
-                try {
-                    if (log.isDebugEnabled()) {
-                        log.debug(record.toString());
-                    }
-                    internalProcessRecord(record);
-                } catch (SAXException e) {
-                    exception = e;
-                }
-            }
-        }
-
-        public void throwStoredException() throws SAXException {
-            if (exception != null) {
-                throw exception;
-            }
-        }
-
-        private void internalProcessRecord(Record record) throws SAXException {
-            switch (record.getSid()) {
-            case BOFRecord.sid: // start of workbook, worksheet etc. records
-                BOFRecord bof = (BOFRecord) record;
-                if (bof.getType() == BOFRecord.TYPE_WORKBOOK) {
-                    currentSheetIndex = -1;
-                } else if (bof.getType() == BOFRecord.TYPE_WORKSHEET) {
-                    currentSheetIndex++;
-                    currentSheet =
-                        new TreeMap<Point, Cell>(new PointComparator());
-                }
-                break;
-
-            case EOFRecord.sid: // end of workbook, worksheet etc. records
-                if (currentSheet != null && !currentSheet.isEmpty()) {
-                    processSheet();
-                }
-                currentSheet = null;
-                break;
-
-            case BoundSheetRecord.sid: // Worksheet index record
-                BoundSheetRecord boundSheetRecord = (BoundSheetRecord) record;
-                sheetNames.add(boundSheetRecord.getSheetname());
-                break;
-
-            case SSTRecord.sid: // holds all the strings for LabelSSTRecords
-                sstRecord = (SSTRecord) record;
-                break;
-
-            case FormulaRecord.sid: // Cell value from a formula
-                FormulaRecord formula = (FormulaRecord) record;
-                addCell(record, new NumberCell(formula.getValue()));
-                break;
-
-            case LabelRecord.sid: // strings stored directly in the cell
-                LabelRecord label = (LabelRecord) record;
-                addTextCell(record, label.getValue());
-                break;
-
-            case LabelSSTRecord.sid: // Ref. a string in the shared string table
-                LabelSSTRecord sst = (LabelSSTRecord) record;
-                UnicodeString unicode = sstRecord.getString(sst.getSSTIndex());
-                addTextCell(record, unicode.getString());
-                break;
-
-            case NumberRecord.sid: // Contains a numeric cell value
-                NumberRecord number = (NumberRecord) record;
-                addCell(record, new NumberCell(number.getValue()));
-                break;
-
-            case RKRecord.sid: // Excel internal number record
-                RKRecord rk = (RKRecord) record;
-                addCell(record, new NumberCell(rk.getRKNumber()));
-                break;
-
-            case HyperlinkRecord.sid: // holds a URL associated with a cell
-                if (currentSheet != null) {
-                    HyperlinkRecord link = (HyperlinkRecord) record;
-                    Point point =
-                        new Point(link.getFirstColumn(), link.getFirstRow());
-                    Cell cell = currentSheet.get(point);
-                    if (cell != null) {
-                        addCell(record, new LinkedCell(cell, link.getAddress()));
-                    }
-                }
-                break;
-            }
-        }
-
-        /**
-         * Adds the given cell (unless <code>null</code>) to the current
-         * worksheet (if any) at the position (if any) of the given record.
-         *
-         * @param record record that holds the cell value
-         * @param cell cell value (or <code>null</code>)
-         */
-        private void addCell(Record record, Cell cell) {
-            if (currentSheet == null) {
-                // Ignore cells outside sheets
-            } else if (cell == null) {
-                // Ignore empty cells
-            } else if (record instanceof CellValueRecordInterface) {
-                CellValueRecordInterface value =
-                    (CellValueRecordInterface) record;
-                Point point = new Point(value.getColumn(), value.getRow());
-                currentSheet.put(point, cell);
-            }
-        }
-
-        /**
-         * Adds a text cell with the given text comment. The given text
-         * is trimmed, and ignored if <code>null</code> or empty.
-         *
-         * @param record record that holds the text value
-         * @param text text content, may be <code>null</code>
-         */
-        private void addTextCell(Record record, String text) {
-            if (text != null) {
-                text = text.trim();
-                if (text.length() > 0) {
-                    addCell(record, new TextCell(text));
-                }
-            }
-        }
-
-        /**
-         * Process an excel sheet.
-         *
-         * @throws SAXException if an error occurs
-         */
-        private void processSheet() throws SAXException {
-            // Sheet Start
-            handler.startElement("div", "class", "page");
-            if (currentSheetIndex < sheetNames.size()) {
-                handler.element("h1", sheetNames.get(currentSheetIndex));
-            }
-            handler.characters("\n");
-            handler.startElement("table");
-            handler.startElement("tbody");
-
-            // Process Rows
-            int currentRow = 1;
-            int currentColumn = 1;
-            handler.startElement("tr");
-            handler.startElement("td");
-            for (Map.Entry<Point, Cell> entry : currentSheet.entrySet()) {
-                while (currentRow < entry.getKey().y) {
-                    handler.endElement("td");
-                    handler.endElement("tr");
-                    handler.characters("\n");
-                    handler.startElement("tr");
-                    handler.startElement("td");
-                    currentRow++;
-                    currentColumn = 1;
-                }
-
-                while (currentColumn < entry.getKey().x) {
-                    handler.endElement("td");
-                    handler.characters("\t");
-                    handler.startElement("td");
-                    currentColumn++;
-                }
-
-                entry.getValue().render(handler);
-            }
-            handler.endElement("td");
-            handler.endElement("tr");
-            
-            // Sheet End
-            handler.endElement("tbody");
-            handler.endElement("table");
-            handler.endElement("div");
-            handler.characters("\n");
-        }
-    }
-
-    /**
-     * Utility comparator for points.
-     */
-    private static class PointComparator implements Comparator<Point> {
-
-        public int compare(Point a, Point b) {
-            int diff = a.y - b.y;
-            if (diff == 0) {
-                diff = a.x - b.x;
-            }
-            return diff;
-        }
-
-    }
-
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.awt.Point;
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Map;
+import java.util.SortedMap;
+import java.util.TreeMap;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
+import org.apache.poi.hssf.eventusermodel.HSSFListener;
+import org.apache.poi.hssf.eventusermodel.HSSFRequest;
+import org.apache.poi.hssf.record.BOFRecord;
+import org.apache.poi.hssf.record.BoundSheetRecord;
+import org.apache.poi.hssf.record.CellValueRecordInterface;
+import org.apache.poi.hssf.record.CountryRecord;
+import org.apache.poi.hssf.record.DateWindow1904Record;
+import org.apache.poi.hssf.record.EOFRecord;
+import org.apache.poi.hssf.record.ExtendedFormatRecord;
+import org.apache.poi.hssf.record.FormatRecord;
+import org.apache.poi.hssf.record.FormulaRecord;
+import org.apache.poi.hssf.record.HyperlinkRecord;
+import org.apache.poi.hssf.record.UnicodeString;
+//import org.apache.poi.hssf.record.HyperlinkRecord;  // FIXME - requires POI release
+import org.apache.poi.hssf.record.LabelRecord;
+import org.apache.poi.hssf.record.LabelSSTRecord;
+import org.apache.poi.hssf.record.NumberRecord;
+import org.apache.poi.hssf.record.RKRecord;
+import org.apache.poi.hssf.record.Record;
+import org.apache.poi.hssf.record.SSTRecord;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Excel parser implementation which uses POI's Event API
+ * to handle the contents of a Workbook.
+ * <p>
+ * The Event API uses a much smaller memory footprint than
+ * <code>HSSFWorkbook</code> when processing excel files
+ * but at the cost of more complexity.
+ * <p>
+ * With the Event API a <i>listener</i> is registered for
+ * specific record types and those records are created,
+ * fired off to the listener and then discarded as the stream
+ * is being processed.
+ *
+ * @see org.apache.poi.hssf.eventusermodel.HSSFListener
+ * @see <a href="http://poi.apache.org/hssf/how-to.html#event_api">
+ * POI Event API How To</a>
+ */
+public class ExcelExtractor {
+
+    /** Logging instance */
+    private static final Log log = LogFactory.getLog(ExcelExtractor.class);
+
+    /**
+     * <code>true</code> if the HSSFListener should be registered
+     * to listen for all records or <code>false</code> (the default)
+     * if the listener should be configured to only receive specified
+     * records.
+     */
+    private boolean listenForAllRecords = false;
+
+    /**
+     * Returns <code>true</code> if this parser is configured to listen
+     * for all records instead of just the specified few.
+     */
+    public boolean isListenForAllRecords() {
+        return listenForAllRecords;
+    }
+
+    /**
+     * Specifies whether this parser should listen for all
+     * records or just for the specified few.
+     * <p>
+     * <strong>Note:</strong> Under normal operation this setting should
+     * be <code>false</code> (the default), but you can experiment with
+     * this setting for testing and debugging purposes.
+     *
+     * @param listenForAllRecords <code>true</code> if the HSSFListener
+     * should be registered to listen for all records or <code>false</code>
+     * if the listener should be configured to only receive specified records.
+     */
+    public void setListenForAllRecords(boolean listenForAllRecords) {
+        this.listenForAllRecords = listenForAllRecords;
+    }
+
+    /**
+     * Extracts text from an Excel Workbook, writing the extracted content
+     * to the specified {@link XHTMLContentHandler}.
+     *
+     * @param filesystem POI file system
+     * @throws IOException if an error occurs processing the workbook
+     * or writing the extracted content
+     */
+    protected void parse(POIFSFileSystem filesystem, XHTMLContentHandler xhtml)
+            throws IOException, SAXException {
+        log.debug("Starting listenForAllRecords=" + listenForAllRecords);
+
+        // Set up listener and register the records we want to process
+        TikaHSSFListener listener = new TikaHSSFListener(xhtml);
+        HSSFRequest hssfRequest = new HSSFRequest();
+        if (listenForAllRecords) {
+            hssfRequest.addListenerForAllRecords(listener);
+        } else {
+            hssfRequest.addListener(listener, BOFRecord.sid);
+            hssfRequest.addListener(listener, EOFRecord.sid);
+            hssfRequest.addListener(listener, DateWindow1904Record.sid);
+            hssfRequest.addListener(listener, CountryRecord.sid);
+            hssfRequest.addListener(listener, BoundSheetRecord.sid);
+            hssfRequest.addListener(listener, FormatRecord.sid);
+            hssfRequest.addListener(listener, ExtendedFormatRecord.sid);
+            hssfRequest.addListener(listener, SSTRecord.sid);
+            hssfRequest.addListener(listener, FormulaRecord.sid);
+            hssfRequest.addListener(listener, LabelRecord.sid);
+            hssfRequest.addListener(listener, LabelSSTRecord.sid);
+            hssfRequest.addListener(listener, NumberRecord.sid);
+            hssfRequest.addListener(listener, RKRecord.sid);
+            hssfRequest.addListener(listener, HyperlinkRecord.sid);
+        }
+
+        // Create event factory and process Workbook (fire events)
+        DocumentInputStream documentInputStream = filesystem.createDocumentInputStream("Workbook");
+        HSSFEventFactory eventFactory = new HSSFEventFactory();
+
+        eventFactory.processEvents(hssfRequest, documentInputStream);
+        listener.throwStoredException();
+    }
+
+    // ======================================================================
+
+    /**
+     * HSSF Listener implementation which processes the HSSF records.
+     */
+    private static class TikaHSSFListener implements HSSFListener, Serializable {
+
+        /**
+         * XHTML content handler to which the document content is rendered.
+         */
+        private final XHTMLContentHandler handler;
+
+        /**
+         * Potential exception thrown by the content handler. When set to
+         * non-<code>null</code>, causes all subsequent HSSF records to be
+         * ignored and the stored exception to be thrown when
+         * {@link #throwStoredException()} is invoked.
+         */
+        private SAXException exception = null;
+
+        private SSTRecord sstRecord;
+
+        /**
+         * List of worksheet names.
+         */
+        private List<String> sheetNames = new ArrayList<String>();
+
+        /**
+         * Index of the current worksheet within the workbook.
+         * Used to find the worksheet name in the {@link #sheetNames} list.
+         */
+        private short currentSheetIndex;
+
+        /**
+         * Content of the current worksheet, or <code>null</code> if no
+         * worksheet is currently active.
+         */
+        private SortedMap<Point, Cell> currentSheet = null;
+
+        /**
+         * Construct a new listener instance outputting parsed data to
+         * the specified XHTML content handler.
+         *
+         * @param handler Destination to write the parsed output to
+         */
+        private TikaHSSFListener(XHTMLContentHandler handler) {
+            this.handler = handler;
+        }
+
+        /**
+         * Process a HSSF record.
+         *
+         * @param record HSSF Record
+         */
+        public void processRecord(Record record) {
+            if (exception == null) {
+                try {
+                    if (log.isDebugEnabled()) {
+                        log.debug(record.toString());
+                    }
+                    internalProcessRecord(record);
+                } catch (SAXException e) {
+                    exception = e;
+                }
+            }
+        }
+
+        public void throwStoredException() throws SAXException {
+            if (exception != null) {
+                throw exception;
+            }
+        }
+
+        private void internalProcessRecord(Record record) throws SAXException {
+            switch (record.getSid()) {
+            case BOFRecord.sid: // start of workbook, worksheet etc. records
+                BOFRecord bof = (BOFRecord) record;
+                if (bof.getType() == BOFRecord.TYPE_WORKBOOK) {
+                    currentSheetIndex = -1;
+                } else if (bof.getType() == BOFRecord.TYPE_WORKSHEET) {
+                    currentSheetIndex++;
+                    currentSheet =
+                        new TreeMap<Point, Cell>(new PointComparator());
+                }
+                break;
+
+            case EOFRecord.sid: // end of workbook, worksheet etc. records
+                if (currentSheet != null && !currentSheet.isEmpty()) {
+                    processSheet();
+                }
+                currentSheet = null;
+                break;
+
+            case BoundSheetRecord.sid: // Worksheet index record
+                BoundSheetRecord boundSheetRecord = (BoundSheetRecord) record;
+                sheetNames.add(boundSheetRecord.getSheetname());
+                break;
+
+            case SSTRecord.sid: // holds all the strings for LabelSSTRecords
+                sstRecord = (SSTRecord) record;
+                break;
+
+            case FormulaRecord.sid: // Cell value from a formula
+                FormulaRecord formula = (FormulaRecord) record;
+                addCell(record, new NumberCell(formula.getValue()));
+                break;
+
+            case LabelRecord.sid: // strings stored directly in the cell
+                LabelRecord label = (LabelRecord) record;
+                addTextCell(record, label.getValue());
+                break;
+
+            case LabelSSTRecord.sid: // Ref. a string in the shared string table
+                LabelSSTRecord sst = (LabelSSTRecord) record;
+                UnicodeString unicode = sstRecord.getString(sst.getSSTIndex());
+                addTextCell(record, unicode.getString());
+                break;
+
+            case NumberRecord.sid: // Contains a numeric cell value
+                NumberRecord number = (NumberRecord) record;
+                addCell(record, new NumberCell(number.getValue()));
+                break;
+
+            case RKRecord.sid: // Excel internal number record
+                RKRecord rk = (RKRecord) record;
+                addCell(record, new NumberCell(rk.getRKNumber()));
+                break;
+
+            case HyperlinkRecord.sid: // holds a URL associated with a cell
+                if (currentSheet != null) {
+                    HyperlinkRecord link = (HyperlinkRecord) record;
+                    Point point =
+                        new Point(link.getFirstColumn(), link.getFirstRow());
+                    Cell cell = currentSheet.get(point);
+                    if (cell != null) {
+                        addCell(record, new LinkedCell(cell, link.getAddress()));
+                    }
+                }
+                break;
+            }
+        }
+
+        /**
+         * Adds the given cell (unless <code>null</code>) to the current
+         * worksheet (if any) at the position (if any) of the given record.
+         *
+         * @param record record that holds the cell value
+         * @param cell cell value (or <code>null</code>)
+         */
+        private void addCell(Record record, Cell cell) {
+            if (currentSheet == null) {
+                // Ignore cells outside sheets
+            } else if (cell == null) {
+                // Ignore empty cells
+            } else if (record instanceof CellValueRecordInterface) {
+                CellValueRecordInterface value =
+                    (CellValueRecordInterface) record;
+                Point point = new Point(value.getColumn(), value.getRow());
+                currentSheet.put(point, cell);
+            }
+        }
+
+        /**
+         * Adds a text cell with the given text content. The given text
+         * is trimmed, and ignored if <code>null</code> or empty.
+         *
+         * @param record record that holds the text value
+         * @param text text content, may be <code>null</code>
+         */
+        private void addTextCell(Record record, String text) {
+            if (text != null) {
+                text = text.trim();
+                if (text.length() > 0) {
+                    addCell(record, new TextCell(text));
+                }
+            }
+        }
+
+        /**
+         * Process an excel sheet.
+         *
+         * @throws SAXException if an error occurs
+         */
+        private void processSheet() throws SAXException {
+            // Sheet Start
+            handler.startElement("div", "class", "page");
+            if (currentSheetIndex < sheetNames.size()) {
+                handler.element("h1", sheetNames.get(currentSheetIndex));
+            }
+            handler.characters("\n");
+            handler.startElement("table");
+            handler.startElement("tbody");
+
+            // Process Rows
+            int currentRow = 1;
+            int currentColumn = 1;
+            handler.startElement("tr");
+            handler.startElement("td");
+            for (Map.Entry<Point, Cell> entry : currentSheet.entrySet()) {
+                while (currentRow < entry.getKey().y) {
+                    handler.endElement("td");
+                    handler.endElement("tr");
+                    handler.characters("\n");
+                    handler.startElement("tr");
+                    handler.startElement("td");
+                    currentRow++;
+                    currentColumn = 1;
+                }
+
+                while (currentColumn < entry.getKey().x) {
+                    handler.endElement("td");
+                    handler.characters("\t");
+                    handler.startElement("td");
+                    currentColumn++;
+                }
+
+                entry.getValue().render(handler);
+            }
+            handler.endElement("td");
+            handler.endElement("tr");
+            
+            // Sheet End
+            handler.endElement("tbody");
+            handler.endElement("table");
+            handler.endElement("div");
+            handler.characters("\n");
+        }
+    }
+
+    /**
+     * Utility comparator for points.
+     */
+    private static class PointComparator implements Comparator<Point> {
+
+        public int compare(Point a, Point b) {
+            int diff = a.y - b.y;
+            if (diff == 0) {
+                diff = a.x - b.x;
+            }
+            return diff;
+        }
+
+    }
+
+}

Propchange: lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/LinkedCell.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/LinkedCell.java?rev=721926&r1=721925&r2=721926&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/LinkedCell.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/LinkedCell.java Sun Nov 30 15:15:09 2008
@@ -1,40 +1,40 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Linked cell. This class decorates another content cell with a hyperlink.
- */
-public class LinkedCell extends CellDecorator {
-
-    private final String link;
-
-    public LinkedCell(Cell cell, String link) {
-        super(cell);
-        this.link = link;
-    }
-
-    public void render(XHTMLContentHandler handler) throws SAXException {
-        handler.startElement("a", "href", link);
-        super.render(handler);
-        handler.endElement("a");
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Linked cell. This class decorates another content cell with a hyperlink.
+ */
+public class LinkedCell extends CellDecorator {
+
+    private final String link;
+
+    public LinkedCell(Cell cell, String link) {
+        super(cell);
+        this.link = link;
+    }
+
+    public void render(XHTMLContentHandler handler) throws SAXException {
+        handler.startElement("a", "href", link);
+        super.render(handler);
+        handler.endElement("a");
+    }
+
+}

Propchange: lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/LinkedCell.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/NumberCell.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/NumberCell.java?rev=721926&r1=721925&r2=721926&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/NumberCell.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/NumberCell.java Sun Nov 30 15:15:09 2008
@@ -1,46 +1,46 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-import java.text.NumberFormat;
-
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Number cell.
- */
-public class NumberCell implements Cell {
-
-    private final double number;
-
-    private final NumberFormat format;
-
-    public NumberCell(double number, NumberFormat format) {
-        this.number = number;
-        this.format = format;
-    }
-
-    public NumberCell(double number) {
-        this(number, NumberFormat.getInstance());
-    }
-
-    public void render(XHTMLContentHandler handler) throws SAXException {
-        handler.characters(format.format(number));
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.text.NumberFormat;
+
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Number cell.
+ */
+public class NumberCell implements Cell {
+
+    private final double number;
+
+    private final NumberFormat format;
+
+    public NumberCell(double number, NumberFormat format) {
+        this.number = number;
+        this.format = format;
+    }
+
+    public NumberCell(double number) {
+        this(number, NumberFormat.getInstance());
+    }
+
+    public void render(XHTMLContentHandler handler) throws SAXException {
+        handler.characters(format.format(number));
+    }
+
+}

Propchange: lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/NumberCell.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java?rev=721926&r1=721925&r2=721926&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java Sun Nov 30 15:15:09 2008
@@ -1,97 +1,97 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-import java.io.IOException;
-
-import org.apache.poi.hsmf.datatypes.Chunks;
-import org.apache.poi.hsmf.datatypes.StringChunk;
-import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
-import org.apache.poi.hsmf.parsers.POIFSChunkParser;
-import org.apache.poi.poifs.filesystem.POIFSFileSystem;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Outlook Message Parser.
- */
-class OutlookExtractor {
-
-    private static final Chunks CHUNKS = Chunks.getInstance();
-
-    private final POIFSChunkParser parser;
-
-    public OutlookExtractor(POIFSFileSystem filesystem) throws TikaException {
-        try {
-            this.parser = new POIFSChunkParser(filesystem);
-        } catch (IOException e) {
-            throw new TikaException("Failed to parse Outlook chunks", e);
-        }
-    }
-
-    public void parse(XHTMLContentHandler xhtml, Metadata metadata)
-            throws TikaException, SAXException {
-        String subject = getChunk(CHUNKS.subjectChunk);
-        String from = getChunk(CHUNKS.displayFromChunk);
-
-        metadata.set(Metadata.AUTHOR, from);
-        metadata.set(Metadata.TITLE, subject);
-        metadata.set(Metadata.SUBJECT, getChunk(CHUNKS.conversationTopic));
-
-        xhtml.element("h1", subject);
-        xhtml.characters("\n");
-
-        xhtml.startElement("dl");
-        header(xhtml, "From", from);
-        header(xhtml, "To", getChunk(CHUNKS.displayToChunk));
-        header(xhtml, "Cc", getChunk(CHUNKS.displayCCChunk));
-        header(xhtml, "Bcc", getChunk(CHUNKS.displayBCCChunk));
-        xhtml.endElement("dl");
-        xhtml.characters("\n");
-
-        xhtml.element("p", getChunk(CHUNKS.textBodyChunk));
-    }
-
-    private void header(XHTMLContentHandler xhtml, String key, String value)
-            throws SAXException {
-        if (value.length() > 0) {
-            xhtml.element("dt", key);
-            xhtml.characters("\t");
-            xhtml.element("dd", value);
-            xhtml.characters("\n");
-        }
-    }
-
-    /**
-     * Returns the content of the identified string chunk in the
-     * current document. Returns the empty string if the identified
-     * chunk does not exist in the current document.
-     *
-     * @param chunk string chunk identifier
-     * @return content of the identified chunk, or the empty string
-     */
-    private String getChunk(StringChunk chunk) {
-        try {
-            return parser.getDocumentNode(chunk).toString();
-        } catch (ChunkNotFoundException e) {
-            return "";
-        }
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.IOException;
+
+import org.apache.poi.hsmf.datatypes.Chunks;
+import org.apache.poi.hsmf.datatypes.StringChunk;
+import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
+import org.apache.poi.hsmf.parsers.POIFSChunkParser;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Outlook Message Parser.
+ */
+class OutlookExtractor {
+
+    private static final Chunks CHUNKS = Chunks.getInstance();
+
+    private final POIFSChunkParser parser;
+
+    public OutlookExtractor(POIFSFileSystem filesystem) throws TikaException {
+        try {
+            this.parser = new POIFSChunkParser(filesystem);
+        } catch (IOException e) {
+            throw new TikaException("Failed to parse Outlook chunks", e);
+        }
+    }
+
+    public void parse(XHTMLContentHandler xhtml, Metadata metadata)
+            throws TikaException, SAXException {
+        String subject = getChunk(CHUNKS.subjectChunk);
+        String from = getChunk(CHUNKS.displayFromChunk);
+
+        metadata.set(Metadata.AUTHOR, from);
+        metadata.set(Metadata.TITLE, subject);
+        metadata.set(Metadata.SUBJECT, getChunk(CHUNKS.conversationTopic));
+
+        xhtml.element("h1", subject);
+        xhtml.characters("\n");
+
+        xhtml.startElement("dl");
+        header(xhtml, "From", from);
+        header(xhtml, "To", getChunk(CHUNKS.displayToChunk));
+        header(xhtml, "Cc", getChunk(CHUNKS.displayCCChunk));
+        header(xhtml, "Bcc", getChunk(CHUNKS.displayBCCChunk));
+        xhtml.endElement("dl");
+        xhtml.characters("\n");
+
+        xhtml.element("p", getChunk(CHUNKS.textBodyChunk));
+    }
+
+    private void header(XHTMLContentHandler xhtml, String key, String value)
+            throws SAXException {
+        if (value.length() > 0) {
+            xhtml.element("dt", key);
+            xhtml.characters("\t");
+            xhtml.element("dd", value);
+            xhtml.characters("\n");
+        }
+    }
+
+    /**
+     * Returns the content of the identified string chunk in the
+     * current document. Returns the empty string if the identified
+     * chunk does not exist in the current document.
+     *
+     * @param chunk string chunk identifier
+     * @return content of the identified chunk, or the empty string
+     */
+    private String getChunk(StringChunk chunk) {
+        try {
+            return parser.getDocumentNode(chunk).toString();
+        } catch (ChunkNotFoundException e) {
+            return "";
+        }
+    }
+
+}

Propchange: lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/TextCell.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/TextCell.java?rev=721926&r1=721925&r2=721926&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/TextCell.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/TextCell.java Sun Nov 30 15:15:09 2008
@@ -1,37 +1,37 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Text cell.
- */
-public class TextCell implements Cell {
-
-    private final String text;
-
-    public TextCell(String text) {
-        this.text = text;
-    }
-
-    public void render(XHTMLContentHandler handler) throws SAXException {
-        handler.characters(text);
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Text cell.
+ */
+public class TextCell implements Cell {
+
+    private final String text;
+
+    public TextCell(String text) {
+        this.text = text;
+    }
+
+    public void render(XHTMLContentHandler handler) throws SAXException {
+        handler.characters(text);
+    }
+
+}

Propchange: lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/TextCell.java
------------------------------------------------------------------------------
    svn:eol-style = native