You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2008/12/01 00:15:11 UTC
svn commit: r721926 [2/5] - in /lucene/tika/trunk/src:
main/java/org/apache/tika/cli/ main/java/org/apache/tika/gui/
main/java/org/apache/tika/mime/ main/java/org/apache/tika/parser/
main/java/org/apache/tika/parser/audio/ main/java/org/apache/tika/par...
Modified: lucene/tika/trunk/src/main/java/org/apache/tika/parser/ParsingReader.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/ParsingReader.java?rev=721926&r1=721925&r2=721926&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/ParsingReader.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/ParsingReader.java Sun Nov 30 15:15:09 2008
@@ -1,219 +1,219 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser;
-
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.PipedReader;
-import java.io.PipedWriter;
-import java.io.Reader;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.sax.BodyContentHandler;
-import org.xml.sax.ContentHandler;
-
-/**
- * Reader for the text content from a given binary stream. This class
- * starts a background thread and uses a {@link Parser}
- * ({@link AutoDetectParser) by default) to parse the text content from
- * a given input stream. The {@link BodyContentHandler} class and a pipe
- * is used to convert the push-based SAX event stream to the pull-based
- * character stream defined by the {@link Reader} interface.
- *
- * @since Apache Tika 0.2
- */
-public class ParsingReader extends Reader {
-
- /**
- * Parser instance used for parsing the given binary stream.
- */
- private final Parser parser;
-
- /**
- * Read end of the pipe.
- */
- private final PipedReader reader;
-
- /**
- * Write end of the pipe.
- */
- private final PipedWriter writer;
-
- /**
- * The binary stream being parsed.
- */
- private final InputStream stream;
-
- /**
- * Metadata associated with the document being parsed.
- */
- private final Metadata metadata;
-
- /**
- * An exception (if any) thrown by the parsing thread.
- */
- private Throwable throwable;
-
- /**
- * Utility method that returns a {@link Metadata} instance
- * for a document with the given name.
- *
- * @param name resource name (or <code>null</code>)
- * @return metadata instance
- */
- private static Metadata getMetadata(String name) {
- Metadata metadata = new Metadata();
- if (name != null && name.length() > 0) {
- metadata.set(Metadata.RESOURCE_NAME_KEY, name);
- }
- return metadata;
- }
-
- /**
- * Creates a reader for the text content of the given binary stream.
- *
- * @param stream binary stream
- */
- public ParsingReader(InputStream stream) {
- this(new AutoDetectParser(), stream, new Metadata());
- }
-
- /**
- * Creates a reader for the text content of the given binary stream
- * with the given name.
- *
- * @param stream binary stream
- * @param name document name
- */
- public ParsingReader(InputStream stream, String name) {
- this(new AutoDetectParser(), stream, getMetadata(name));
- }
-
- /**
- * Creates a reader for the text content of the given file.
- *
- * @param file file
- */
- public ParsingReader(File file) throws FileNotFoundException {
- this(new FileInputStream(file), file.getName());
- }
-
- /**
- * Creates a reader for the text content of the given binary stream
- * with the given document metadata. The given parser is used for
- * parsing.
- *
- * @param parser parser instance
- * @param stream binary stream
- * @param metadata document metadata
- */
- public ParsingReader(Parser parser, InputStream stream, Metadata metadata) {
- this.parser = parser;
- this.reader = new PipedReader();
- try {
- this.writer = new PipedWriter(reader);
- } catch (IOException e) {
- throw new IllegalStateException(e); // Should never happen
- }
- this.stream = stream;
- this.metadata = metadata;
-
- String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
- if (name != null) {
- name = "Apache Tika: " + name;
- } else {
- name = "Apache Tika";
- }
- new Thread(new ParsingThread(), name).start();
- }
-
- /**
- * The background parsing thread.
- */
- private class ParsingThread implements Runnable {
-
- /**
- * Parses the given binary stream and writes the text content
- * to the write end of the pipe. Potential exceptions (including
- * the one caused if the read end is closed unexpectedly) are
- * stored before the input stream is closed and processing is stopped.
- */
- public void run() {
- try {
- ContentHandler handler = new BodyContentHandler(writer);
- parser.parse(stream, handler, metadata);
- } catch (Throwable t) {
- throwable = t;
- }
-
- try {
- stream.close();
- } catch (Throwable t) {
- if (throwable == null) {
- throwable = t;
- }
- }
-
- try {
- writer.close();
- } catch (Throwable t) {
- if (throwable == null) {
- throwable = t;
- }
- }
- }
-
- }
-
- /**
- * Reads parsed text from the pipe connected to the parsing thread.
- * Fails if the parsing thread has thrown an exception.
- *
- * @param cbuff character buffer
- * @param off start offset within the buffer
- * @param len maximum number of characters to read
- * @throws IOException if the parsing thread has failed or
- * if for some reason the pipe does not work properly
- */
- @Override
- public int read(char[] cbuf, int off, int len) throws IOException {
- if (throwable instanceof IOException) {
- throw (IOException) throwable;
- } else if (throwable != null) {
- IOException exception = new IOException("");
- exception.initCause(throwable);
- throw exception;
- }
- return reader.read(cbuf, off, len);
- }
-
- /**
- * Closes the read end of the pipe. If the parsing thread is still
- * running, next write to the pipe will fail and cause the thread
- * to stop. Thus there is no need to explicitly terminate the thread.
- *
- * @throws IOException if the pipe can not be closed
- */
- @Override
- public void close() throws IOException {
- reader.close();
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PipedReader;
+import java.io.PipedWriter;
+import java.io.Reader;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Reader for the text content from a given binary stream. This class
+ * starts a background thread and uses a {@link Parser}
+ * ({@link AutoDetectParser} by default) to parse the text content from
+ * a given input stream. The {@link BodyContentHandler} class and a pipe
+ * is used to convert the push-based SAX event stream to the pull-based
+ * character stream defined by the {@link Reader} interface.
+ *
+ * @since Apache Tika 0.2
+ */
+public class ParsingReader extends Reader {
+
+ /**
+ * Parser instance used for parsing the given binary stream.
+ */
+ private final Parser parser;
+
+ /**
+ * Read end of the pipe.
+ */
+ private final PipedReader reader;
+
+ /**
+ * Write end of the pipe.
+ */
+ private final PipedWriter writer;
+
+ /**
+ * The binary stream being parsed.
+ */
+ private final InputStream stream;
+
+ /**
+ * Metadata associated with the document being parsed.
+ */
+ private final Metadata metadata;
+
+ /**
+ * An exception (if any) thrown by the parsing thread.
+ */
+ private Throwable throwable;
+
+ /**
+ * Utility method that returns a {@link Metadata} instance
+ * for a document with the given name.
+ *
+ * @param name resource name (or <code>null</code>)
+ * @return metadata instance
+ */
+ private static Metadata getMetadata(String name) {
+ Metadata metadata = new Metadata();
+ if (name != null && name.length() > 0) {
+ metadata.set(Metadata.RESOURCE_NAME_KEY, name);
+ }
+ return metadata;
+ }
+
+ /**
+ * Creates a reader for the text content of the given binary stream.
+ *
+ * @param stream binary stream
+ */
+ public ParsingReader(InputStream stream) {
+ this(new AutoDetectParser(), stream, new Metadata());
+ }
+
+ /**
+ * Creates a reader for the text content of the given binary stream
+ * with the given name.
+ *
+ * @param stream binary stream
+ * @param name document name
+ */
+ public ParsingReader(InputStream stream, String name) {
+ this(new AutoDetectParser(), stream, getMetadata(name));
+ }
+
+ /**
+ * Creates a reader for the text content of the given file.
+ *
+ * @param file file
+ */
+ public ParsingReader(File file) throws FileNotFoundException {
+ this(new FileInputStream(file), file.getName());
+ }
+
+ /**
+ * Creates a reader for the text content of the given binary stream
+ * with the given document metadata. The given parser is used for
+ * parsing.
+ *
+ * @param parser parser instance
+ * @param stream binary stream
+ * @param metadata document metadata
+ */
+ public ParsingReader(Parser parser, InputStream stream, Metadata metadata) {
+ this.parser = parser;
+ this.reader = new PipedReader();
+ try {
+ this.writer = new PipedWriter(reader);
+ } catch (IOException e) {
+ throw new IllegalStateException(e); // Should never happen
+ }
+ this.stream = stream;
+ this.metadata = metadata;
+
+ String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
+ if (name != null) {
+ name = "Apache Tika: " + name;
+ } else {
+ name = "Apache Tika";
+ }
+ new Thread(new ParsingThread(), name).start();
+ }
+
+ /**
+ * The background parsing thread.
+ */
+ private class ParsingThread implements Runnable {
+
+ /**
+ * Parses the given binary stream and writes the text content
+ * to the write end of the pipe. Potential exceptions (including
+ * the one caused if the read end is closed unexpectedly) are
+ * stored before the input stream is closed and processing is stopped.
+ */
+ public void run() {
+ try {
+ ContentHandler handler = new BodyContentHandler(writer);
+ parser.parse(stream, handler, metadata);
+ } catch (Throwable t) {
+ throwable = t;
+ }
+
+ try {
+ stream.close();
+ } catch (Throwable t) {
+ if (throwable == null) {
+ throwable = t;
+ }
+ }
+
+ try {
+ writer.close();
+ } catch (Throwable t) {
+ if (throwable == null) {
+ throwable = t;
+ }
+ }
+ }
+
+ }
+
+ /**
+ * Reads parsed text from the pipe connected to the parsing thread.
+ * Fails if the parsing thread has thrown an exception.
+ *
+ * @param cbuf character buffer
+ * @param off start offset within the buffer
+ * @param len maximum number of characters to read
+ * @throws IOException if the parsing thread has failed or
+ * if for some reason the pipe does not work properly
+ */
+ @Override
+ public int read(char[] cbuf, int off, int len) throws IOException {
+ if (throwable instanceof IOException) {
+ throw (IOException) throwable;
+ } else if (throwable != null) {
+ IOException exception = new IOException("");
+ exception.initCause(throwable);
+ throw exception;
+ }
+ return reader.read(cbuf, off, len);
+ }
+
+ /**
+ * Closes the read end of the pipe. If the parsing thread is still
+ * running, next write to the pipe will fail and cause the thread
+ * to stop. Thus there is no need to explicitly terminate the thread.
+ *
+ * @throws IOException if the pipe can not be closed
+ */
+ @Override
+ public void close() throws IOException {
+ reader.close();
+ }
+
+}
Propchange: lucene/tika/trunk/src/main/java/org/apache/tika/parser/ParsingReader.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/tika/trunk/src/main/java/org/apache/tika/parser/audio/AudioParser.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/tika/trunk/src/main/java/org/apache/tika/parser/audio/MidiParser.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/tika/trunk/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java?rev=721926&r1=721925&r2=721926&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java Sun Nov 30 15:15:09 2008
@@ -1,77 +1,77 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.html;
-
-import javax.xml.XMLConstants;
-
-import org.apache.tika.sax.ContentHandlerDecorator;
-import org.xml.sax.Attributes;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.AttributesImpl;
-
-/**
- * Content handler decorator that downgrades XHTML elements to
- * old-style HTML elements before passing them on to the decorated
- * content handler. This downgrading consists of dropping all namespaces
- * (and namespaced attributes) and uppercasing all element names.
- * Used by the {@link HtmlParser} to make all incoming HTML look the same.
- */
-class XHTMLDowngradeHandler extends ContentHandlerDecorator {
-
- public XHTMLDowngradeHandler(ContentHandler handler) {
- super(handler);
- }
-
- @Override
- public void startElement(
- String uri, String localName, String name, Attributes atts)
- throws SAXException {
- String upper = localName.toUpperCase();
-
- AttributesImpl attributes = new AttributesImpl();
- for (int i = 0; i < atts.getLength(); i++) {
- String local = atts.getLocalName(i);
- String qname = atts.getQName(i);
- if (!XMLConstants.NULL_NS_URI.equals(atts.getURI(i).length())
- && !local.equals(XMLConstants.XMLNS_ATTRIBUTE)
- && !qname.startsWith(XMLConstants.XMLNS_ATTRIBUTE + ":")) {
- attributes.addAttribute(
- atts.getURI(i), local, qname,
- atts.getType(i), atts.getValue(i));
- }
- }
-
- super.startElement(XMLConstants.NULL_NS_URI, upper, upper, attributes);
- }
-
- @Override
- public void endElement(String uri, String localName, String name)
- throws SAXException {
- String upper = localName.toUpperCase();
- super.endElement(XMLConstants.NULL_NS_URI, upper, upper);
- }
-
- @Override
- public void startPrefixMapping(String prefix, String uri) {
- }
-
- @Override
- public void endPrefixMapping(String prefix) {
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import javax.xml.XMLConstants;
+
+import org.apache.tika.sax.ContentHandlerDecorator;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Content handler decorator that downgrades XHTML elements to
+ * old-style HTML elements before passing them on to the decorated
+ * content handler. This downgrading consists of dropping all namespaces
+ * (and namespaced attributes) and uppercasing all element names.
+ * Used by the {@link HtmlParser} to make all incoming HTML look the same.
+ */
+class XHTMLDowngradeHandler extends ContentHandlerDecorator {
+
+ public XHTMLDowngradeHandler(ContentHandler handler) {
+ super(handler);
+ }
+
+ /** Forwards the element upper-cased and namespace-free, keeping only un-namespaced attributes. */
+ @Override
+ public void startElement(String uri, String localName, String name, Attributes atts)
+ throws SAXException {
+ String upper = localName.toUpperCase();
+
+ AttributesImpl attributes = new AttributesImpl();
+ for (int i = 0; i < atts.getLength(); i++) {
+ String local = atts.getLocalName(i);
+ String qname = atts.getQName(i);
+ // Keep only attributes in no namespace that are not xmlns declarations
+ if (XMLConstants.NULL_NS_URI.equals(atts.getURI(i))
+ && !local.equals(XMLConstants.XMLNS_ATTRIBUTE)
+ && !qname.startsWith(XMLConstants.XMLNS_ATTRIBUTE + ":")) {
+ attributes.addAttribute(
+ atts.getURI(i), local, qname, atts.getType(i), atts.getValue(i));
+ }
+ }
+
+ super.startElement(XMLConstants.NULL_NS_URI, upper, upper, attributes);
+ }
+
+ @Override
+ public void endElement(String uri, String localName, String name)
+ throws SAXException {
+ String upper = localName.toUpperCase();
+ super.endElement(XMLConstants.NULL_NS_URI, upper, upper);
+ }
+
+ @Override
+ public void startPrefixMapping(String prefix, String uri) {
+ }
+
+ @Override
+ public void endPrefixMapping(String prefix) {
+ }
+
+}
Propchange: lucene/tika/trunk/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/tika/trunk/src/main/java/org/apache/tika/parser/image/ImageParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/image/ImageParser.java?rev=721926&r1=721925&r2=721926&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/image/ImageParser.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/image/ImageParser.java Sun Nov 30 15:15:09 2008
@@ -1,58 +1,58 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.image;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Iterator;
-
-import javax.imageio.ImageIO;
-import javax.imageio.ImageReader;
-
-import org.apache.commons.io.input.CloseShieldInputStream;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-public class ImageParser implements Parser {
-
- public void parse(
- InputStream stream, ContentHandler handler, Metadata metadata)
- throws IOException, SAXException, TikaException {
- String type = metadata.get(Metadata.CONTENT_TYPE);
- if (type != null) {
- Iterator<ImageReader> iterator =
- ImageIO.getImageReadersByMIMEType(type);
- if (iterator.hasNext()) {
- ImageReader reader = iterator.next();
- reader.setInput(ImageIO.createImageInputStream(
- new CloseShieldInputStream(stream)));
- metadata.set("height", Integer.toString(reader.getHeight(0)));
- metadata.set("width", Integer.toString(reader.getWidth(0)));
- reader.dispose();
- }
- }
-
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
- xhtml.startDocument();
- xhtml.endDocument();
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.image;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Iterator;
+
+import javax.imageio.ImageIO;
+import javax.imageio.ImageReader;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class ImageParser implements Parser {
+
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata)
+ throws IOException, SAXException, TikaException {
+ String type = metadata.get(Metadata.CONTENT_TYPE);
+ if (type != null) {
+ Iterator<ImageReader> iterator =
+ ImageIO.getImageReadersByMIMEType(type);
+ if (iterator.hasNext()) {
+ ImageReader reader = iterator.next();
+ reader.setInput(ImageIO.createImageInputStream(
+ new CloseShieldInputStream(stream)));
+ metadata.set("height", Integer.toString(reader.getHeight(0)));
+ metadata.set("width", Integer.toString(reader.getWidth(0)));
+ reader.dispose();
+ }
+ }
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+ xhtml.endDocument();
+ }
+
+}
Propchange: lucene/tika/trunk/src/main/java/org/apache/tika/parser/image/ImageParser.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Cell.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Cell.java?rev=721926&r1=721925&r2=721926&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Cell.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Cell.java Sun Nov 30 15:15:09 2008
@@ -1,38 +1,38 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Cell of content. Classes that implement this interface are used by
- * Tika parsers (currently just the MS Excel parser) to keep track of
- * individual pieces of content before they are rendered to the XHTML
- * SAX event stream.
- */
-public interface Cell {
-
- /**
- * Renders the content to the given XHTML SAX event stream.
- *
- * @param handler
- * @throws SAXException
- */
- void render(XHTMLContentHandler handler) throws SAXException;
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Cell of content. Classes that implement this interface are used by
+ * Tika parsers (currently just the MS Excel parser) to keep track of
+ * individual pieces of content before they are rendered to the XHTML
+ * SAX event stream.
+ */
+public interface Cell {
+
+ /**
+ * Renders the content to the given XHTML SAX event stream.
+ *
+ * @param handler
+ * @throws SAXException
+ */
+ void render(XHTMLContentHandler handler) throws SAXException;
+
+}
Propchange: lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Cell.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java?rev=721926&r1=721925&r2=721926&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java Sun Nov 30 15:15:09 2008
@@ -1,37 +1,37 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Cell decorator.
- */
-public class CellDecorator implements Cell {
-
- private final Cell cell;
-
- public CellDecorator(Cell cell) {
- this.cell = cell;
- }
-
- public void render(XHTMLContentHandler handler) throws SAXException {
- cell.render(handler);
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Cell decorator.
+ */
+public class CellDecorator implements Cell {
+
+ private final Cell cell;
+
+ public CellDecorator(Cell cell) {
+ this.cell = cell;
+ }
+
+ public void render(XHTMLContentHandler handler) throws SAXException {
+ cell.render(handler);
+ }
+
+}
Propchange: lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java?rev=721926&r1=721925&r2=721926&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java Sun Nov 30 15:15:09 2008
@@ -1,397 +1,397 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-import java.awt.Point;
-import java.io.IOException;
-import java.io.Serializable;
-import java.util.ArrayList;
-import java.util.Comparator;
-import java.util.List;
-import java.util.Map;
-import java.util.SortedMap;
-import java.util.TreeMap;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
-import org.apache.poi.hssf.eventusermodel.HSSFListener;
-import org.apache.poi.hssf.eventusermodel.HSSFRequest;
-import org.apache.poi.hssf.record.BOFRecord;
-import org.apache.poi.hssf.record.BoundSheetRecord;
-import org.apache.poi.hssf.record.CellValueRecordInterface;
-import org.apache.poi.hssf.record.CountryRecord;
-import org.apache.poi.hssf.record.DateWindow1904Record;
-import org.apache.poi.hssf.record.EOFRecord;
-import org.apache.poi.hssf.record.ExtendedFormatRecord;
-import org.apache.poi.hssf.record.FormatRecord;
-import org.apache.poi.hssf.record.FormulaRecord;
-import org.apache.poi.hssf.record.HyperlinkRecord;
-import org.apache.poi.hssf.record.UnicodeString;
-//import org.apache.poi.hssf.record.HyperlinkRecord; // FIXME - requires POI release
-import org.apache.poi.hssf.record.LabelRecord;
-import org.apache.poi.hssf.record.LabelSSTRecord;
-import org.apache.poi.hssf.record.NumberRecord;
-import org.apache.poi.hssf.record.RKRecord;
-import org.apache.poi.hssf.record.Record;
-import org.apache.poi.hssf.record.SSTRecord;
-import org.apache.poi.poifs.filesystem.DocumentInputStream;
-import org.apache.poi.poifs.filesystem.POIFSFileSystem;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Excel parser implementation which uses POI's Event API
- * to handle the contents of a Workbook.
- * <p>
- * The Event API uses a much smaller memory footprint than
- * <code>HSSFWorkbook</code> when processing excel files
- * but at the cost of more complexity.
- * <p>
- * With the Event API a <i>listener</i> is registered for
- * specific record types and those records are created,
- * fired off to the listener and then discarded as the stream
- * is being processed.
- *
- * @see org.apache.poi.hssf.eventusermodel.HSSFListener
- * @see <a href="http://poi.apache.org/hssf/how-to.html#event_api">
- * POI Event API How To</a>
- */
-public class ExcelExtractor {
-
- /** Logging instance */
- private static final Log log = LogFactory.getLog(ExcelExtractor.class);
-
- /**
- * <code>true</code> if the HSSFListener should be registered
- * to listen for all records or <code>false</code> (the default)
- * if the listener should be configured to only receive specified
- * records.
- */
- private boolean listenForAllRecords = false;
-
- /**
- * Returns <code>true</code> if this parser is configured to listen
- * for all records instead of just the specified few.
- */
- public boolean isListenForAllRecords() {
- return listenForAllRecords;
- }
-
- /**
- * Specifies whether this parser should to listen for all
- * records or just for the specified few.
- * <p>
- * <strong>Note:</strong> Under normal operation this setting should
- * be <code>false</code> (the default), but you can experiment with
- * this setting for testing and debugging purposes.
- *
- * @param listenForAllRecords <code>true</code> if the HSSFListener
- * should be registered to listen for all records or <code>false</code>
- * if the listener should be configured to only receive specified records.
- */
- public void setListenForAllRecords(boolean listenForAllRecords) {
- this.listenForAllRecords = listenForAllRecords;
- }
-
- /**
- * Extracts text from an Excel Workbook writing the extracted content
- * to the specified {@link Appendable}.
- *
- * @param filesystem POI file system
- * @throws IOException if an error occurs processing the workbook
- * or writing the extracted content
- */
- protected void parse(POIFSFileSystem filesystem, XHTMLContentHandler xhtml)
- throws IOException, SAXException {
- log.debug("Starting listenForAllRecords=" + listenForAllRecords);
-
- // Set up listener and register the records we want to process
- TikaHSSFListener listener = new TikaHSSFListener(xhtml);
- HSSFRequest hssfRequest = new HSSFRequest();
- if (listenForAllRecords) {
- hssfRequest.addListenerForAllRecords(listener);
- } else {
- hssfRequest.addListener(listener, BOFRecord.sid);
- hssfRequest.addListener(listener, EOFRecord.sid);
- hssfRequest.addListener(listener, DateWindow1904Record.sid);
- hssfRequest.addListener(listener, CountryRecord.sid);
- hssfRequest.addListener(listener, BoundSheetRecord.sid);
- hssfRequest.addListener(listener, FormatRecord.sid);
- hssfRequest.addListener(listener, ExtendedFormatRecord.sid);
- hssfRequest.addListener(listener, SSTRecord.sid);
- hssfRequest.addListener(listener, FormulaRecord.sid);
- hssfRequest.addListener(listener, LabelRecord.sid);
- hssfRequest.addListener(listener, LabelSSTRecord.sid);
- hssfRequest.addListener(listener, NumberRecord.sid);
- hssfRequest.addListener(listener, RKRecord.sid);
- hssfRequest.addListener(listener, HyperlinkRecord.sid);
- }
-
- // Create event factory and process Workbook (fire events)
- DocumentInputStream documentInputStream = filesystem.createDocumentInputStream("Workbook");
- HSSFEventFactory eventFactory = new HSSFEventFactory();
-
- eventFactory.processEvents(hssfRequest, documentInputStream);
- listener.throwStoredException();
- }
-
- // ======================================================================
-
- /**
- * HSSF Listener implementation which processes the HSSF records.
- */
- private static class TikaHSSFListener implements HSSFListener, Serializable {
-
- /**
- * XHTML content handler to which the document content is rendered.
- */
- private final XHTMLContentHandler handler;
-
- /**
- * Potential exception thrown by the content handler. When set to
- * non-<code>null</code>, causes all subsequent HSSF records to be
- * ignored and the stored exception to be thrown when
- * {@link #throwStoredException()} is invoked.
- */
- private SAXException exception = null;
-
- private SSTRecord sstRecord;
-
- /**
- * List of worksheet names.
- */
- private List<String> sheetNames = new ArrayList<String>();
-
- /**
- * Index of the current worksheet within the workbook.
- * Used to find the worksheet name in the {@link #sheetNames} list.
- */
- private short currentSheetIndex;
-
- /**
- * Content of the current worksheet, or <code>null</code> if no
- * worksheet is currently active.
- */
- private SortedMap<Point, Cell> currentSheet = null;
-
- /**
- * Contstruct a new listener instance outputting parsed data to
- * the specified XHTML content handler.
- *
- * @param handler Destination to write the parsed output to
- */
- private TikaHSSFListener(XHTMLContentHandler handler) {
- this.handler = handler;
- }
-
- /**
- * Process a HSSF record.
- *
- * @param record HSSF Record
- */
- public void processRecord(Record record) {
- if (exception == null) {
- try {
- if (log.isDebugEnabled()) {
- log.debug(record.toString());
- }
- internalProcessRecord(record);
- } catch (SAXException e) {
- exception = e;
- }
- }
- }
-
- public void throwStoredException() throws SAXException {
- if (exception != null) {
- throw exception;
- }
- }
-
- private void internalProcessRecord(Record record) throws SAXException {
- switch (record.getSid()) {
- case BOFRecord.sid: // start of workbook, worksheet etc. records
- BOFRecord bof = (BOFRecord) record;
- if (bof.getType() == BOFRecord.TYPE_WORKBOOK) {
- currentSheetIndex = -1;
- } else if (bof.getType() == BOFRecord.TYPE_WORKSHEET) {
- currentSheetIndex++;
- currentSheet =
- new TreeMap<Point, Cell>(new PointComparator());
- }
- break;
-
- case EOFRecord.sid: // end of workbook, worksheet etc. records
- if (currentSheet != null && !currentSheet.isEmpty()) {
- processSheet();
- }
- currentSheet = null;
- break;
-
- case BoundSheetRecord.sid: // Worksheet index record
- BoundSheetRecord boundSheetRecord = (BoundSheetRecord) record;
- sheetNames.add(boundSheetRecord.getSheetname());
- break;
-
- case SSTRecord.sid: // holds all the strings for LabelSSTRecords
- sstRecord = (SSTRecord) record;
- break;
-
- case FormulaRecord.sid: // Cell value from a formula
- FormulaRecord formula = (FormulaRecord) record;
- addCell(record, new NumberCell(formula.getValue()));
- break;
-
- case LabelRecord.sid: // strings stored directly in the cell
- LabelRecord label = (LabelRecord) record;
- addTextCell(record, label.getValue());
- break;
-
- case LabelSSTRecord.sid: // Ref. a string in the shared string table
- LabelSSTRecord sst = (LabelSSTRecord) record;
- UnicodeString unicode = sstRecord.getString(sst.getSSTIndex());
- addTextCell(record, unicode.getString());
- break;
-
- case NumberRecord.sid: // Contains a numeric cell value
- NumberRecord number = (NumberRecord) record;
- addCell(record, new NumberCell(number.getValue()));
- break;
-
- case RKRecord.sid: // Excel internal number record
- RKRecord rk = (RKRecord) record;
- addCell(record, new NumberCell(rk.getRKNumber()));
- break;
-
- case HyperlinkRecord.sid: // holds a URL associated with a cell
- if (currentSheet != null) {
- HyperlinkRecord link = (HyperlinkRecord) record;
- Point point =
- new Point(link.getFirstColumn(), link.getFirstRow());
- Cell cell = currentSheet.get(point);
- if (cell != null) {
- addCell(record, new LinkedCell(cell, link.getAddress()));
- }
- }
- break;
- }
- }
-
- /**
- * Adds the given cell (unless <code>null</code>) to the current
- * worksheet (if any) at the position (if any) of the given record.
- *
- * @param record record that holds the cell value
- * @param cell cell value (or <code>null</code>)
- */
- private void addCell(Record record, Cell cell) {
- if (currentSheet == null) {
- // Ignore cells outside sheets
- } else if (cell == null) {
- // Ignore empty cells
- } else if (record instanceof CellValueRecordInterface) {
- CellValueRecordInterface value =
- (CellValueRecordInterface) record;
- Point point = new Point(value.getColumn(), value.getRow());
- currentSheet.put(point, cell);
- }
- }
-
- /**
- * Adds a text cell with the given text comment. The given text
- * is trimmed, and ignored if <code>null</code> or empty.
- *
- * @param record record that holds the text value
- * @param text text content, may be <code>null</code>
- */
- private void addTextCell(Record record, String text) {
- if (text != null) {
- text = text.trim();
- if (text.length() > 0) {
- addCell(record, new TextCell(text));
- }
- }
- }
-
- /**
- * Process an excel sheet.
- *
- * @throws SAXException if an error occurs
- */
- private void processSheet() throws SAXException {
- // Sheet Start
- handler.startElement("div", "class", "page");
- if (currentSheetIndex < sheetNames.size()) {
- handler.element("h1", sheetNames.get(currentSheetIndex));
- }
- handler.characters("\n");
- handler.startElement("table");
- handler.startElement("tbody");
-
- // Process Rows
- int currentRow = 1;
- int currentColumn = 1;
- handler.startElement("tr");
- handler.startElement("td");
- for (Map.Entry<Point, Cell> entry : currentSheet.entrySet()) {
- while (currentRow < entry.getKey().y) {
- handler.endElement("td");
- handler.endElement("tr");
- handler.characters("\n");
- handler.startElement("tr");
- handler.startElement("td");
- currentRow++;
- currentColumn = 1;
- }
-
- while (currentColumn < entry.getKey().x) {
- handler.endElement("td");
- handler.characters("\t");
- handler.startElement("td");
- currentColumn++;
- }
-
- entry.getValue().render(handler);
- }
- handler.endElement("td");
- handler.endElement("tr");
-
- // Sheet End
- handler.endElement("tbody");
- handler.endElement("table");
- handler.endElement("div");
- handler.characters("\n");
- }
- }
-
- /**
- * Utility comparator for points.
- */
- private static class PointComparator implements Comparator<Point> {
-
- public int compare(Point a, Point b) {
- int diff = a.y - b.y;
- if (diff == 0) {
- diff = a.x - b.x;
- }
- return diff;
- }
-
- }
-
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.awt.Point;
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Map;
+import java.util.SortedMap;
+import java.util.TreeMap;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
+import org.apache.poi.hssf.eventusermodel.HSSFListener;
+import org.apache.poi.hssf.eventusermodel.HSSFRequest;
+import org.apache.poi.hssf.record.BOFRecord;
+import org.apache.poi.hssf.record.BoundSheetRecord;
+import org.apache.poi.hssf.record.CellValueRecordInterface;
+import org.apache.poi.hssf.record.CountryRecord;
+import org.apache.poi.hssf.record.DateWindow1904Record;
+import org.apache.poi.hssf.record.EOFRecord;
+import org.apache.poi.hssf.record.ExtendedFormatRecord;
+import org.apache.poi.hssf.record.FormatRecord;
+import org.apache.poi.hssf.record.FormulaRecord;
+import org.apache.poi.hssf.record.HyperlinkRecord;
+import org.apache.poi.hssf.record.UnicodeString;
+//import org.apache.poi.hssf.record.HyperlinkRecord; // FIXME - requires POI release
+import org.apache.poi.hssf.record.LabelRecord;
+import org.apache.poi.hssf.record.LabelSSTRecord;
+import org.apache.poi.hssf.record.NumberRecord;
+import org.apache.poi.hssf.record.RKRecord;
+import org.apache.poi.hssf.record.Record;
+import org.apache.poi.hssf.record.SSTRecord;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Excel parser implementation which uses POI's Event API
+ * to handle the contents of a Workbook.
+ * <p>
+ * The Event API uses a much smaller memory footprint than
+ * <code>HSSFWorkbook</code> when processing excel files
+ * but at the cost of more complexity.
+ * <p>
+ * With the Event API a <i>listener</i> is registered for
+ * specific record types and those records are created,
+ * fired off to the listener and then discarded as the stream
+ * is being processed.
+ *
+ * @see org.apache.poi.hssf.eventusermodel.HSSFListener
+ * @see <a href="http://poi.apache.org/hssf/how-to.html#event_api">
+ * POI Event API How To</a>
+ */
+public class ExcelExtractor {
+
+ /** Logging instance */
+ private static final Log log = LogFactory.getLog(ExcelExtractor.class);
+
+ /**
+ * <code>true</code> if the HSSFListener should be registered
+ * to listen for all records or <code>false</code> (the default)
+ * if the listener should be configured to only receive specified
+ * records.
+ */
+ private boolean listenForAllRecords = false;
+
+ /**
+ * Returns <code>true</code> if this parser is configured to listen
+ * for all records instead of just the specified few.
+ */
+ public boolean isListenForAllRecords() {
+ return listenForAllRecords;
+ }
+
+ /**
+ * Specifies whether this parser should to listen for all
+ * records or just for the specified few.
+ * <p>
+ * <strong>Note:</strong> Under normal operation this setting should
+ * be <code>false</code> (the default), but you can experiment with
+ * this setting for testing and debugging purposes.
+ *
+ * @param listenForAllRecords <code>true</code> if the HSSFListener
+ * should be registered to listen for all records or <code>false</code>
+ * if the listener should be configured to only receive specified records.
+ */
+ public void setListenForAllRecords(boolean listenForAllRecords) {
+ this.listenForAllRecords = listenForAllRecords;
+ }
+
+ /**
+ * Extracts text from an Excel Workbook writing the extracted content
+ * to the specified {@link Appendable}.
+ *
+ * @param filesystem POI file system
+ * @throws IOException if an error occurs processing the workbook
+ * or writing the extracted content
+ */
+ protected void parse(POIFSFileSystem filesystem, XHTMLContentHandler xhtml)
+ throws IOException, SAXException {
+ log.debug("Starting listenForAllRecords=" + listenForAllRecords);
+
+ // Set up listener and register the records we want to process
+ TikaHSSFListener listener = new TikaHSSFListener(xhtml);
+ HSSFRequest hssfRequest = new HSSFRequest();
+ if (listenForAllRecords) {
+ hssfRequest.addListenerForAllRecords(listener);
+ } else {
+ hssfRequest.addListener(listener, BOFRecord.sid);
+ hssfRequest.addListener(listener, EOFRecord.sid);
+ hssfRequest.addListener(listener, DateWindow1904Record.sid);
+ hssfRequest.addListener(listener, CountryRecord.sid);
+ hssfRequest.addListener(listener, BoundSheetRecord.sid);
+ hssfRequest.addListener(listener, FormatRecord.sid);
+ hssfRequest.addListener(listener, ExtendedFormatRecord.sid);
+ hssfRequest.addListener(listener, SSTRecord.sid);
+ hssfRequest.addListener(listener, FormulaRecord.sid);
+ hssfRequest.addListener(listener, LabelRecord.sid);
+ hssfRequest.addListener(listener, LabelSSTRecord.sid);
+ hssfRequest.addListener(listener, NumberRecord.sid);
+ hssfRequest.addListener(listener, RKRecord.sid);
+ hssfRequest.addListener(listener, HyperlinkRecord.sid);
+ }
+
+ // Create event factory and process Workbook (fire events)
+ DocumentInputStream documentInputStream = filesystem.createDocumentInputStream("Workbook");
+ HSSFEventFactory eventFactory = new HSSFEventFactory();
+
+ eventFactory.processEvents(hssfRequest, documentInputStream);
+ listener.throwStoredException();
+ }
+
+ // ======================================================================
+
+ /**
+ * HSSF Listener implementation which processes the HSSF records.
+ */
+ private static class TikaHSSFListener implements HSSFListener, Serializable {
+
+ /**
+ * XHTML content handler to which the document content is rendered.
+ */
+ private final XHTMLContentHandler handler;
+
+ /**
+ * Potential exception thrown by the content handler. When set to
+ * non-<code>null</code>, causes all subsequent HSSF records to be
+ * ignored and the stored exception to be thrown when
+ * {@link #throwStoredException()} is invoked.
+ */
+ private SAXException exception = null;
+
+ private SSTRecord sstRecord;
+
+ /**
+ * List of worksheet names.
+ */
+ private List<String> sheetNames = new ArrayList<String>();
+
+ /**
+ * Index of the current worksheet within the workbook.
+ * Used to find the worksheet name in the {@link #sheetNames} list.
+ */
+ private short currentSheetIndex;
+
+ /**
+ * Content of the current worksheet, or <code>null</code> if no
+ * worksheet is currently active.
+ */
+ private SortedMap<Point, Cell> currentSheet = null;
+
+ /**
+ * Contstruct a new listener instance outputting parsed data to
+ * the specified XHTML content handler.
+ *
+ * @param handler Destination to write the parsed output to
+ */
+ private TikaHSSFListener(XHTMLContentHandler handler) {
+ this.handler = handler;
+ }
+
+ /**
+ * Process a HSSF record.
+ *
+ * @param record HSSF Record
+ */
+ public void processRecord(Record record) {
+ if (exception == null) {
+ try {
+ if (log.isDebugEnabled()) {
+ log.debug(record.toString());
+ }
+ internalProcessRecord(record);
+ } catch (SAXException e) {
+ exception = e;
+ }
+ }
+ }
+
+ public void throwStoredException() throws SAXException {
+ if (exception != null) {
+ throw exception;
+ }
+ }
+
+ private void internalProcessRecord(Record record) throws SAXException {
+ switch (record.getSid()) {
+ case BOFRecord.sid: // start of workbook, worksheet etc. records
+ BOFRecord bof = (BOFRecord) record;
+ if (bof.getType() == BOFRecord.TYPE_WORKBOOK) {
+ currentSheetIndex = -1;
+ } else if (bof.getType() == BOFRecord.TYPE_WORKSHEET) {
+ currentSheetIndex++;
+ currentSheet =
+ new TreeMap<Point, Cell>(new PointComparator());
+ }
+ break;
+
+ case EOFRecord.sid: // end of workbook, worksheet etc. records
+ if (currentSheet != null && !currentSheet.isEmpty()) {
+ processSheet();
+ }
+ currentSheet = null;
+ break;
+
+ case BoundSheetRecord.sid: // Worksheet index record
+ BoundSheetRecord boundSheetRecord = (BoundSheetRecord) record;
+ sheetNames.add(boundSheetRecord.getSheetname());
+ break;
+
+ case SSTRecord.sid: // holds all the strings for LabelSSTRecords
+ sstRecord = (SSTRecord) record;
+ break;
+
+ case FormulaRecord.sid: // Cell value from a formula
+ FormulaRecord formula = (FormulaRecord) record;
+ addCell(record, new NumberCell(formula.getValue()));
+ break;
+
+ case LabelRecord.sid: // strings stored directly in the cell
+ LabelRecord label = (LabelRecord) record;
+ addTextCell(record, label.getValue());
+ break;
+
+ case LabelSSTRecord.sid: // Ref. a string in the shared string table
+ LabelSSTRecord sst = (LabelSSTRecord) record;
+ UnicodeString unicode = sstRecord.getString(sst.getSSTIndex());
+ addTextCell(record, unicode.getString());
+ break;
+
+ case NumberRecord.sid: // Contains a numeric cell value
+ NumberRecord number = (NumberRecord) record;
+ addCell(record, new NumberCell(number.getValue()));
+ break;
+
+ case RKRecord.sid: // Excel internal number record
+ RKRecord rk = (RKRecord) record;
+ addCell(record, new NumberCell(rk.getRKNumber()));
+ break;
+
+ case HyperlinkRecord.sid: // holds a URL associated with a cell
+ if (currentSheet != null) {
+ HyperlinkRecord link = (HyperlinkRecord) record;
+ Point point =
+ new Point(link.getFirstColumn(), link.getFirstRow());
+ Cell cell = currentSheet.get(point);
+ if (cell != null) {
+ addCell(record, new LinkedCell(cell, link.getAddress()));
+ }
+ }
+ break;
+ }
+ }
+
+ /**
+ * Adds the given cell (unless <code>null</code>) to the current
+ * worksheet (if any) at the position (if any) of the given record.
+ *
+ * @param record record that holds the cell value
+ * @param cell cell value (or <code>null</code>)
+ */
+ private void addCell(Record record, Cell cell) {
+ if (currentSheet == null) {
+ // Ignore cells outside sheets
+ } else if (cell == null) {
+ // Ignore empty cells
+ } else if (record instanceof CellValueRecordInterface) {
+ CellValueRecordInterface value =
+ (CellValueRecordInterface) record;
+ Point point = new Point(value.getColumn(), value.getRow());
+ currentSheet.put(point, cell);
+ }
+ }
+
+ /**
+ * Adds a text cell with the given text comment. The given text
+ * is trimmed, and ignored if <code>null</code> or empty.
+ *
+ * @param record record that holds the text value
+ * @param text text content, may be <code>null</code>
+ */
+ private void addTextCell(Record record, String text) {
+ if (text != null) {
+ text = text.trim();
+ if (text.length() > 0) {
+ addCell(record, new TextCell(text));
+ }
+ }
+ }
+
+ /**
+ * Process an excel sheet.
+ *
+ * @throws SAXException if an error occurs
+ */
+ private void processSheet() throws SAXException {
+ // Sheet Start
+ handler.startElement("div", "class", "page");
+ if (currentSheetIndex < sheetNames.size()) {
+ handler.element("h1", sheetNames.get(currentSheetIndex));
+ }
+ handler.characters("\n");
+ handler.startElement("table");
+ handler.startElement("tbody");
+
+ // Process Rows
+ int currentRow = 1;
+ int currentColumn = 1;
+ handler.startElement("tr");
+ handler.startElement("td");
+ for (Map.Entry<Point, Cell> entry : currentSheet.entrySet()) {
+ while (currentRow < entry.getKey().y) {
+ handler.endElement("td");
+ handler.endElement("tr");
+ handler.characters("\n");
+ handler.startElement("tr");
+ handler.startElement("td");
+ currentRow++;
+ currentColumn = 1;
+ }
+
+ while (currentColumn < entry.getKey().x) {
+ handler.endElement("td");
+ handler.characters("\t");
+ handler.startElement("td");
+ currentColumn++;
+ }
+
+ entry.getValue().render(handler);
+ }
+ handler.endElement("td");
+ handler.endElement("tr");
+
+ // Sheet End
+ handler.endElement("tbody");
+ handler.endElement("table");
+ handler.endElement("div");
+ handler.characters("\n");
+ }
+ }
+
+ /**
+ * Utility comparator for points.
+ */
+ private static class PointComparator implements Comparator<Point> {
+
+ public int compare(Point a, Point b) {
+ int diff = a.y - b.y;
+ if (diff == 0) {
+ diff = a.x - b.x;
+ }
+ return diff;
+ }
+
+ }
+
+}
Propchange: lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/LinkedCell.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/LinkedCell.java?rev=721926&r1=721925&r2=721926&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/LinkedCell.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/LinkedCell.java Sun Nov 30 15:15:09 2008
@@ -1,40 +1,40 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Linked cell. This class decorates another content cell with a hyperlink.
- */
-public class LinkedCell extends CellDecorator {
-
- private final String link;
-
- public LinkedCell(Cell cell, String link) {
- super(cell);
- this.link = link;
- }
-
- public void render(XHTMLContentHandler handler) throws SAXException {
- handler.startElement("a", "href", link);
- super.render(handler);
- handler.endElement("a");
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Linked cell. This class decorates another content cell with a hyperlink.
+ */
+public class LinkedCell extends CellDecorator {
+
+ private final String link;
+
+ public LinkedCell(Cell cell, String link) {
+ super(cell);
+ this.link = link;
+ }
+
+ public void render(XHTMLContentHandler handler) throws SAXException {
+ handler.startElement("a", "href", link);
+ super.render(handler);
+ handler.endElement("a");
+ }
+
+}
Propchange: lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/LinkedCell.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/NumberCell.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/NumberCell.java?rev=721926&r1=721925&r2=721926&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/NumberCell.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/NumberCell.java Sun Nov 30 15:15:09 2008
@@ -1,46 +1,46 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-import java.text.NumberFormat;
-
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Number cell.
- */
-public class NumberCell implements Cell {
-
- private final double number;
-
- private final NumberFormat format;
-
- public NumberCell(double number, NumberFormat format) {
- this.number = number;
- this.format = format;
- }
-
- public NumberCell(double number) {
- this(number, NumberFormat.getInstance());
- }
-
- public void render(XHTMLContentHandler handler) throws SAXException {
- handler.characters(format.format(number));
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.text.NumberFormat;
+
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Number cell.
+ */
+public class NumberCell implements Cell {
+
+ private final double number;
+
+ private final NumberFormat format;
+
+ public NumberCell(double number, NumberFormat format) {
+ this.number = number;
+ this.format = format;
+ }
+
+ public NumberCell(double number) {
+ this(number, NumberFormat.getInstance());
+ }
+
+ public void render(XHTMLContentHandler handler) throws SAXException {
+ handler.characters(format.format(number));
+ }
+
+}
Propchange: lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/NumberCell.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java?rev=721926&r1=721925&r2=721926&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java Sun Nov 30 15:15:09 2008
@@ -1,97 +1,97 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-import java.io.IOException;
-
-import org.apache.poi.hsmf.datatypes.Chunks;
-import org.apache.poi.hsmf.datatypes.StringChunk;
-import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
-import org.apache.poi.hsmf.parsers.POIFSChunkParser;
-import org.apache.poi.poifs.filesystem.POIFSFileSystem;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Outlook Message Parser.
- */
-class OutlookExtractor {
-
- private static final Chunks CHUNKS = Chunks.getInstance();
-
- private final POIFSChunkParser parser;
-
- public OutlookExtractor(POIFSFileSystem filesystem) throws TikaException {
- try {
- this.parser = new POIFSChunkParser(filesystem);
- } catch (IOException e) {
- throw new TikaException("Failed to parse Outlook chunks", e);
- }
- }
-
- public void parse(XHTMLContentHandler xhtml, Metadata metadata)
- throws TikaException, SAXException {
- String subject = getChunk(CHUNKS.subjectChunk);
- String from = getChunk(CHUNKS.displayFromChunk);
-
- metadata.set(Metadata.AUTHOR, from);
- metadata.set(Metadata.TITLE, subject);
- metadata.set(Metadata.SUBJECT, getChunk(CHUNKS.conversationTopic));
-
- xhtml.element("h1", subject);
- xhtml.characters("\n");
-
- xhtml.startElement("dl");
- header(xhtml, "From", from);
- header(xhtml, "To", getChunk(CHUNKS.displayToChunk));
- header(xhtml, "Cc", getChunk(CHUNKS.displayCCChunk));
- header(xhtml, "Bcc", getChunk(CHUNKS.displayBCCChunk));
- xhtml.endElement("dl");
- xhtml.characters("\n");
-
- xhtml.element("p", getChunk(CHUNKS.textBodyChunk));
- }
-
- private void header(XHTMLContentHandler xhtml, String key, String value)
- throws SAXException {
- if (value.length() > 0) {
- xhtml.element("dt", key);
- xhtml.characters("\t");
- xhtml.element("dd", value);
- xhtml.characters("\n");
- }
- }
-
- /**
- * Returns the content of the identified string chunk in the
- * current document. Returns the empty string if the identified
- * chunk does not exist in the current document.
- *
- * @param chunk string chunk identifier
- * @return content of the identified chunk, or the empty string
- */
- private String getChunk(StringChunk chunk) {
- try {
- return parser.getDocumentNode(chunk).toString();
- } catch (ChunkNotFoundException e) {
- return "";
- }
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.IOException;
+
+import org.apache.poi.hsmf.datatypes.Chunks;
+import org.apache.poi.hsmf.datatypes.StringChunk;
+import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
+import org.apache.poi.hsmf.parsers.POIFSChunkParser;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Extracts metadata and text content from an Outlook message.
+ * <p>
+ * The message is read through a POI {@link POIFSChunkParser} created on
+ * the file system passed to the constructor; an {@link IOException} raised
+ * while creating it is rethrown as a {@link TikaException} with the
+ * original exception as its cause.
+ * <p>
+ * {@link #parse(XHTMLContentHandler, Metadata)} stores the display-from
+ * value as the author, the subject as the title and the conversation topic
+ * as the subject of the given metadata. It then emits the subject as an
+ * <code>h1</code> element, a <code>dl</code> list with one
+ * <code>dt</code>/<code>dd</code> pair for each non-empty From, To, Cc and
+ * Bcc value, and finally the text body as a single <code>p</code> element.
+ * Chunks that are missing from the message are handled as empty strings.
+ */
+class OutlookExtractor {
+
+ private static final Chunks CHUNKS = Chunks.getInstance();
+
+ private final POIFSChunkParser parser;
+
+ public OutlookExtractor(POIFSFileSystem filesystem) throws TikaException {
+ try {
+ this.parser = new POIFSChunkParser(filesystem);
+ } catch (IOException e) {
+ throw new TikaException("Failed to parse Outlook chunks", e);
+ }
+ }
+
+ public void parse(XHTMLContentHandler xhtml, Metadata metadata)
+ throws TikaException, SAXException {
+ String subject = getChunk(CHUNKS.subjectChunk);
+ String from = getChunk(CHUNKS.displayFromChunk);
+
+ metadata.set(Metadata.AUTHOR, from);
+ metadata.set(Metadata.TITLE, subject);
+ metadata.set(Metadata.SUBJECT, getChunk(CHUNKS.conversationTopic));
+
+ xhtml.element("h1", subject);
+ xhtml.characters("\n");
+
+ xhtml.startElement("dl");
+ header(xhtml, "From", from);
+ header(xhtml, "To", getChunk(CHUNKS.displayToChunk));
+ header(xhtml, "Cc", getChunk(CHUNKS.displayCCChunk));
+ header(xhtml, "Bcc", getChunk(CHUNKS.displayBCCChunk));
+ xhtml.endElement("dl");
+ xhtml.characters("\n");
+
+ xhtml.element("p", getChunk(CHUNKS.textBodyChunk));
+ }
+
+ private void header(XHTMLContentHandler xhtml, String key, String value)
+ throws SAXException {
+ if (value.length() > 0) {
+ xhtml.element("dt", key);
+ xhtml.characters("\t");
+ xhtml.element("dd", value);
+ xhtml.characters("\n");
+ }
+ }
+
+ /**
+ * Returns the content of the identified string chunk in the
+ * current document. Returns the empty string if the identified
+ * chunk does not exist in the current document.
+ * <p>
+ * The value is the <code>toString()</code> form of the node returned by
+ * the chunk parser. A {@link ChunkNotFoundException} thrown by the
+ * parser is caught here and mapped to the empty string, which lets
+ * callers test the result with <code>length()</code> without a null
+ * check.
+ *
+ * @param chunk string chunk identifier
+ * @return content of the identified chunk, or the empty string
+ */
+ private String getChunk(StringChunk chunk) {
+ try {
+ return parser.getDocumentNode(chunk).toString();
+ } catch (ChunkNotFoundException e) {
+ return "";
+ }
+ }
+
+}
Propchange: lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/TextCell.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/TextCell.java?rev=721926&r1=721925&r2=721926&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/TextCell.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/TextCell.java Sun Nov 30 15:15:09 2008
@@ -1,37 +1,37 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Text cell.
- */
-public class TextCell implements Cell {
-
- private final String text;
-
- public TextCell(String text) {
- this.text = text;
- }
-
- public void render(XHTMLContentHandler handler) throws SAXException {
- handler.characters(text);
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * {@link Cell} implementation that holds a text value.
+ * <p>
+ * The string passed to the constructor is kept unchanged, and rendering
+ * writes it to the given handler as character content.
+ * <p>
+ * NOTE(review): the text is not checked for <code>null</code> in this
+ * class, so a null value is passed straight to the handler - confirm
+ * that callers never construct a cell with a null string.
+ */
+public class TextCell implements Cell {
+
+ private final String text;
+
+ public TextCell(String text) {
+ this.text = text;
+ }
+
+ public void render(XHTMLContentHandler handler) throws SAXException {
+ handler.characters(text);
+ }
+
+}
Propchange: lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/TextCell.java
------------------------------------------------------------------------------
svn:eol-style = native