You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/12/16 02:02:09 UTC
svn commit: r891092 - in /lucene/tika/trunk/tika-parsers: ./
src/main/java/org/apache/tika/parser/microsoft/ooxml/
Author: jukka
Date: Wed Dec 16 01:02:09 2009
New Revision: 891092
URL: http://svn.apache.org/viewvc?rev=891092&view=rev
Log:
TIKA-353: Upgrade to POI 3.6
Modified:
lucene/tika/trunk/tika-parsers/pom.xml
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
Modified: lucene/tika/trunk/tika-parsers/pom.xml
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/pom.xml?rev=891092&r1=891091&r2=891092&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/pom.xml (original)
+++ lucene/tika/trunk/tika-parsers/pom.xml Wed Dec 16 01:02:09 2009
@@ -35,7 +35,7 @@
<url>http://lucene.apache.org/tika/</url>
<properties>
- <poi.version>3.5-FINAL</poi.version>
+ <poi.version>3.6</poi.version>
</properties>
<dependencies>
Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java?rev=891092&r1=891091&r2=891092&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java Wed Dec 16 01:02:09 2009
@@ -16,6 +16,8 @@
*/
package org.apache.tika.parser.microsoft.ooxml;
+import java.util.Locale;
+
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.xslf.XSLFSlideShow;
@@ -31,7 +33,8 @@
*/
public class OOXMLExtractorFactory {
- public static OOXMLExtractor createExtractor(POIXMLTextExtractor extractor) {
+ public static OOXMLExtractor createExtractor(
+ POIXMLTextExtractor extractor, Locale locale) {
POIXMLDocument document = extractor.getDocument();
if (document instanceof XSLFSlideShow) {
@@ -39,7 +42,7 @@
(XSLFPowerPointExtractor) extractor);
} else if (document instanceof XSSFWorkbook) {
return new XSSFExcelExtractorDecorator(
- (XSSFExcelExtractor) extractor);
+ (XSSFExcelExtractor) extractor, locale);
} else if (document instanceof XWPFDocument) {
return new XWPFWordExtractorDecorator((XWPFWordExtractor) extractor);
} else {
Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java?rev=891092&r1=891091&r2=891092&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java Wed Dec 16 01:02:09 2009
@@ -18,6 +18,7 @@
import java.io.IOException;
import java.io.InputStream;
+import java.util.Locale;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.extractor.ExtractorFactory;
@@ -42,9 +43,10 @@
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
try {
- OOXMLExtractor extractor = OOXMLExtractorFactory
- .createExtractor((POIXMLTextExtractor) ExtractorFactory
- .createExtractor(stream));
+ Locale locale = context.get(Locale.class, Locale.getDefault());
+ OOXMLExtractor extractor = OOXMLExtractorFactory.createExtractor(
+ (POIXMLTextExtractor) ExtractorFactory.createExtractor(stream),
+ locale);
extractor.getMetadataExtractor().extract(metadata);
extractor.getXHTML(handler, metadata);
} catch (InvalidFormatException e) {
Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java?rev=891092&r1=891091&r2=891092&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java Wed Dec 16 01:02:09 2009
@@ -17,7 +17,9 @@
package org.apache.tika.parser.microsoft.ooxml;
import java.io.IOException;
+import java.text.NumberFormat;
import java.util.Iterator;
+import java.util.Locale;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.ss.usermodel.Cell;
@@ -34,8 +36,18 @@
public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
- public XSSFExcelExtractorDecorator(XSSFExcelExtractor extractor) {
+ /**
+ * Format for rendering numbers in the worksheet. Built from the locale
+ * passed to the constructor, so output follows that locale's conventions.
+ *
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-103">TIKA-103</a>
+ */
+ private final NumberFormat format;
+
+ public XSSFExcelExtractorDecorator(
+ XSSFExcelExtractor extractor, Locale locale) {
super(extractor, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
+ this.format = NumberFormat.getInstance(locale);
}
/**
@@ -67,10 +79,16 @@
xhtml.startElement("td");
Cell cell = ri.next();
- if (cell.getCellType() == Cell.CELL_TYPE_FORMULA
- || cell.getCellType() == Cell.CELL_TYPE_STRING) {
+ int type = cell.getCellType();
+ if (type == Cell.CELL_TYPE_FORMULA) {
+ type = cell.getCachedFormulaResultType();
+ }
+ if (type == Cell.CELL_TYPE_STRING) {
xhtml.characters(cell.getRichStringCellValue()
.getString());
+ } else if (type == Cell.CELL_TYPE_NUMERIC) {
+ xhtml.characters(
+ format.format(cell.getNumericCellValue()));
} else {
XSSFCell xc = (XSSFCell) cell;
String rawValue = xc.getRawValue();