You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2007/11/25 23:52:10 UTC
svn commit: r598075 - in /incubator/tika/trunk: ./
src/main/java/org/apache/tika/parser/microsoft/
src/main/java/org/apache/tika/parser/opendocument/
src/main/java/org/apache/tika/parser/xml/
Author: jukka
Date: Sun Nov 25 14:52:09 2007
New Revision: 598075
URL: http://svn.apache.org/viewvc?rev=598075&view=rev
Log:
TIKA-102 - Parser implementations loading a large amount of content into a single String could be problematic
- Patch by Niall Pemberton
Modified:
incubator/tika/trunk/CHANGES.txt
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointExtractor.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Word6Extractor.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordTextBuffer.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java
Modified: incubator/tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=598075&r1=598074&r2=598075&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Sun Nov 25 14:52:09 2007
@@ -130,3 +130,5 @@
59. TIKA-101 - Improve site and build (mattmann)
+60. TIKA-102 - Parser implementations loading a large amount of content
+ into a single String could be problematic (Niall Pemberton)
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java?rev=598075&r1=598074&r2=598075&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java Sun Nov 25 14:52:09 2007
@@ -33,31 +33,29 @@
return "application/vnd.ms-excel";
}
- protected String extractText(POIFSFileSystem filesystem) throws IOException{
- StringBuilder builder = new StringBuilder();
+ protected void extractText(POIFSFileSystem filesystem, Appendable builder) throws IOException{
extractText(new HSSFWorkbook(filesystem), builder);
- return builder.toString();
}
- private void extractText(HSSFWorkbook book, StringBuilder builder) {
+ private void extractText(HSSFWorkbook book, Appendable builder) throws IOException {
for (int i = 0; book != null && i < book.getNumberOfSheets(); i++) {
extractText(book.getSheetAt(i), builder);
}
}
- private void extractText(HSSFSheet sheet, StringBuilder builder) {
+ private void extractText(HSSFSheet sheet, Appendable builder) throws IOException {
for (int i = 0; sheet != null && i <= sheet.getLastRowNum(); i++) {
extractText(sheet.getRow(i), builder);
}
}
- private void extractText(HSSFRow row, StringBuilder builder) {
+ private void extractText(HSSFRow row, Appendable builder) throws IOException {
for (short i = 0; row != null && i < row.getLastCellNum(); i++) {
extractText(row.getCell(i), builder);
}
}
- private void extractText(HSSFCell cell, StringBuilder builder) {
+ private void extractText(HSSFCell cell, Appendable builder) throws IOException {
if (cell != null) {
switch (cell.getCellType()) {
case HSSFCell.CELL_TYPE_STRING:
@@ -73,14 +71,11 @@
}
}
- private void addText(String text, StringBuilder builder) {
+ private void addText(String text, Appendable builder) throws IOException {
if (text != null) {
text = text.trim();
if (text.length() > 0) {
- if (builder.length() > 0) {
- builder.append(' ');
- }
- builder.append(text);
+ builder.append(text).append(' ');
}
}
}
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=598075&r1=598074&r2=598075&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java Sun Nov 25 14:52:09 2007
@@ -29,6 +29,7 @@
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.AppendableAdaptor;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -56,7 +57,9 @@
XHTMLContentHandler xhtml =
new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
- xhtml.element("p", extractText(filesystem));
+ xhtml.startElement("p");
+ extractText(filesystem, new AppendableAdaptor(xhtml));
+ xhtml.endElement("p");
xhtml.endDocument();
}
@@ -70,7 +73,7 @@
/**
* Extracts the text content from a Microsoft document input stream.
*/
- protected abstract String extractText(POIFSFileSystem filesystem)
+ protected abstract void extractText(POIFSFileSystem filesystem, Appendable appendable)
throws IOException, TikaException;
private void getMetadata(
@@ -177,4 +180,4 @@
}
}
-}
\ No newline at end of file
+}
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointExtractor.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointExtractor.java?rev=598075&r1=598074&r2=598075&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointExtractor.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointExtractor.java Sun Nov 25 14:52:09 2007
@@ -32,12 +32,12 @@
static Logger LOG = Logger.getRootLogger();
/** Buffer holding the content of the file */
- private final StringBuilder builder;
+ private final Appendable builder;
/**
* Constructs Listener to get content of PowerPoint file.
*/
- public PowerPointExtractor(StringBuilder builder) {
+ public PowerPointExtractor(Appendable builder) {
this.builder = builder;
}
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java?rev=598075&r1=598074&r2=598075&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java Sun Nov 25 14:52:09 2007
@@ -35,8 +35,7 @@
return "application/vnd.ms-powerpoint";
}
- protected String extractText(POIFSFileSystem filesystem) throws IOException {
- StringBuilder builder = new StringBuilder();
+ protected void extractText(POIFSFileSystem filesystem, Appendable builder) throws IOException {
InputStream stream = filesystem.createDocumentInputStream(POWERPOINT);
try {
@@ -44,8 +43,6 @@
} finally {
stream.close();
}
-
- return builder.toString();
}
}
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Word6Extractor.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Word6Extractor.java?rev=598075&r1=598074&r2=598075&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Word6Extractor.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Word6Extractor.java Sun Nov 25 14:52:09 2007
@@ -33,8 +33,11 @@
class Word6Extractor
{
- public Word6Extractor()
+ private final Appendable appendable;
+
+ public Word6Extractor(Appendable appendable)
{
+ this.appendable = appendable;
}
/**
@@ -45,7 +48,7 @@
* @return The text from the document
* @throws Exception If there are any unexpected exceptions.
*/
- public String extractText(byte[] mainStream) throws IOException {
+ public void extractText(byte[] mainStream) throws IOException {
int fcMin = LittleEndian.getInt(mainStream, 0x18);
int fcMax = LittleEndian.getInt(mainStream, 0x1C);
@@ -58,7 +61,7 @@
List textRuns = chpTable.getTextRuns();
// iterate through the
- WordTextBuffer finalTextBuf = new WordTextBuffer();
+ WordTextBuffer finalTextBuf = new WordTextBuffer(appendable);
Iterator runsIt = textRuns.iterator();
while(runsIt.hasNext())
{
@@ -76,8 +79,6 @@
}
}
}
-
- return finalTextBuf.toString();
}
/**
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java?rev=598075&r1=598074&r2=598075&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java Sun Nov 25 14:52:09 2007
@@ -47,7 +47,7 @@
*
* @param in The InputStream representing the Word file.
*/
- public String extractText(POIFSFileSystem fsys)
+ public void extractText(POIFSFileSystem fsys, Appendable appendable)
throws IOException, TikaException {
// load our POIFS document streams.
DocumentEntry headerProps =
@@ -74,8 +74,8 @@
case 103:
case 104:
// this is a Word 6.0 doc send it to the extractor for that version.
- Word6Extractor oldExtractor = new Word6Extractor();
- return oldExtractor.extractText(header);
+ Word6Extractor oldExtractor = new Word6Extractor(appendable);
+ oldExtractor.extractText(header);
}
//get the location of the piece table
@@ -123,7 +123,7 @@
int currentTextStart = currentPiece.getStart();
int currentTextEnd = currentPiece.getEnd();
- WordTextBuffer finalTextBuf = new WordTextBuffer();
+ WordTextBuffer finalTextBuf = new WordTextBuffer(appendable);
// iterate through all text runs extract the text only if they haven't been
// deleted
@@ -157,7 +157,7 @@
runStart = currentTextStart;
currentTextEnd = currentPiece.getEnd ();
} else {
- return finalTextBuf.toString();
+ return;
}
}
String str = currentPiece.substring(0, runEnd - currentTextStart);
@@ -172,7 +172,6 @@
finalTextBuf.append(str);
}
}
- return finalTextBuf.toString();
}
/**
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordTextBuffer.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordTextBuffer.java?rev=598075&r1=598074&r2=598075&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordTextBuffer.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordTextBuffer.java Sun Nov 25 14:52:09 2007
@@ -22,16 +22,16 @@
*/
public class WordTextBuffer
{
- StringBuffer _buf;
+ Appendable _buf;
boolean _hold;
- public WordTextBuffer()
+ public WordTextBuffer(Appendable appendable)
{
- _buf = new StringBuffer();
+ _buf = appendable;
_hold = false;
}
- public void append(String text)
+ public void append(String text) throws java.io.IOException
{
char[] letters = text.toCharArray();
for (int x = 0; x < letters.length; x++)
@@ -55,11 +55,6 @@
break;
}
}
- }
-
- public String toString()
- {
- return _buf.toString();
}
}
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java?rev=598075&r1=598074&r2=598075&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java Sun Nov 25 14:52:09 2007
@@ -31,6 +31,7 @@
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.xml.XMLParser;
+import org.apache.tika.sax.AppendableAdaptor;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.log4j.Logger;
@@ -101,7 +102,9 @@
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
- xhtml.element("p", xp.concatOccurrence(xmlDoc, "//*", " "));
+ xhtml.startElement("p");
+ xp.concatOccurrence(xmlDoc, "//*", " ", new AppendableAdaptor(xhtml));
+ xhtml.endElement("p");
xhtml.endDocument();
}
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java?rev=598075&r1=598074&r2=598075&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java Sun Nov 25 14:52:09 2007
@@ -25,6 +25,7 @@
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.AppendableAdaptor;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.Utils;
@@ -70,13 +71,14 @@
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
- xhtml.element("p", concatOccurrence(xmlDoc, "//*", " "));
+ xhtml.startElement("p");
+ concatOccurrence(xmlDoc, "//*", " ", new AppendableAdaptor(xhtml));
+ xhtml.endElement("p");
xhtml.endDocument();
}
- public String concatOccurrence(Object xmlDoc, String xpath, String concatSep) {
+ public void concatOccurrence(Object xmlDoc, String xpath, String concatSep, Appendable chaineConcat) throws IOException {
- StringBuilder chaineConcat = new StringBuilder();
try {
JDOMXPath xp = new JDOMXPath(xpath);
List ls = xp.selectNodes(xmlDoc);
@@ -108,7 +110,7 @@
if (StringUtils.isNotEmpty(text)) {
chaineConcat.append(text);
if (ls.size() == 1) {
- return chaineConcat.toString().trim();
+ return;
} else {
if (ls.size() != j) {
chaineConcat.append(' ')
@@ -121,7 +123,6 @@
} catch (JaxenException j) {
logger.error(j.getMessage());
}
- return chaineConcat.toString().trim();
}
public List getAllDocumentNs(org.jdom.Document doc) {