You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2007/09/25 13:33:25 UTC
svn commit: r579207 - in
/incubator/tika/trunk/src/main/java/org/apache/tika/parser: ./ html/
msexcel/ mspowerpoint/ msword/ opendocument/ pdf/ rtf/ txt/ xml/
Author: jukka
Date: Tue Sep 25 04:33:23 2007
New Revision: 579207
URL: http://svn.apache.org/viewvc?rev=579207&view=rev
Log:
TIKA-26 - Implemented Parser.getStrContent() in the base class
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java?rev=579207&r1=579206&r2=579207&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java Tue Sep 25 04:33:23 2007
@@ -23,8 +23,6 @@
/**
* Abstract class Parser
- *
- *
*/
public abstract class Parser {
@@ -36,6 +34,8 @@
private List<Content> contents;
+ protected String contentStr;
+
public void setInputStream(InputStream is) {
this.is = is;
}
@@ -69,14 +69,17 @@
/**
* Get the string content of the document
*/
- public abstract String getStrContent();
+ public String getStrContent() {
+ getContents();
+ return contentStr;
+ }
/**
* Get a content object, this object is configured from the TikaConfig Xml.
* It could be a document metadata, XPath selection, regex selection or
* fulltext
*/
- public final Content getContent(String name) {
+ public Content getContent(String name) {
for (Content content : getContents()) {
if (name.equals(content.getName())) {
return content;
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=579207&r1=579206&r2=579207&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java Tue Sep 25 04:33:23 2007
@@ -18,10 +18,8 @@
import java.io.InputStream;
import java.util.ArrayList;
-import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
-import java.util.Map;
import org.apache.log4j.Logger;
import org.apache.oro.text.regex.MalformedPatternException;
@@ -44,11 +42,11 @@
private Node root = null;
- private String contentStr;
-
public List<Content> getContents() {
if (contentStr == null) {
- contentStr = getStrContent();
+ if (root == null)
+ root = getRoot(getInputStream());
+ contentStr = getTextContent(root);
}
List<Content> ctt = super.getContents();
@@ -84,13 +82,6 @@
return ctt;
- }
-
- public String getStrContent() {
- if (root == null)
- root = getRoot(getInputStream());
- contentStr = getTextContent(root);
- return contentStr;
}
private Node getRoot(InputStream is) {
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java?rev=579207&r1=579206&r2=579207&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java Tue Sep 25 04:33:23 2007
@@ -16,10 +16,8 @@
*/
package org.apache.tika.parser.msexcel;
-import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
-import java.util.Map;
import org.apache.tika.config.Content;
import org.apache.tika.parser.Parser;
@@ -37,13 +35,17 @@
public class MsExcelParser extends Parser {
private MSExtractor extrator = new ExcelExtractor();
- private String contentStr;
-
static Logger logger = Logger.getRootLogger();
public List<Content> getContents() {
if (contentStr == null) {
- contentStr = getStrContent();
+ // extrator.setContents(getParserConfig().getContents());
+ try {
+ contentStr = extrator.extractText(getInputStream());
+ } catch (Exception e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
}
List<Content> ctt = super.getContents();
Iterator i = ctt.iterator();
@@ -71,14 +73,4 @@
return ctt;
}
- public String getStrContent() {
- // extrator.setContents(getParserConfig().getContents());
- try {
- contentStr = extrator.extractText(getInputStream());
- } catch (Exception e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- return contentStr;
- }
}
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java?rev=579207&r1=579206&r2=579207&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java Tue Sep 25 04:33:23 2007
@@ -37,13 +37,17 @@
private PPTExtractor extrator = new PPTExtractor();
- private String contentStr;
-
static Logger logger = Logger.getRootLogger();
public List<Content> getContents() {
if (contentStr == null) {
- contentStr = getStrContent();
+ extrator.setContents(super.getContents());
+ try {
+ contentStr = extrator.extractText(getInputStream());
+ } catch (Exception e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
}
List<Content> ctt = super.getContents();
Iterator i = ctt.iterator();
@@ -78,16 +82,5 @@
* Auto-generated catch block e.printStackTrace(); } return
* getParserConfig().getContents(); }
*/
-
- public String getStrContent() {
- extrator.setContents(super.getContents());
- try {
- contentStr = extrator.extractText(getInputStream());
- } catch (Exception e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- return contentStr;
- }
}
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java?rev=579207&r1=579206&r2=579207&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java Tue Sep 25 04:33:23 2007
@@ -38,13 +38,17 @@
private MSExtractor extractor = new WordExtractor();
- private String contentStr;
-
static Logger logger = Logger.getRootLogger();
public List<Content> getContents() {
if (contentStr == null) {
- contentStr = getStrContent();
+ // extractor
+ try {
+ contentStr = extractor.extractText(getInputStream());
+ } catch (Exception e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
}
List<Content> ctt = super.getContents();
Iterator i = ctt.iterator();
@@ -71,17 +75,6 @@
return ctt;
- }
-
- public String getStrContent() {
- // extractor
- try {
- contentStr = extractor.extractText(getInputStream());
- } catch (Exception e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- return contentStr;
}
}
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java?rev=579207&r1=579206&r2=579207&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java Tue Sep 25 04:33:23 2007
@@ -23,10 +23,8 @@
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
-import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
-import java.util.Map;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
@@ -34,7 +32,6 @@
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.xml.XMLParser;
import org.apache.tika.utils.RegexUtils;
-import org.apache.tika.utils.Utils;
import org.apache.log4j.Logger;
import org.apache.oro.text.regex.MalformedPatternException;
@@ -58,8 +55,6 @@
private org.jdom.Document xmlDoc;
- private String contentStr;
-
public org.jdom.Document parse(InputStream is) {
xmlDoc = new org.jdom.Document();
org.jdom.Document xmlMeta = new org.jdom.Document();
@@ -88,11 +83,11 @@
}
public List<Content> getContents() {
+ if (xmlDoc == null)
+ xmlDoc = parse(getInputStream());
if (contentStr == null) {
- contentStr = getStrContent();
+ contentStr = xp.concatOccurance(xmlDoc, "//*", " ");
}
- if (xmlDoc == null)
- xmlDoc = Utils.parse(getInputStream());
List<String> documentNs = xp.getAllDocumentNs(xmlDoc);
List<Content> ctt = super.getContents();
Iterator it = ctt.iterator();
@@ -115,13 +110,6 @@
}
return ctt;
- }
-
- public String getStrContent() {
- if (xmlDoc == null)
- xmlDoc = parse(getInputStream());
- contentStr = xp.concatOccurance(xmlDoc, "//*", " ");
- return contentStr;
}
public List unzip(InputStream is) {
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=579207&r1=579206&r2=579207&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java Tue Sep 25 04:33:23 2007
@@ -18,10 +18,8 @@
import java.io.IOException;
import java.io.StringWriter;
-import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
-import java.util.Map;
import org.apache.tika.config.Content;
import org.apache.tika.parser.Parser;
@@ -43,45 +41,36 @@
public class PDFParser extends Parser {
static Logger logger = Logger.getRootLogger();
- private String contentStr = "";
-
private PDDocument pdfDocument = null;
- public String getStrContent() {
-
- try {
- pdfDocument = PDDocument.load(getInputStream());
- if (pdfDocument.isEncrypted()) {
- pdfDocument.decrypt("");
- }
- StringWriter writer = new StringWriter();
- PDFTextStripper stripper = new PDFTextStripper();
- stripper.writeText(pdfDocument, writer);
- contentStr = writer.getBuffer().toString();
- } catch (CryptographyException e) {
- logger.error(e.getMessage());
- } catch (IOException e) {
- e.printStackTrace();
- logger.error(e.getMessage());
- } catch (InvalidPasswordException e) {
- logger.error(e.getMessage());
- } finally {
- if (pdfDocument != null) {
- try {
- pdfDocument.close();
- } catch (IOException ex) {
- logger.error(ex.getMessage());
- }
- }
- }
- return contentStr;
- }
-
public List<Content> getContents() {
-
// String contents = getContent();
if (contentStr == null) {
- contentStr = getStrContent();
+ try {
+ pdfDocument = PDDocument.load(getInputStream());
+ if (pdfDocument.isEncrypted()) {
+ pdfDocument.decrypt("");
+ }
+ StringWriter writer = new StringWriter();
+ PDFTextStripper stripper = new PDFTextStripper();
+ stripper.writeText(pdfDocument, writer);
+ contentStr = writer.getBuffer().toString();
+ } catch (CryptographyException e) {
+ logger.error(e.getMessage());
+ } catch (IOException e) {
+ e.printStackTrace();
+ logger.error(e.getMessage());
+ } catch (InvalidPasswordException e) {
+ logger.error(e.getMessage());
+ } finally {
+ if (pdfDocument != null) {
+ try {
+ pdfDocument.close();
+ } catch (IOException ex) {
+ logger.error(ex.getMessage());
+ }
+ }
+ }
}
List<Content> ctt = super.getContents();
Iterator i = ctt.iterator();
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java?rev=579207&r1=579206&r2=579207&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java Tue Sep 25 04:33:23 2007
@@ -17,10 +17,8 @@
package org.apache.tika.parser.rtf;
import java.io.IOException;
-import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
-import java.util.Map;
import javax.swing.text.BadLocationException;
import javax.swing.text.DefaultStyledDocument;
@@ -42,11 +40,18 @@
static Logger logger = Logger.getRootLogger();
- private String contentStr;
-
public List<Content> getContents() {
if (contentStr == null) {
- contentStr = getStrContent();
+ try {
+ DefaultStyledDocument sd = new DefaultStyledDocument();
+ RTFEditorKit kit = new RTFEditorKit();
+ kit.read(getInputStream(), sd, 0);
+ contentStr = sd.getText(0, sd.getLength());
+ } catch (IOException e) {
+ logger.error(e.getMessage());
+ } catch (BadLocationException j) {
+ logger.error(j.getMessage());
+ }
}
List<Content> ctt = super.getContents();
Iterator i = ctt.iterator();
@@ -72,21 +77,6 @@
}
return ctt;
- }
-
- @Override
- public String getStrContent() {
- try {
- DefaultStyledDocument sd = new DefaultStyledDocument();
- RTFEditorKit kit = new RTFEditorKit();
- kit.read(getInputStream(), sd, 0);
- contentStr = sd.getText(0, sd.getLength());
- } catch (IOException e) {
- logger.error(e.getMessage());
- } catch (BadLocationException j) {
- logger.error(j.getMessage());
- }
- return contentStr;
}
}
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java?rev=579207&r1=579206&r2=579207&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java Tue Sep 25 04:33:23 2007
@@ -41,11 +41,29 @@
static Logger logger = Logger.getRootLogger();
- private String contentStr;
-
public List<Content> getContents() {
if (contentStr == null) {
- contentStr = getStrContent();
+ StringBuffer sb = new StringBuffer();
+ try {
+ BufferedReader br = new BufferedReader(new InputStreamReader(
+ getInputStream()));
+ String line = null;
+ while ((line = br.readLine()) != null) {
+ sb.append(line);
+ sb.append(" ");
+ }
+ } catch (FileNotFoundException ex) {
+ logger.error(ex.getMessage());
+ } catch (IOException ex1) {
+ logger.error(ex1.getMessage());
+ } finally {
+ try {
+ getInputStream().close();
+ } catch (IOException e) {
+ logger.error(e.getMessage());
+ }
+ }
+ contentStr = sb.toString();
}
List<Content> ctt = super.getContents();
Iterator i = ctt.iterator();
@@ -72,32 +90,6 @@
return ctt;
- }
-
- @Override
- public String getStrContent() {
- StringBuffer sb = new StringBuffer();
- try {
- BufferedReader br = new BufferedReader(new InputStreamReader(
- getInputStream()));
- String line = null;
- while ((line = br.readLine()) != null) {
- sb.append(line);
- sb.append(" ");
- }
- } catch (FileNotFoundException ex) {
- logger.error(ex.getMessage());
- } catch (IOException ex1) {
- logger.error(ex1.getMessage());
- } finally {
- try {
- getInputStream().close();
- } catch (IOException e) {
- logger.error(e.getMessage());
- }
- }
- contentStr = sb.toString();
- return contentStr;
}
}
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java?rev=579207&r1=579206&r2=579207&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java Tue Sep 25 04:33:23 2007
@@ -17,10 +17,8 @@
package org.apache.tika.parser.xml;
import java.util.ArrayList;
-import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
-import java.util.Map;
import org.apache.tika.config.Content;
import org.apache.tika.parser.Parser;
@@ -54,18 +52,11 @@
private SimpleNamespaceContext nsc = new SimpleNamespaceContext();
- private String contentStr;
-
- public String getStrContent() {
- if (xmlDoc == null)
- xmlDoc = Utils.parse(getInputStream());
- contentStr = concatOccurance(xmlDoc, "//*", " ");
- return contentStr;
- }
-
public List<Content> getContents() {
if (contentStr == null) {
- contentStr = getStrContent();
+ if (xmlDoc == null)
+ xmlDoc = Utils.parse(getInputStream());
+ contentStr = concatOccurance(xmlDoc, "//*", " ");
}
if (xmlDoc == null)
xmlDoc = Utils.parse(getInputStream());