You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2007/09/21 16:57:18 UTC
svn commit: r578157 - in /incubator/tika/trunk: ./
src/main/java/org/apache/tika/parser/
src/main/java/org/apache/tika/parser/html/
src/main/java/org/apache/tika/parser/msexcel/
src/main/java/org/apache/tika/parser/mspowerpoint/ src/main/java/org/apach...
Author: jukka
Date: Fri Sep 21 07:57:17 2007
New Revision: 578157
URL: http://svn.apache.org/viewvc?rev=578157&view=rev
Log:
TIKA-23 - Decouple Parser from ParserConfig
Modified:
incubator/tika/trunk/CHANGES.txt
incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserFactory.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java
Modified: incubator/tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=578157&r1=578156&r2=578157&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Fri Sep 21 07:57:17 2007
@@ -27,3 +27,4 @@
12. TIKA-18 - "Office" interface should be renamed "MSOffice" (mattmann)
+13. TIKA-23 - Decouple Parser from ParserConfig (jukka)
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java?rev=578157&r1=578156&r2=578157&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java Fri Sep 21 07:57:17 2007
@@ -20,9 +20,6 @@
import java.util.List;
import org.apache.tika.config.Content;
-import org.apache.tika.config.LiusConfig;
-import org.apache.tika.config.ParserConfig;
-import org.apache.tika.exception.LiusException;
/**
* Abstract class Parser
@@ -33,10 +30,12 @@
private InputStream is;
- private ParserConfig pc;
-
private String mimeType;
+ private String namespace;
+
+ private List<Content> contents;
+
public void setInputStream(InputStream is) {
this.is = is;
}
@@ -46,23 +45,6 @@
}
/**
- * Configure parsers from mimetypes
- */
- public void configure(LiusConfig config) throws LiusException {
- pc = config.getParserConfig(getMimeType());
- if (pc.equals(null)) {
- throw new LiusException("Please Configure your parser ");
- }
- }
-
- /**
- * Return parser specific config
- */
- public ParserConfig getParserConfig() {
- return pc;
- }
-
- /**
* Get document mime type
*/
public String getMimeType() {
@@ -76,6 +58,14 @@
this.mimeType = mimeType;
}
+ public String getNamespace() {
+ return namespace;
+ }
+
+ public void setNamespace(String namespace) {
+ this.namespace = namespace;
+ }
+
/**
* Get the string content of the document
*/
@@ -93,6 +83,12 @@
* LiusConfig Xml file. It could be a document metadata, XPath selection,
* regex selection or fulltext
*/
- public abstract List<Content> getContents();
+ public List<Content> getContents() {
+ return contents;
+ }
+
+ public void setContents(List<Content> contents) {
+ this.contents = contents;
+ }
}
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserFactory.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserFactory.java?rev=578157&r1=578156&r2=578157&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserFactory.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserFactory.java Fri Sep 21 07:57:17 2007
@@ -76,9 +76,9 @@
logger.error(e.getMessage());
}
parser.setMimeType(mimeType);
- parser.configure(tc);
+ parser.setNamespace(pc.getNameSpace());
+ parser.setContents(pc.getContents());
parser.setInputStream(new FileInputStream(file));
-
}
return parser;
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=578157&r1=578156&r2=578157&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java Fri Sep 21 07:57:17 2007
@@ -59,7 +59,7 @@
if (contentStr == null) {
contentStr = getStrContent();
}
- List<Content> ctt = getParserConfig().getContents();
+ List<Content> ctt = super.getContents();
contentsMap = new HashMap<String, Content>();
Iterator i = ctt.iterator();
while (i.hasNext()) {
@@ -88,7 +88,7 @@
contentsMap.put(ct.getName(), ct);
}
- return getParserConfig().getContents();
+ return ctt;
}
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java?rev=578157&r1=578156&r2=578157&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java Fri Sep 21 07:57:17 2007
@@ -54,7 +54,7 @@
if (contentStr == null) {
contentStr = getStrContent();
}
- List<Content> ctt = getParserConfig().getContents();
+ List<Content> ctt = super.getContents();
contentsMap = new HashMap<String, Content>();
Iterator i = ctt.iterator();
while (i.hasNext()) {
@@ -79,8 +79,7 @@
contentsMap.put(ct.getName(), ct);
}
- return getParserConfig().getContents();
-
+ return ctt;
}
public String getStrContent() {
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java?rev=578157&r1=578156&r2=578157&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java Fri Sep 21 07:57:17 2007
@@ -54,7 +54,7 @@
if (contentStr == null) {
contentStr = getStrContent();
}
- List<Content> ctt = getParserConfig().getContents();
+ List<Content> ctt = super.getContents();
contentsMap = new HashMap<String, Content>();
Iterator i = ctt.iterator();
while (i.hasNext()) {
@@ -79,8 +79,7 @@
contentsMap.put(ct.getName(), ct);
}
- return getParserConfig().getContents();
-
+ return ctt;
}
/*
@@ -92,7 +91,7 @@
*/
public String getStrContent() {
- extrator.setContents(getParserConfig().getContents());
+ extrator.setContents(super.getContents());
try {
contentStr = extrator.extractText(getInputStream());
} catch (Exception e) {
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java?rev=578157&r1=578156&r2=578157&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java Fri Sep 21 07:57:17 2007
@@ -55,7 +55,7 @@
if (contentStr == null) {
contentStr = getStrContent();
}
- List<Content> ctt = getParserConfig().getContents();
+ List<Content> ctt = super.getContents();
contentsMap = new HashMap<String, Content>();
Iterator i = ctt.iterator();
while (i.hasNext()) {
@@ -80,7 +80,7 @@
contentsMap.put(ct.getName(), ct);
}
- return getParserConfig().getContents();
+ return ctt;
}
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java?rev=578157&r1=578156&r2=578157&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java Fri Sep 21 07:57:17 2007
@@ -97,7 +97,7 @@
if (xmlDoc == null)
xmlDoc = Utils.parse(getInputStream());
List<String> documentNs = xp.getAllDocumentNs(xmlDoc);
- List<Content> ctt = getParserConfig().getContents();
+ List<Content> ctt = super.getContents();
Iterator it = ctt.iterator();
contentsMap = new HashMap<String, Content>();
@@ -117,10 +117,9 @@
logger.error(e.getMessage());
}
}
-
}
- return getParserConfig().getContents();
+ return ctt;
}
public String getStrContent() {
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=578157&r1=578156&r2=578157&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java Fri Sep 21 07:57:17 2007
@@ -85,7 +85,7 @@
if (contentStr == null) {
contentStr = getStrContent();
}
- List<Content> ctt = getParserConfig().getContents();
+ List<Content> ctt = super.getContents();
contentsMap = new HashMap<String, Content>();
Iterator i = ctt.iterator();
while (i.hasNext()) {
@@ -183,7 +183,7 @@
contentsMap.put(ct.getName(), ct);
}
- return getParserConfig().getContents();
+ return ctt;
}
public Content getContent(String name) {
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java?rev=578157&r1=578156&r2=578157&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java Fri Sep 21 07:57:17 2007
@@ -57,7 +57,7 @@
if (contentStr == null) {
contentStr = getStrContent();
}
- List<Content> ctt = getParserConfig().getContents();
+ List<Content> ctt = super.getContents();
contentsMap = new HashMap<String, Content>();
Iterator i = ctt.iterator();
while (i.hasNext()) {
@@ -82,8 +82,7 @@
contentsMap.put(ct.getName(), ct);
}
- return getParserConfig().getContents();
-
+ return ctt;
}
@Override
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java?rev=578157&r1=578156&r2=578157&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java Fri Sep 21 07:57:17 2007
@@ -56,7 +56,7 @@
if (contentStr == null) {
contentStr = getStrContent();
}
- List<Content> ctt = getParserConfig().getContents();
+ List<Content> ctt = super.getContents();
contentsMap = new HashMap<String, Content>();
Iterator i = ctt.iterator();
while (i.hasNext()) {
@@ -81,7 +81,7 @@
contentsMap.put(ct.getName(), ct);
}
- return getParserConfig().getContents();
+ return ctt;
}
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java?rev=578157&r1=578156&r2=578157&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java Fri Sep 21 07:57:17 2007
@@ -79,10 +79,10 @@
if (xmlDoc == null)
xmlDoc = Utils.parse(getInputStream());
List<String> documentNs = getAllDocumentNs(xmlDoc);
- List<Content> ctt = getParserConfig().getContents();
+ List<Content> ctt = super.getContents();
Iterator it = ctt.iterator();
contentsMap = new HashMap<String, Content>();
- if (exist(documentNs, getParserConfig().getNameSpace())) {
+ if (exist(documentNs, getNamespace())) {
while (it.hasNext()) {
Content content = (Content) it.next();
if (content.getXPathSelect() != null) {
@@ -99,10 +99,10 @@
logger.error(e.getMessage());
}
}
-
}
}
- return getParserConfig().getContents();
+
+ return ctt;
}
public String concatOccurance(Object xmlDoc, String xpath, String concatSep) {