You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2007/09/21 16:57:18 UTC

svn commit: r578157 - in /incubator/tika/trunk: ./ src/main/java/org/apache/tika/parser/ src/main/java/org/apache/tika/parser/html/ src/main/java/org/apache/tika/parser/msexcel/ src/main/java/org/apache/tika/parser/mspowerpoint/ src/main/java/org/apach...

Author: jukka
Date: Fri Sep 21 07:57:17 2007
New Revision: 578157

URL: http://svn.apache.org/viewvc?rev=578157&view=rev
Log:
TIKA-23 - Decouple Parser from ParserConfig

Modified:
    incubator/tika/trunk/CHANGES.txt
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserFactory.java
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java

Modified: incubator/tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=578157&r1=578156&r2=578157&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Fri Sep 21 07:57:17 2007
@@ -27,3 +27,4 @@
 
 12. TIKA-18 - "Office" interface should be renamed "MSOffice" (mattmann)
 
+13. TIKA-23 - Decouple Parser from ParserConfig (jukka)

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java?rev=578157&r1=578156&r2=578157&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java Fri Sep 21 07:57:17 2007
@@ -20,9 +20,6 @@
 import java.util.List;
 
 import org.apache.tika.config.Content;
-import org.apache.tika.config.LiusConfig;
-import org.apache.tika.config.ParserConfig;
-import org.apache.tika.exception.LiusException;
 
 /**
  * Abstract class Parser
@@ -33,10 +30,12 @@
 
     private InputStream is;
 
-    private ParserConfig pc;
-
     private String mimeType;
 
+    private String namespace;
+
+    private List<Content> contents;
+
     public void setInputStream(InputStream is) {
         this.is = is;
     }
@@ -46,23 +45,6 @@
     }
 
     /**
-     * Configure parsers from mimetypes
-     */
-    public void configure(LiusConfig config) throws LiusException {
-        pc = config.getParserConfig(getMimeType());
-        if (pc.equals(null)) {
-            throw new LiusException("Please Configure your parser ");
-        }
-    }
-
-    /**
-     * Return parser specific config
-     */
-    public ParserConfig getParserConfig() {
-        return pc;
-    }
-
-    /**
      * Get document mime type
      */
     public String getMimeType() {
@@ -76,6 +58,14 @@
         this.mimeType = mimeType;
     }
 
+    public String getNamespace() {
+        return namespace;
+    }
+
+    public void setNamespace(String namespace) {
+        this.namespace = namespace;
+    }
+
     /**
      * Get the string content of the document
      */
@@ -93,6 +83,12 @@
      * LiusConfig Xml file. It could be a document metadata, XPath selection,
      * regex selection or fulltext
      */
-    public abstract List<Content> getContents();
+    public List<Content> getContents() {
+        return contents;
+    }
+
+    public void setContents(List<Content> contents) {
+        this.contents = contents;
+    }
 
 }

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserFactory.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserFactory.java?rev=578157&r1=578156&r2=578157&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserFactory.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserFactory.java Fri Sep 21 07:57:17 2007
@@ -76,9 +76,9 @@
                 logger.error(e.getMessage());
             }
             parser.setMimeType(mimeType);
-            parser.configure(tc);
+            parser.setNamespace(pc.getNameSpace());
+            parser.setContents(pc.getContents());
             parser.setInputStream(new FileInputStream(file));
-
         }
 
         return parser;

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=578157&r1=578156&r2=578157&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java Fri Sep 21 07:57:17 2007
@@ -59,7 +59,7 @@
         if (contentStr == null) {
             contentStr = getStrContent();
         }
-        List<Content> ctt = getParserConfig().getContents();
+        List<Content> ctt = super.getContents();
         contentsMap = new HashMap<String, Content>();
         Iterator i = ctt.iterator();
         while (i.hasNext()) {
@@ -88,7 +88,7 @@
             contentsMap.put(ct.getName(), ct);
         }
 
-        return getParserConfig().getContents();
+        return ctt;
 
     }
 

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java?rev=578157&r1=578156&r2=578157&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java Fri Sep 21 07:57:17 2007
@@ -54,7 +54,7 @@
         if (contentStr == null) {
             contentStr = getStrContent();
         }
-        List<Content> ctt = getParserConfig().getContents();
+        List<Content> ctt = super.getContents();
         contentsMap = new HashMap<String, Content>();
         Iterator i = ctt.iterator();
         while (i.hasNext()) {
@@ -79,8 +79,7 @@
             contentsMap.put(ct.getName(), ct);
         }
 
-        return getParserConfig().getContents();
-
+        return ctt;
     }
 
     public String getStrContent() {

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java?rev=578157&r1=578156&r2=578157&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java Fri Sep 21 07:57:17 2007
@@ -54,7 +54,7 @@
         if (contentStr == null) {
             contentStr = getStrContent();
         }
-        List<Content> ctt = getParserConfig().getContents();
+        List<Content> ctt = super.getContents();
         contentsMap = new HashMap<String, Content>();
         Iterator i = ctt.iterator();
         while (i.hasNext()) {
@@ -79,8 +79,7 @@
             contentsMap.put(ct.getName(), ct);
         }
 
-        return getParserConfig().getContents();
-
+        return ctt;
     }
 
     /*
@@ -92,7 +91,7 @@
      */
 
     public String getStrContent() {
-        extrator.setContents(getParserConfig().getContents());
+        extrator.setContents(super.getContents());
         try {
             contentStr = extrator.extractText(getInputStream());
         } catch (Exception e) {

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java?rev=578157&r1=578156&r2=578157&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java Fri Sep 21 07:57:17 2007
@@ -55,7 +55,7 @@
         if (contentStr == null) {
             contentStr = getStrContent();
         }
-        List<Content> ctt = getParserConfig().getContents();
+        List<Content> ctt = super.getContents();
         contentsMap = new HashMap<String, Content>();
         Iterator i = ctt.iterator();
         while (i.hasNext()) {
@@ -80,7 +80,7 @@
             contentsMap.put(ct.getName(), ct);
         }
 
-        return getParserConfig().getContents();
+        return ctt;
 
     }
 

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java?rev=578157&r1=578156&r2=578157&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java Fri Sep 21 07:57:17 2007
@@ -97,7 +97,7 @@
         if (xmlDoc == null)
             xmlDoc = Utils.parse(getInputStream());
         List<String> documentNs = xp.getAllDocumentNs(xmlDoc);
-        List<Content> ctt = getParserConfig().getContents();
+        List<Content> ctt = super.getContents();
         Iterator it = ctt.iterator();
         contentsMap = new HashMap<String, Content>();
 
@@ -117,10 +117,9 @@
                     logger.error(e.getMessage());
                 }
             }
-
         }
-        return getParserConfig().getContents();
 
+        return ctt;
     }
 
     public String getStrContent() {

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=578157&r1=578156&r2=578157&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java Fri Sep 21 07:57:17 2007
@@ -85,7 +85,7 @@
         if (contentStr == null) {
             contentStr = getStrContent();
         }
-        List<Content> ctt = getParserConfig().getContents();
+        List<Content> ctt = super.getContents();
         contentsMap = new HashMap<String, Content>();
         Iterator i = ctt.iterator();
         while (i.hasNext()) {
@@ -183,7 +183,7 @@
             contentsMap.put(ct.getName(), ct);
         }
 
-        return getParserConfig().getContents();
+        return ctt;
     }
 
     public Content getContent(String name) {

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java?rev=578157&r1=578156&r2=578157&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java Fri Sep 21 07:57:17 2007
@@ -57,7 +57,7 @@
         if (contentStr == null) {
             contentStr = getStrContent();
         }
-        List<Content> ctt = getParserConfig().getContents();
+        List<Content> ctt = super.getContents();
         contentsMap = new HashMap<String, Content>();
         Iterator i = ctt.iterator();
         while (i.hasNext()) {
@@ -82,8 +82,7 @@
             contentsMap.put(ct.getName(), ct);
         }
 
-        return getParserConfig().getContents();
-
+        return ctt;
     }
 
     @Override

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java?rev=578157&r1=578156&r2=578157&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java Fri Sep 21 07:57:17 2007
@@ -56,7 +56,7 @@
         if (contentStr == null) {
             contentStr = getStrContent();
         }
-        List<Content> ctt = getParserConfig().getContents();
+        List<Content> ctt = super.getContents();
         contentsMap = new HashMap<String, Content>();
         Iterator i = ctt.iterator();
         while (i.hasNext()) {
@@ -81,7 +81,7 @@
             contentsMap.put(ct.getName(), ct);
         }
 
-        return getParserConfig().getContents();
+        return ctt;
 
     }
 

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java?rev=578157&r1=578156&r2=578157&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java Fri Sep 21 07:57:17 2007
@@ -79,10 +79,10 @@
         if (xmlDoc == null)
             xmlDoc = Utils.parse(getInputStream());
         List<String> documentNs = getAllDocumentNs(xmlDoc);
-        List<Content> ctt = getParserConfig().getContents();
+        List<Content> ctt = super.getContents();
         Iterator it = ctt.iterator();
         contentsMap = new HashMap<String, Content>();
-        if (exist(documentNs, getParserConfig().getNameSpace())) {
+        if (exist(documentNs, getNamespace())) {
             while (it.hasNext()) {
                 Content content = (Content) it.next();
                 if (content.getXPathSelect() != null) {
@@ -99,10 +99,10 @@
                         logger.error(e.getMessage());
                     }
                 }
-
             }
         }
-        return getParserConfig().getContents();
+
+        return ctt;
     }
 
     public String concatOccurance(Object xmlDoc, String xpath, String concatSep) {