You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2007/09/25 00:55:11 UTC

svn commit: r579006 - in /incubator/tika/trunk/src/main/java/org/apache/tika/parser: ./ html/ msexcel/ mspowerpoint/ msword/ opendocument/ pdf/ rtf/ txt/ xml/

Author: jukka
Date: Mon Sep 24 15:55:03 2007
New Revision: 579006

URL: http://svn.apache.org/viewvc?rev=579006&view=rev
Log:
TIKA-26 - Implemented Parser.getContent(String) in the base class

Modified:
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java?rev=579006&r1=579005&r2=579006&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java Mon Sep 24 15:55:03 2007
@@ -76,7 +76,14 @@
      * It could be a document metadata, XPath selection, regex selection or
      * fulltext
      */
-    public abstract Content getContent(String name);
+    public final Content getContent(String name) {
+        for (Content content : getContents()) {
+            if (name.equals(content.getName())) {
+                return content;
+            }
+        }
+        return null;
+    }
 
     /**
      * Get a List of contents objects, this objects are configured from the

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=579006&r1=579005&r2=579006&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java Mon Sep 24 15:55:03 2007
@@ -46,22 +46,11 @@
 
     private String contentStr;
 
-    private Map<String, Content> contentsMap;
-
-    public Content getContent(String name) {
-        if (contentsMap == null || contentsMap.isEmpty()) {
-            getContents();
-        }
-        return contentsMap.get(name);
-    }
-
     public List<Content> getContents() {
         if (contentStr == null) {
             contentStr = getStrContent();
         }
         List<Content> ctt = super.getContents();
-        contentsMap = new HashMap<String, Content>();
-
 
         if (ctt == null) {
             return new ArrayList<Content>(0);
@@ -91,7 +80,6 @@
                     logger.error(e.getMessage());
                 }
             }
-            contentsMap.put(ct.getName(), ct);
         }
 
         return ctt;

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java?rev=579006&r1=579005&r2=579006&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java Mon Sep 24 15:55:03 2007
@@ -39,23 +39,13 @@
 
     private String contentStr;
 
-    private Map<String, Content> contentsMap;
-
     static Logger logger = Logger.getRootLogger();
 
-    public Content getContent(String name) {
-        if (contentsMap == null || contentsMap.isEmpty()) {
-            getContents();
-        }
-        return contentsMap.get(name);
-    }
-
     public List<Content> getContents() {
         if (contentStr == null) {
             contentStr = getStrContent();
         }
         List<Content> ctt = super.getContents();
-        contentsMap = new HashMap<String, Content>();
         Iterator i = ctt.iterator();
         while (i.hasNext()) {
             Content ct = (Content) i.next();
@@ -76,7 +66,6 @@
                     logger.error(e.getMessage());
                 }
             }
-            contentsMap.put(ct.getName(), ct);
         }
 
         return ctt;

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java?rev=579006&r1=579005&r2=579006&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java Mon Sep 24 15:55:03 2007
@@ -39,23 +39,13 @@
 
     private String contentStr;
 
-    private Map<String, Content> contentsMap;
-
     static Logger logger = Logger.getRootLogger();
 
-    public Content getContent(String name) {
-        if (contentsMap == null || contentsMap.isEmpty()) {
-            getContents();
-        }
-        return contentsMap.get(name);
-    }
-
     public List<Content> getContents() {
         if (contentStr == null) {
             contentStr = getStrContent();
         }
         List<Content> ctt = super.getContents();
-        contentsMap = new HashMap<String, Content>();
         Iterator i = ctt.iterator();
         while (i.hasNext()) {
             Content ct = (Content) i.next();
@@ -76,7 +66,6 @@
                     logger.error(e.getMessage());
                 }
             }
-            contentsMap.put(ct.getName(), ct);
         }
 
         return ctt;

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java?rev=579006&r1=579005&r2=579006&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java Mon Sep 24 15:55:03 2007
@@ -40,23 +40,13 @@
 
     private String contentStr;
 
-    private Map<String, Content> contentsMap;
-
     static Logger logger = Logger.getRootLogger();
 
-    public Content getContent(String name) {
-        if (contentsMap == null || contentsMap.isEmpty()) {
-            getContents();
-        }
-        return contentsMap.get(name);
-    }
-
     public List<Content> getContents() {
         if (contentStr == null) {
             contentStr = getStrContent();
         }
         List<Content> ctt = super.getContents();
-        contentsMap = new HashMap<String, Content>();
         Iterator i = ctt.iterator();
         while (i.hasNext()) {
             Content ct = (Content) i.next();
@@ -77,7 +67,6 @@
                     logger.error(e.getMessage());
                 }
             }
-            contentsMap.put(ct.getName(), ct);
         }
 
         return ctt;

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java?rev=579006&r1=579005&r2=579006&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java Mon Sep 24 15:55:03 2007
@@ -58,8 +58,6 @@
 
     private org.jdom.Document xmlDoc;
 
-    private Map<String, Content> contentsMap;
-
     private String contentStr;
 
     public org.jdom.Document parse(InputStream is) {
@@ -98,12 +96,10 @@
         List<String> documentNs = xp.getAllDocumentNs(xmlDoc);
         List<Content> ctt = super.getContents();
         Iterator it = ctt.iterator();
-        contentsMap = new HashMap<String, Content>();
-
         while (it.hasNext()) {
             Content content = (Content) it.next();
             if (content.getXPathSelect() != null) {
-                xp.extractContent(xmlDoc, content, contentsMap);
+                xp.extractContent(xmlDoc, content);
             } else if (content.getRegexSelect() != null) {
                 try {
                     List<String> valuesLs = RegexUtils.extract(contentStr,
@@ -152,13 +148,6 @@
             logger.error(e.getMessage());
         }
         return res;
-    }
-
-    public Content getContent(String name) {
-        if (contentsMap == null || contentsMap.isEmpty()) {
-            getContents();
-        }
-        return contentsMap.get(name);
     }
 
     protected void copyInputStream(InputStream in, OutputStream out)

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=579006&r1=579005&r2=579006&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java Mon Sep 24 15:55:03 2007
@@ -47,8 +47,6 @@
 
     private PDDocument pdfDocument = null;
 
-    private Map<String, Content> contentsMap;
-
     public String getStrContent() {
 
         try {
@@ -86,7 +84,6 @@
             contentStr = getStrContent();
         }
         List<Content> ctt = super.getContents();
-        contentsMap = new HashMap<String, Content>();
         Iterator i = ctt.iterator();
         while (i.hasNext()) {
             Content ct = (Content) i.next();
@@ -180,17 +177,9 @@
                     logger.error(e.getMessage());
                 }
             }
-            contentsMap.put(ct.getName(), ct);
         }
 
         return ctt;
-    }
-
-    public Content getContent(String name) {
-        if (contentsMap == null || contentsMap.isEmpty()) {
-            getContents();
-        }
-        return contentsMap.get(name);
     }
 
 }

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java?rev=579006&r1=579005&r2=579006&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java Mon Sep 24 15:55:03 2007
@@ -42,23 +42,13 @@
 
     static Logger logger = Logger.getRootLogger();
 
-    private Map<String, Content> contentsMap;
-
     private String contentStr;
 
-    public Content getContent(String name) {
-        if (contentsMap == null || contentsMap.isEmpty()) {
-            getContents();
-        }
-        return contentsMap.get(name);
-    }
-
     public List<Content> getContents() {
         if (contentStr == null) {
             contentStr = getStrContent();
         }
         List<Content> ctt = super.getContents();
-        contentsMap = new HashMap<String, Content>();
         Iterator i = ctt.iterator();
         while (i.hasNext()) {
             Content ct = (Content) i.next();
@@ -79,7 +69,6 @@
                     logger.error(e.getMessage());
                 }
             }
-            contentsMap.put(ct.getName(), ct);
         }
 
         return ctt;

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java?rev=579006&r1=579005&r2=579006&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java Mon Sep 24 15:55:03 2007
@@ -41,23 +41,13 @@
 
     static Logger logger = Logger.getRootLogger();
 
-    private Map<String, Content> contentsMap;
-
     private String contentStr;
 
-    public Content getContent(String name) {
-        if (contentsMap == null || contentsMap.isEmpty()) {
-            getContents();
-        }
-        return contentsMap.get(name);
-    }
-
     public List<Content> getContents() {
         if (contentStr == null) {
             contentStr = getStrContent();
         }
         List<Content> ctt = super.getContents();
-        contentsMap = new HashMap<String, Content>();
         Iterator i = ctt.iterator();
         while (i.hasNext()) {
             Content ct = (Content) i.next();
@@ -78,7 +68,6 @@
                     logger.error(e.getMessage());
                 }
             }
-            contentsMap.put(ct.getName(), ct);
         }
 
         return ctt;

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java?rev=579006&r1=579005&r2=579006&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java Mon Sep 24 15:55:03 2007
@@ -54,17 +54,8 @@
 
     private SimpleNamespaceContext nsc = new SimpleNamespaceContext();
 
-    private Map<String, Content> contentsMap;
-
     private String contentStr;
 
-    public Content getContent(String name) {
-        if (contentsMap == null || contentsMap.isEmpty()) {
-            getContents();
-        }
-        return contentsMap.get(name);
-    }
-
     public String getStrContent() {
         if (xmlDoc == null)
             xmlDoc = Utils.parse(getInputStream());
@@ -81,12 +72,11 @@
         List<String> documentNs = getAllDocumentNs(xmlDoc);
         List<Content> ctt = super.getContents();
         Iterator it = ctt.iterator();
-        contentsMap = new HashMap<String, Content>();
         if (exist(documentNs, getNamespace())) {
             while (it.hasNext()) {
                 Content content = (Content) it.next();
                 if (content.getXPathSelect() != null) {
-                    extractContent(xmlDoc, content, contentsMap);
+                    extractContent(xmlDoc, content);
                 } else if (content.getRegexSelect() != null) {
                     try {
                         List<String> valuesLs = RegexUtils.extract(contentStr,
@@ -201,8 +191,7 @@
         }
     }
 
-    public void extractContent(Document xmlDoc, Content content,
-            Map<String, Content> contentsMap) {
+    public void extractContent(Document xmlDoc, Content content) {
         try {
             JDOMXPath xp = new JDOMXPath(content.getXPathSelect());
             xp.setNamespaceContext(nsc);
@@ -243,7 +232,6 @@
             if (values.length > 0) {
                 content.setValue(values[0]);
                 content.setValues(values);
-                contentsMap.put(content.getName(), content);
             }
         } catch (JaxenException e) {
             logger.error(e.getMessage());