You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2007/09/25 13:48:42 UTC
svn commit: r579208 - in /incubator/tika/trunk: ./
src/main/java/org/apache/tika/config/ src/main/java/org/apache/tika/parser/
src/main/java/org/apache/tika/parser/html/
src/main/java/org/apache/tika/parser/msexcel/
src/main/java/org/apache/tika/parser...
Author: jukka
Date: Tue Sep 25 04:48:38 2007
New Revision: 579208
URL: http://svn.apache.org/viewvc?rev=579208&view=rev
Log:
TIKA-26 - Use Map<String, Content> instead of List<Content>
Modified:
incubator/tika/trunk/CHANGES.txt
incubator/tika/trunk/src/main/java/org/apache/tika/config/Content.java
incubator/tika/trunk/src/main/java/org/apache/tika/config/ParserConfig.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/utils/MSExtractor.java
incubator/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java
Modified: incubator/tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=579208&r1=579207&r2=579208&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Tue Sep 25 04:48:38 2007
@@ -44,3 +44,5 @@
20. TIKA-30 - Added utility constructors to TikaConfig (K. Bennett & jukka)
21. TIKA-28 - Rename config.xml to tika-config.xml or similar (mattmann)
+
+22. TIKA-26 - Use Map<String, Content> instead of List<Content> (jukka)
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/config/Content.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/config/Content.java?rev=579208&r1=579207&r2=579208&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/config/Content.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/config/Content.java Tue Sep 25 04:48:38 2007
@@ -24,8 +24,6 @@
*/
public class Content {
- private final String name;
-
private final String textSelect;
private final String xPathSelect;
@@ -37,14 +35,9 @@
private String[] values;
public Content(Element element) {
- name = element.getAttributeValue("name");
xPathSelect = element.getAttributeValue("xpathSelect");
textSelect = element.getAttributeValue("textSelect");
regexSelect = element.getChildTextTrim("regexSelect");
- }
-
- public String getName() {
- return name;
}
public String getRegexSelect() {
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/config/ParserConfig.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/config/ParserConfig.java?rev=579208&r1=579207&r2=579208&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/config/ParserConfig.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/config/ParserConfig.java Tue Sep 25 04:48:38 2007
@@ -16,9 +16,9 @@
*/
package org.apache.tika.config;
-import java.util.ArrayList;
import java.util.Collections;
-import java.util.List;
+import java.util.HashMap;
+import java.util.Map;
import org.jdom.Element;
@@ -33,8 +33,8 @@
private final String nameSpace;
- private final List<Content> contents = new ArrayList<Content>();
-;
+ private final Map<String, Content> contents =
+ new HashMap<String, Content>();
public ParserConfig(Element element) {
name = element.getAttributeValue("name");
@@ -43,7 +43,8 @@
Element extract = element.getChild("extract");
if (extract != null) {
for (Object child : extract.getChildren()) {
- contents.add(new Content((Element) child));
+ String name = ((Element) child).getAttributeValue("name");
+ contents.put(name, new Content((Element) child));
}
}
}
@@ -60,8 +61,8 @@
return parserClass;
}
- public List<Content> getContents() {
- return Collections.unmodifiableList(contents);
+ public Map<String, Content> getContents() {
+ return Collections.unmodifiableMap(contents);
}
}
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java?rev=579208&r1=579207&r2=579208&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java Tue Sep 25 04:48:38 2007
@@ -17,7 +17,7 @@
package org.apache.tika.parser;
import java.io.InputStream;
-import java.util.List;
+import java.util.Map;
import org.apache.tika.config.Content;
@@ -32,7 +32,7 @@
private String namespace;
- private List<Content> contents;
+ private Map<String, Content> contents;
protected String contentStr;
@@ -80,12 +80,7 @@
* fulltext
*/
public Content getContent(String name) {
- for (Content content : getContents()) {
- if (name.equals(content.getName())) {
- return content;
- }
- }
- return null;
+ return getContents().get(name);
}
/**
@@ -93,11 +88,11 @@
* TikaConfig Xml file. It could be a document metadata, XPath selection,
* regex selection or fulltext
*/
- public List<Content> getContents() {
+ public Map<String, Content> getContents() {
return contents;
}
- public void setContents(List<Content> contents) {
+ public void setContents(Map<String, Content> contents) {
this.contents = contents;
}
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=579208&r1=579207&r2=579208&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java Tue Sep 25 04:48:38 2007
@@ -17,9 +17,9 @@
package org.apache.tika.parser.html;
import java.io.InputStream;
-import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
+import java.util.Map;
import org.apache.log4j.Logger;
import org.apache.oro.text.regex.MalformedPatternException;
@@ -42,19 +42,15 @@
private Node root = null;
- public List<Content> getContents() {
+ public Map<String, Content> getContents() {
if (contentStr == null) {
if (root == null)
root = getRoot(getInputStream());
contentStr = getTextContent(root);
}
- List<Content> ctt = super.getContents();
+ Map<String, Content> ctt = super.getContents();
- if (ctt == null) {
- return new ArrayList<Content>(0);
- }
-
- Iterator i = ctt.iterator();
+ Iterator i = ctt.values().iterator();
while (i.hasNext()) {
Content ct = (Content) i.next();
if (ct.getTextSelect() != null) {
@@ -94,7 +90,7 @@
private void extractElementTxt(Element root, Content content) {
- NodeList children = root.getElementsByTagName(content.getName());
+ NodeList children = root.getElementsByTagName(content.getTextSelect());
if (children != null) {
if (children.getLength() > 0) {
if (children.getLength() == 1) {
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java?rev=579208&r1=579207&r2=579208&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java Tue Sep 25 04:48:38 2007
@@ -18,6 +18,7 @@
import java.util.Iterator;
import java.util.List;
+import java.util.Map;
import org.apache.tika.config.Content;
import org.apache.tika.parser.Parser;
@@ -37,7 +38,7 @@
static Logger logger = Logger.getRootLogger();
- public List<Content> getContents() {
+ public Map<String, Content> getContents() {
if (contentStr == null) {
// extrator.setContents(getParserConfig().getContents());
try {
@@ -47,8 +48,8 @@
e.printStackTrace();
}
}
- List<Content> ctt = super.getContents();
- Iterator i = ctt.iterator();
+ Map<String, Content> ctt = super.getContents();
+ Iterator i = ctt.values().iterator();
while (i.hasNext()) {
Content ct = (Content) i.next();
if (ct.getTextSelect() != null) {
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java?rev=579208&r1=579207&r2=579208&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java Tue Sep 25 04:48:38 2007
@@ -39,7 +39,7 @@
static Logger logger = Logger.getRootLogger();
- public List<Content> getContents() {
+ public Map<String, Content> getContents() {
if (contentStr == null) {
extrator.setContents(super.getContents());
try {
@@ -49,8 +49,8 @@
e.printStackTrace();
}
}
- List<Content> ctt = super.getContents();
- Iterator i = ctt.iterator();
+ Map<String, Content> ctt = super.getContents();
+ Iterator i = ctt.values().iterator();
while (i.hasNext()) {
Content ct = (Content) i.next();
if (ct.getTextSelect() != null) {
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java?rev=579208&r1=579207&r2=579208&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java Tue Sep 25 04:48:38 2007
@@ -16,7 +16,6 @@
*/
package org.apache.tika.parser.msword;
-import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
@@ -40,7 +39,7 @@
static Logger logger = Logger.getRootLogger();
- public List<Content> getContents() {
+ public Map<String, Content> getContents() {
if (contentStr == null) {
// extractor
try {
@@ -50,8 +49,8 @@
e.printStackTrace();
}
}
- List<Content> ctt = super.getContents();
- Iterator i = ctt.iterator();
+ Map<String, Content> ctt = super.getContents();
+ Iterator i = ctt.values().iterator();
while (i.hasNext()) {
Content ct = (Content) i.next();
if (ct.getTextSelect() != null) {
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java?rev=579208&r1=579207&r2=579208&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java Tue Sep 25 04:48:38 2007
@@ -25,6 +25,7 @@
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
+import java.util.Map;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
@@ -82,15 +83,15 @@
return xmlDoc;
}
- public List<Content> getContents() {
+ public Map<String, Content> getContents() {
if (xmlDoc == null)
xmlDoc = parse(getInputStream());
if (contentStr == null) {
contentStr = xp.concatOccurance(xmlDoc, "//*", " ");
}
List<String> documentNs = xp.getAllDocumentNs(xmlDoc);
- List<Content> ctt = super.getContents();
- Iterator it = ctt.iterator();
+ Map<String, Content> ctt = super.getContents();
+ Iterator it = ctt.values().iterator();
while (it.hasNext()) {
Content content = (Content) it.next();
if (content.getXPathSelect() != null) {
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=579208&r1=579207&r2=579208&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java Tue Sep 25 04:48:38 2007
@@ -20,6 +20,7 @@
import java.io.StringWriter;
import java.util.Iterator;
import java.util.List;
+import java.util.Map;
import org.apache.tika.config.Content;
import org.apache.tika.parser.Parser;
@@ -43,7 +44,7 @@
private PDDocument pdfDocument = null;
- public List<Content> getContents() {
+ public Map<String, Content> getContents() {
// String contents = getContent();
if (contentStr == null) {
try {
@@ -72,8 +73,8 @@
}
}
}
- List<Content> ctt = super.getContents();
- Iterator i = ctt.iterator();
+ Map<String, Content> ctt = super.getContents();
+ Iterator i = ctt.values().iterator();
while (i.hasNext()) {
Content ct = (Content) i.next();
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java?rev=579208&r1=579207&r2=579208&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java Tue Sep 25 04:48:38 2007
@@ -19,6 +19,7 @@
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
+import java.util.Map;
import javax.swing.text.BadLocationException;
import javax.swing.text.DefaultStyledDocument;
@@ -40,7 +41,7 @@
static Logger logger = Logger.getRootLogger();
- public List<Content> getContents() {
+ public Map<String, Content> getContents() {
if (contentStr == null) {
try {
DefaultStyledDocument sd = new DefaultStyledDocument();
@@ -53,8 +54,8 @@
logger.error(j.getMessage());
}
}
- List<Content> ctt = super.getContents();
- Iterator i = ctt.iterator();
+ Map<String, Content> ctt = super.getContents();
+ Iterator i = ctt.values().iterator();
while (i.hasNext()) {
Content ct = (Content) i.next();
if (ct.getTextSelect() != null) {
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java?rev=579208&r1=579207&r2=579208&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java Tue Sep 25 04:48:38 2007
@@ -20,7 +20,6 @@
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
-import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
@@ -41,7 +40,7 @@
static Logger logger = Logger.getRootLogger();
- public List<Content> getContents() {
+ public Map<String, Content> getContents() {
if (contentStr == null) {
StringBuffer sb = new StringBuffer();
try {
@@ -65,8 +64,8 @@
}
contentStr = sb.toString();
}
- List<Content> ctt = super.getContents();
- Iterator i = ctt.iterator();
+ Map<String, Content> ctt = super.getContents();
+ Iterator i = ctt.values().iterator();
while (i.hasNext()) {
Content ct = (Content) i.next();
if (ct.getTextSelect() != null) {
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java?rev=579208&r1=579207&r2=579208&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java Tue Sep 25 04:48:38 2007
@@ -19,6 +19,7 @@
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
+import java.util.Map;
import org.apache.tika.config.Content;
import org.apache.tika.parser.Parser;
@@ -52,7 +53,7 @@
private SimpleNamespaceContext nsc = new SimpleNamespaceContext();
- public List<Content> getContents() {
+ public Map<String, Content> getContents() {
if (contentStr == null) {
if (xmlDoc == null)
xmlDoc = Utils.parse(getInputStream());
@@ -61,8 +62,8 @@
if (xmlDoc == null)
xmlDoc = Utils.parse(getInputStream());
List<String> documentNs = getAllDocumentNs(xmlDoc);
- List<Content> ctt = super.getContents();
- Iterator it = ctt.iterator();
+ Map<String, Content> ctt = super.getContents();
+ Iterator it = ctt.values().iterator();
if (exist(documentNs, getNamespace())) {
while (it.hasNext()) {
Content content = (Content) it.next();
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/utils/MSExtractor.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/utils/MSExtractor.java?rev=579208&r1=579207&r2=579208&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/utils/MSExtractor.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/utils/MSExtractor.java Tue Sep 25 04:48:38 2007
@@ -18,9 +18,7 @@
// JDK imports
import java.io.InputStream;
-import java.util.Date;
-import java.util.List;
-import java.util.Properties;
+import java.util.Map;
import org.apache.tika.config.Content;
// Jakarta POI imports
@@ -44,13 +42,13 @@
private POIFSReader reader = null;
- private List<Content> contents;
+ private Map<String, Content> contents;
/** Constructs a new Microsoft document extractor. */
public MSExtractor() {
}
- public void setContents(List<Content> contents){
+ public void setContents(Map<String, Content> contents){
this.contents = contents;
}
@@ -61,7 +59,7 @@
// First, extract properties
this.reader = new POIFSReader();
- this.reader.registerListener(new PropertiesReaderListener(contents),
+ this.reader.registerListener(new PropertiesReaderListener(),
SummaryInformation.DEFAULT_STREAM_NAME);
//input.reset();
if (input.available() > 0) {
@@ -86,11 +84,6 @@
}
private class PropertiesReaderListener implements POIFSReaderListener {
- private List<Content> contents;
-
- PropertiesReaderListener(List<Content> contents) {
- this.contents = contents;
- }
public void processPOIFSReaderEvent(POIFSReaderEvent event) {
if (!event.getName().startsWith(
@@ -101,9 +94,7 @@
try {
SummaryInformation si = (SummaryInformation) PropertySetFactory
.create(event.getStream());
-
- for (int i = 0; i < contents.size(); i++) {
- Content content = contents.get(i);
+ for (Content content : contents.values()) {
if (content.getTextSelect().equalsIgnoreCase("title")) {
content.setValue(si.getTitle());
}
@@ -137,25 +128,6 @@
else if (content.getTextSelect().equalsIgnoreCase("")) {
//content.setValue(si.getCharCount());
}
- else if (content.getTextSelect().equals("")) {
-
- }
- else if (content.getTextSelect().equals("")) {
-
- }
- else if (content.getTextSelect().equals("")) {
-
- }
- else if (content.getTextSelect().equals("")) {
-
- }
- else if (content.getTextSelect().equals("")) {
-
- }
- else if (content.getTextSelect().equals("")) {
-
- }
- System.out.println(content.getName()+" :"+content.getValue());
}
} catch (Exception ex) {
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java?rev=579208&r1=579207&r2=579208&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java Tue Sep 25 04:48:38 2007
@@ -32,6 +32,7 @@
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
+import java.util.Map;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
@@ -53,26 +54,25 @@
static Logger logger = Logger.getRootLogger();
- public static String toString(Collection<Content> structuredContent) {
+ public static String toString(Map<String, Content> structuredContent) {
final StringWriter sw = new StringWriter();
print(structuredContent,sw);
return sw.toString();
}
- public static void print(Collection<Content> structuredContent) {
+ public static void print(Map<String, Content> structuredContent) {
print(structuredContent,new OutputStreamWriter(System.out));
}
- public static void print(Collection<Content> structuredContent,Writer outputWriter) {
+ public static void print(Map<String, Content> structuredContent,Writer outputWriter) {
final PrintWriter output = new PrintWriter(outputWriter,true);
- for (Iterator<Content> iter = structuredContent.iterator(); iter
- .hasNext();) {
- Content ct = iter.next();
+ for (Map.Entry<String, Content> entry : structuredContent.entrySet()) {
+ Content ct = entry.getValue();
if (ct.getValue() != null) {
- output.print(ct.getName() + ": ");
+ output.print(entry.getKey() + ": ");
output.println(ct.getValue());
} else if (ct.getValues() != null) {
- output.print(ct.getName() + ": ");
+ output.print(entry.getKey() + ": ");
for (int j = 0; j < ct.getValues().length; j++) {
if (j == 0)
output.println(ct.getValues()[j]);
@@ -82,7 +82,7 @@
}
} else { // there are no values, but there is a Content object
System.out.println(
- "Content '" + ct.getName() + "' has no values.");
+ "Content '" + entry.getKey() + "' has no values.");
}
}
}