You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2015/05/29 16:36:24 UTC
svn commit: r1682489 [3/14] - in /tika/trunk:
tika-parsers/src/main/java/org/apache/tika/parser/html/
tika-parsers/src/main/java/org/apache/tika/parser/image/
tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/
tika-parsers/src/main/java/org/a...
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java Fri May 29 14:36:21 2015
@@ -45,198 +45,198 @@ import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
public class HSLFExtractor extends AbstractPOIFSExtractor {
- public HSLFExtractor(ParseContext context) {
- super(context);
- }
-
- protected void parse(
- NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml)
- throws IOException, SAXException, TikaException {
- parse(filesystem.getRoot(), xhtml);
- }
-
- protected void parse(
- DirectoryNode root, XHTMLContentHandler xhtml)
- throws IOException, SAXException, TikaException {
- HSLFSlideShow ss = new HSLFSlideShow(root);
- SlideShow _show = new SlideShow(ss);
- Slide[] _slides = _show.getSlides();
+ public HSLFExtractor(ParseContext context) {
+ super(context);
+ }
- xhtml.startElement("div", "class", "slideShow");
+ protected void parse(
+ NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml)
+ throws IOException, SAXException, TikaException {
+ parse(filesystem.getRoot(), xhtml);
+ }
- /* Iterate over slides and extract text */
- for( Slide slide : _slides ) {
- xhtml.startElement("div", "class", "slide");
+ protected void parse(
+ DirectoryNode root, XHTMLContentHandler xhtml)
+ throws IOException, SAXException, TikaException {
+ HSLFSlideShow ss = new HSLFSlideShow(root);
+ SlideShow _show = new SlideShow(ss);
+ Slide[] _slides = _show.getSlides();
- // Slide header, if present
- HeadersFooters hf = slide.getHeadersFooters();
- if (hf != null && hf.isHeaderVisible() && hf.getHeaderText() != null) {
- xhtml.startElement("p", "class", "slide-header");
+ xhtml.startElement("div", "class", "slideShow");
- xhtml.characters( hf.getHeaderText() );
+ /* Iterate over slides and extract text */
+ for (Slide slide : _slides) {
+ xhtml.startElement("div", "class", "slide");
+
+ // Slide header, if present
+ HeadersFooters hf = slide.getHeadersFooters();
+ if (hf != null && hf.isHeaderVisible() && hf.getHeaderText() != null) {
+ xhtml.startElement("p", "class", "slide-header");
- xhtml.endElement("p");
- }
+ xhtml.characters(hf.getHeaderText());
- // Slide master, if present
- extractMaster(xhtml, slide.getMasterSheet());
+ xhtml.endElement("p");
+ }
- // Slide text
- {
- xhtml.startElement("p", "class", "slide-content");
+ // Slide master, if present
+ extractMaster(xhtml, slide.getMasterSheet());
- textRunsToText(xhtml, slide.getTextRuns());
+ // Slide text
+ {
+ xhtml.startElement("p", "class", "slide-content");
- xhtml.endElement("p");
- }
+ textRunsToText(xhtml, slide.getTextRuns());
- // Table text
- for (Shape shape: slide.getShapes()){
- if (shape instanceof Table){
- extractTableText(xhtml, (Table)shape);
+ xhtml.endElement("p");
}
- }
- // Slide footer, if present
- if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null) {
- xhtml.startElement("p", "class", "slide-footer");
+ // Table text
+ for (Shape shape : slide.getShapes()) {
+ if (shape instanceof Table) {
+ extractTableText(xhtml, (Table) shape);
+ }
+ }
- xhtml.characters( hf.getFooterText() );
+ // Slide footer, if present
+ if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null) {
+ xhtml.startElement("p", "class", "slide-footer");
- xhtml.endElement("p");
- }
+ xhtml.characters(hf.getFooterText());
- // Comments, if present
- for( Comment comment : slide.getComments() ) {
- xhtml.startElement("p", "class", "slide-comment");
- if (comment.getAuthor() != null) {
- xhtml.startElement("b");
- xhtml.characters( comment.getAuthor() );
- xhtml.endElement("b");
-
- if (comment.getText() != null) {
- xhtml.characters( " - ");
- }
+ xhtml.endElement("p");
}
- if (comment.getText() != null) {
- xhtml.characters( comment.getText() );
+
+ // Comments, if present
+ for (Comment comment : slide.getComments()) {
+ xhtml.startElement("p", "class", "slide-comment");
+ if (comment.getAuthor() != null) {
+ xhtml.startElement("b");
+ xhtml.characters(comment.getAuthor());
+ xhtml.endElement("b");
+
+ if (comment.getText() != null) {
+ xhtml.characters(" - ");
+ }
+ }
+ if (comment.getText() != null) {
+ xhtml.characters(comment.getText());
+ }
+ xhtml.endElement("p");
}
- xhtml.endElement("p");
- }
- // Now any embedded resources
- handleSlideEmbeddedResources(slide, xhtml);
+ // Now any embedded resources
+ handleSlideEmbeddedResources(slide, xhtml);
- // TODO Find the Notes for this slide and extract inline
+ // TODO Find the Notes for this slide and extract inline
- // Slide complete
- xhtml.endElement("div");
- }
+ // Slide complete
+ xhtml.endElement("div");
+ }
- // All slides done
- xhtml.endElement("div");
+ // All slides done
+ xhtml.endElement("div");
/* notes */
- xhtml.startElement("div", "class", "slideNotes");
- HashSet<Integer> seenNotes = new HashSet<Integer>();
- HeadersFooters hf = _show.getNotesHeadersFooters();
-
- for (Slide slide : _slides) {
- Notes notes = slide.getNotesSheet();
- if (notes == null) {
- continue;
- }
- Integer id = notes._getSheetNumber();
- if (seenNotes.contains(id)) {
- continue;
- }
- seenNotes.add(id);
-
- // Repeat the Notes header, if set
- if (hf != null && hf.isHeaderVisible() && hf.getHeaderText() != null) {
- xhtml.startElement("p", "class", "slide-note-header");
- xhtml.characters( hf.getHeaderText() );
- xhtml.endElement("p");
- }
-
- // Notes text
- textRunsToText(xhtml, notes.getTextRuns());
-
- // Repeat the notes footer, if set
- if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null) {
- xhtml.startElement("p", "class", "slide-note-footer");
- xhtml.characters( hf.getFooterText() );
- xhtml.endElement("p");
- }
- }
-
- handleSlideEmbeddedPictures(_show, xhtml);
-
- xhtml.endElement("div");
- }
-
- private void extractMaster(XHTMLContentHandler xhtml, MasterSheet master) throws SAXException {
- if (master == null){
- return;
- }
- Shape[] shapes = master.getShapes();
- if (shapes == null || shapes.length == 0){
- return;
- }
-
- xhtml.startElement("div", "class", "slide-master-content");
- for (Shape shape : shapes){
- if (shape != null && ! MasterSheet.isPlaceholder(shape)){
- if (shape instanceof TextShape){
- TextShape tsh = (TextShape)shape;
- String text = tsh.getText();
- if (text != null){
- xhtml.element("p", text);
- }
- }
- }
- }
- xhtml.endElement("div");
- }
-
- private void extractTableText(XHTMLContentHandler xhtml, Table shape) throws SAXException {
- xhtml.startElement("table");
- for (int row = 0; row < shape.getNumberOfRows(); row++){
- xhtml.startElement("tr");
- for (int col = 0; col < shape.getNumberOfColumns(); col++){
- TableCell cell = shape.getCell(row, col);
- //insert empty string for empty cell if cell is null
- String txt = "";
- if (cell != null){
- txt = cell.getText();
- }
- xhtml.element("td", txt);
- }
- xhtml.endElement("tr");
- }
- xhtml.endElement("table");
- }
-
- private void textRunsToText(XHTMLContentHandler xhtml, TextRun[] runs) throws SAXException {
- if (runs==null) {
- return;
- }
-
- for (TextRun run : runs) {
- if (run != null) {
- // Leaving in wisdom from TIKA-712 for easy revert.
- // Avoid boiler-plate text on the master slide (0
- // = TextHeaderAtom.TITLE_TYPE, 1 = TextHeaderAtom.BODY_TYPE):
- //if (!isMaster || (run.getRunType() != 0 && run.getRunType() != 1)) {
- String txt = run.getText();
- if (txt != null){
- xhtml.characters(txt);
- xhtml.startElement("br");
- xhtml.endElement("br");
- }
- }
- }
- }
+ xhtml.startElement("div", "class", "slideNotes");
+ HashSet<Integer> seenNotes = new HashSet<Integer>();
+ HeadersFooters hf = _show.getNotesHeadersFooters();
+
+ for (Slide slide : _slides) {
+ Notes notes = slide.getNotesSheet();
+ if (notes == null) {
+ continue;
+ }
+ Integer id = notes._getSheetNumber();
+ if (seenNotes.contains(id)) {
+ continue;
+ }
+ seenNotes.add(id);
+
+ // Repeat the Notes header, if set
+ if (hf != null && hf.isHeaderVisible() && hf.getHeaderText() != null) {
+ xhtml.startElement("p", "class", "slide-note-header");
+ xhtml.characters(hf.getHeaderText());
+ xhtml.endElement("p");
+ }
+
+ // Notes text
+ textRunsToText(xhtml, notes.getTextRuns());
+
+ // Repeat the notes footer, if set
+ if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null) {
+ xhtml.startElement("p", "class", "slide-note-footer");
+ xhtml.characters(hf.getFooterText());
+ xhtml.endElement("p");
+ }
+ }
+
+ handleSlideEmbeddedPictures(_show, xhtml);
+
+ xhtml.endElement("div");
+ }
+
+ private void extractMaster(XHTMLContentHandler xhtml, MasterSheet master) throws SAXException {
+ if (master == null) {
+ return;
+ }
+ Shape[] shapes = master.getShapes();
+ if (shapes == null || shapes.length == 0) {
+ return;
+ }
+
+ xhtml.startElement("div", "class", "slide-master-content");
+ for (Shape shape : shapes) {
+ if (shape != null && !MasterSheet.isPlaceholder(shape)) {
+ if (shape instanceof TextShape) {
+ TextShape tsh = (TextShape) shape;
+ String text = tsh.getText();
+ if (text != null) {
+ xhtml.element("p", text);
+ }
+ }
+ }
+ }
+ xhtml.endElement("div");
+ }
+
+ private void extractTableText(XHTMLContentHandler xhtml, Table shape) throws SAXException {
+ xhtml.startElement("table");
+ for (int row = 0; row < shape.getNumberOfRows(); row++) {
+ xhtml.startElement("tr");
+ for (int col = 0; col < shape.getNumberOfColumns(); col++) {
+ TableCell cell = shape.getCell(row, col);
+ //insert empty string for empty cell if cell is null
+ String txt = "";
+ if (cell != null) {
+ txt = cell.getText();
+ }
+ xhtml.element("td", txt);
+ }
+ xhtml.endElement("tr");
+ }
+ xhtml.endElement("table");
+ }
+
+ private void textRunsToText(XHTMLContentHandler xhtml, TextRun[] runs) throws SAXException {
+ if (runs == null) {
+ return;
+ }
+
+ for (TextRun run : runs) {
+ if (run != null) {
+ // Leaving in wisdom from TIKA-712 for easy revert.
+ // Avoid boiler-plate text on the master slide (0
+ // = TextHeaderAtom.TITLE_TYPE, 1 = TextHeaderAtom.BODY_TYPE):
+ //if (!isMaster || (run.getRunType() != 0 && run.getRunType() != 1)) {
+ String txt = run.getText();
+ if (txt != null) {
+ xhtml.characters(txt);
+ xhtml.startElement("br");
+ xhtml.endElement("br");
+ }
+ }
+ }
+ }
private void handleSlideEmbeddedPictures(SlideShow slideshow, XHTMLContentHandler xhtml)
throws TikaException, SAXException, IOException {
@@ -262,60 +262,60 @@ public class HSLFExtractor extends Abstr
}
handleEmbeddedResource(
- TikaInputStream.get(pic.getData()), null, null,
- mediaType, xhtml, false);
+ TikaInputStream.get(pic.getData()), null, null,
+ mediaType, xhtml, false);
}
}
private void handleSlideEmbeddedResources(Slide slide, XHTMLContentHandler xhtml)
- throws TikaException, SAXException, IOException {
- Shape[] shapes;
- try {
- shapes = slide.getShapes();
- } catch(NullPointerException e) {
- // Sometimes HSLF hits problems
- // Please open POI bugs for any you come across!
- return;
- }
-
- for( Shape shape : shapes ) {
- if( shape instanceof OLEShape ) {
- OLEShape oleShape = (OLEShape)shape;
- ObjectData data = null;
- try {
- data = oleShape.getObjectData();
- } catch( NullPointerException e ) {
+ throws TikaException, SAXException, IOException {
+ Shape[] shapes;
+ try {
+ shapes = slide.getShapes();
+ } catch (NullPointerException e) {
+ // Sometimes HSLF hits problems
+ // Please open POI bugs for any you come across!
+ return;
+ }
+
+ for (Shape shape : shapes) {
+ if (shape instanceof OLEShape) {
+ OLEShape oleShape = (OLEShape) shape;
+ ObjectData data = null;
+ try {
+ data = oleShape.getObjectData();
+ } catch (NullPointerException e) {
/* getObjectData throws NPE some times. */
+ }
+
+ if (data != null) {
+ String objID = Integer.toString(oleShape.getObjectID());
+
+ // Embedded Object: add a <div
+ // class="embedded" id="X"/> so consumer can see where
+ // in the main text each embedded document
+ // occurred:
+ AttributesImpl attributes = new AttributesImpl();
+ attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+ attributes.addAttribute("", "id", "id", "CDATA", objID);
+ xhtml.startElement("div", attributes);
+ xhtml.endElement("div");
+
+ TikaInputStream stream =
+ TikaInputStream.get(data.getData());
+ try {
+ String mediaType = null;
+ if ("Excel.Chart.8".equals(oleShape.getProgID())) {
+ mediaType = "application/vnd.ms-excel";
+ }
+ handleEmbeddedResource(
+ stream, objID, objID,
+ mediaType, xhtml, false);
+ } finally {
+ stream.close();
+ }
+ }
}
-
- if (data != null) {
- String objID = Integer.toString(oleShape.getObjectID());
-
- // Embedded Object: add a <div
- // class="embedded" id="X"/> so consumer can see where
- // in the main text each embedded document
- // occurred:
- AttributesImpl attributes = new AttributesImpl();
- attributes.addAttribute("", "class", "class", "CDATA", "embedded");
- attributes.addAttribute("", "id", "id", "CDATA", objID);
- xhtml.startElement("div", attributes);
- xhtml.endElement("div");
-
- TikaInputStream stream =
- TikaInputStream.get(data.getData());
- try {
- String mediaType = null;
- if ("Excel.Chart.8".equals(oleShape.getProgID())) {
- mediaType = "application/vnd.ms-excel";
- }
- handleEmbeddedResource(
- stream, objID, objID,
- mediaType, xhtml, false);
- } finally {
- stream.close();
- }
- }
- }
- }
- }
+ }
+ }
}
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ListManager.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ListManager.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ListManager.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ListManager.java Fri May 29 14:36:21 2015
@@ -71,7 +71,7 @@ public class ListManager extends Abstrac
ListData listData = listTables.getListData(paragraph.getList().getLsid());
LevelTuple[] levelTuples = new LevelTuple[listData.getLevels().length];
for (int i = 0; i < listData.getLevels().length; i++) {
- levelTuples[i] = buildTuple(i,listData.getLevels()[i]);
+ levelTuples[i] = buildTuple(i, listData.getLevels()[i]);
}
lc = new ParagraphLevelCounter(levelTuples);
}
@@ -89,7 +89,7 @@ public class ListManager extends Abstrac
boolean isLegal = false;
int start = 1;
int restart = -1;
- String lvlText = "%"+i+".";
+ String lvlText = "%" + i + ".";
String numFmt = "decimal";
start = listLevel.getStartAt();
@@ -127,18 +127,18 @@ public class ListManager extends Abstrac
StringBuilder sb = new StringBuilder();
int last = 0;
- for (int i = 0; i < numberOffsets.length;i++) {
- int offset = (int)numberOffsets[i];
+ for (int i = 0; i < numberOffsets.length; i++) {
+ int offset = (int) numberOffsets[i];
- if (offset == 0){
+ if (offset == 0) {
break;
}
- sb.append(numberText.substring(last, offset-1));
+ sb.append(numberText.substring(last, offset - 1));
//need to add one because newer format
//adds one. In .doc, this was the array index;
//but in .docx, this is the level number
- int lvlNum = (int)numberText.charAt(offset-1)+1;
- sb.append("%"+lvlNum);
+ int lvlNum = (int) numberText.charAt(offset - 1) + 1;
+ sb.append("%" + lvlNum);
last = offset;
}
if (last < numberText.length()) {
@@ -149,29 +149,29 @@ public class ListManager extends Abstrac
private String convertToNewNumFormat(int numberFormat) {
switch (numberFormat) {
- case -1 :
+ case -1:
return "none";
- case 0 :
+ case 0:
return "decimal";
- case 1 :
+ case 1:
return "upperRoman";
- case 2 :
+ case 2:
return "lowerRoman";
- case 3 :
+ case 3:
return "upperLetter";
- case 4 :
+ case 4:
return "lowerLetter";
- case 5 :
+ case 5:
return "ordinal";
- case 22 :
+ case 22:
return "decimalZero";
- case 23 :
+ case 23:
return "bullet";
- case 47 :
+ case 47:
return "none";
- default :
+ default:
//do we really want to silently swallow these uncovered cases?
- throw new RuntimeException("NOT COVERED: "+numberFormat);
+ throw new RuntimeException("NOT COVERED: " + numberFormat);
}
}
}
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java Fri May 29 14:36:21 2015
@@ -55,7 +55,9 @@ import org.xml.sax.SAXException;
*/
public class OfficeParser extends AbstractParser {
- /** Serial version UID */
+ /**
+ * Serial version UID
+ */
private static final long serialVersionUID = 7393462244028653479L;
private static final Set<MediaType> SUPPORTED_TYPES =
@@ -75,64 +77,7 @@ public class OfficeParser extends Abstra
POIFSDocumentType.SOLIDWORKS_PART.type,
POIFSDocumentType.SOLIDWORKS_ASSEMBLY.type,
POIFSDocumentType.SOLIDWORKS_DRAWING.type
- )));
-
- public enum POIFSDocumentType {
- WORKBOOK("xls", MediaType.application("vnd.ms-excel")),
- OLE10_NATIVE("ole", POIFSContainerDetector.OLE10_NATIVE),
- COMP_OBJ("ole", POIFSContainerDetector.COMP_OBJ),
- WORDDOCUMENT("doc", MediaType.application("msword")),
- UNKNOWN("unknown", MediaType.application("x-tika-msoffice")),
- ENCRYPTED("ole", MediaType.application("x-tika-ooxml-protected")),
- POWERPOINT("ppt", MediaType.application("vnd.ms-powerpoint")),
- PUBLISHER("pub", MediaType.application("x-mspublisher")),
- PROJECT("mpp", MediaType.application("vnd.ms-project")),
- VISIO("vsd", MediaType.application("vnd.visio")),
- WORKS("wps", MediaType.application("vnd.ms-works")),
- XLR("xlr", MediaType.application("x-tika-msworks-spreadsheet")),
- OUTLOOK("msg", MediaType.application("vnd.ms-outlook")),
- SOLIDWORKS_PART("sldprt", MediaType.application("sldworks")),
- SOLIDWORKS_ASSEMBLY("sldasm", MediaType.application("sldworks")),
- SOLIDWORKS_DRAWING("slddrw", MediaType.application("sldworks"));
-
- private final String extension;
- private final MediaType type;
-
- POIFSDocumentType(String extension, MediaType type) {
- this.extension = extension;
- this.type = type;
- }
-
- public String getExtension() {
- return extension;
- }
-
- public MediaType getType() {
- return type;
- }
-
- public static POIFSDocumentType detectType(POIFSFileSystem fs) {
- return detectType(fs.getRoot());
- }
-
- public static POIFSDocumentType detectType(NPOIFSFileSystem fs) {
- return detectType(fs.getRoot());
- }
-
- public static POIFSDocumentType detectType(DirectoryEntry node) {
- Set<String> names = new HashSet<String>();
- for (Entry entry : node) {
- names.add(entry.getName());
- }
- MediaType type = POIFSContainerDetector.detect(names, node);
- for (POIFSDocumentType poifsType : values()) {
- if (type.equals(poifsType.type)) {
- return poifsType;
- }
- }
- return UNKNOWN;
- }
- }
+ )));
public Set<MediaType> getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
@@ -183,84 +128,84 @@ public class OfficeParser extends Abstra
// Parse remaining document entries
POIFSDocumentType type = POIFSDocumentType.detectType(root);
- if (type!=POIFSDocumentType.UNKNOWN) {
+ if (type != POIFSDocumentType.UNKNOWN) {
setType(metadata, type.getType());
}
switch (type) {
- case SOLIDWORKS_PART:
- case SOLIDWORKS_ASSEMBLY:
- case SOLIDWORKS_DRAWING:
- break;
- case PUBLISHER:
- PublisherTextExtractor publisherTextExtractor =
- new PublisherTextExtractor(root);
- xhtml.element("p", publisherTextExtractor.getText());
- break;
- case WORDDOCUMENT:
- new WordExtractor(context).parse(root, xhtml);
- break;
- case POWERPOINT:
- new HSLFExtractor(context).parse(root, xhtml);
- break;
- case WORKBOOK:
- case XLR:
- Locale locale = context.get(Locale.class, Locale.getDefault());
- new ExcelExtractor(context, metadata).parse(root, xhtml, locale);
- break;
- case PROJECT:
- // We currently can't do anything beyond the metadata
- break;
- case VISIO:
- VisioTextExtractor visioTextExtractor =
- new VisioTextExtractor(root);
- for (String text : visioTextExtractor.getAllText()) {
- xhtml.element("p", text);
- }
- break;
- case OUTLOOK:
- OutlookExtractor extractor =
- new OutlookExtractor(root, context);
-
- extractor.parse(xhtml, metadata);
- break;
- case ENCRYPTED:
- EncryptionInfo info = new EncryptionInfo(root);
- Decryptor d = Decryptor.getInstance(info);
-
- try {
- // By default, use the default Office Password
- String password = Decryptor.DEFAULT_PASSWORD;
-
- // If they supplied a Password Provider, ask that for the password,
- // and use the provider given one if available (stick with default if not)
- PasswordProvider passwordProvider = context.get(PasswordProvider.class);
- if (passwordProvider != null) {
- String suppliedPassword = passwordProvider.getPassword(metadata);
- if (suppliedPassword != null) {
- password = suppliedPassword;
- }
- }
-
- // Check if we've the right password or not
- if (!d.verifyPassword(password)) {
- throw new EncryptedDocumentException();
- }
-
- // Decrypt the OLE2 stream, and delegate the resulting OOXML
- // file to the regular OOXML parser for normal handling
- OOXMLParser parser = new OOXMLParser();
-
- parser.parse(d.getDataStream(root), new EmbeddedContentHandler(
- new BodyContentHandler(xhtml)),
- metadata, context);
- } catch (GeneralSecurityException ex) {
- throw new EncryptedDocumentException(ex);
- }
- default:
- // For unsupported / unhandled types, just the metadata
- // is extracted, which happened above
- break;
+ case SOLIDWORKS_PART:
+ case SOLIDWORKS_ASSEMBLY:
+ case SOLIDWORKS_DRAWING:
+ break;
+ case PUBLISHER:
+ PublisherTextExtractor publisherTextExtractor =
+ new PublisherTextExtractor(root);
+ xhtml.element("p", publisherTextExtractor.getText());
+ break;
+ case WORDDOCUMENT:
+ new WordExtractor(context).parse(root, xhtml);
+ break;
+ case POWERPOINT:
+ new HSLFExtractor(context).parse(root, xhtml);
+ break;
+ case WORKBOOK:
+ case XLR:
+ Locale locale = context.get(Locale.class, Locale.getDefault());
+ new ExcelExtractor(context, metadata).parse(root, xhtml, locale);
+ break;
+ case PROJECT:
+ // We currently can't do anything beyond the metadata
+ break;
+ case VISIO:
+ VisioTextExtractor visioTextExtractor =
+ new VisioTextExtractor(root);
+ for (String text : visioTextExtractor.getAllText()) {
+ xhtml.element("p", text);
+ }
+ break;
+ case OUTLOOK:
+ OutlookExtractor extractor =
+ new OutlookExtractor(root, context);
+
+ extractor.parse(xhtml, metadata);
+ break;
+ case ENCRYPTED:
+ EncryptionInfo info = new EncryptionInfo(root);
+ Decryptor d = Decryptor.getInstance(info);
+
+ try {
+ // By default, use the default Office Password
+ String password = Decryptor.DEFAULT_PASSWORD;
+
+ // If they supplied a Password Provider, ask that for the password,
+ // and use the provider given one if available (stick with default if not)
+ PasswordProvider passwordProvider = context.get(PasswordProvider.class);
+ if (passwordProvider != null) {
+ String suppliedPassword = passwordProvider.getPassword(metadata);
+ if (suppliedPassword != null) {
+ password = suppliedPassword;
+ }
+ }
+
+ // Check if we've the right password or not
+ if (!d.verifyPassword(password)) {
+ throw new EncryptedDocumentException();
+ }
+
+ // Decrypt the OLE2 stream, and delegate the resulting OOXML
+ // file to the regular OOXML parser for normal handling
+ OOXMLParser parser = new OOXMLParser();
+
+ parser.parse(d.getDataStream(root), new EmbeddedContentHandler(
+ new BodyContentHandler(xhtml)),
+ metadata, context);
+ } catch (GeneralSecurityException ex) {
+ throw new EncryptedDocumentException(ex);
+ }
+ default:
+ // For unsupported / unhandled types, just the metadata
+ // is extracted, which happened above
+ break;
}
}
@@ -268,4 +213,61 @@ public class OfficeParser extends Abstra
metadata.set(Metadata.CONTENT_TYPE, type.toString());
}
+ public enum POIFSDocumentType {
+ WORKBOOK("xls", MediaType.application("vnd.ms-excel")),
+ OLE10_NATIVE("ole", POIFSContainerDetector.OLE10_NATIVE),
+ COMP_OBJ("ole", POIFSContainerDetector.COMP_OBJ),
+ WORDDOCUMENT("doc", MediaType.application("msword")),
+ UNKNOWN("unknown", MediaType.application("x-tika-msoffice")),
+ ENCRYPTED("ole", MediaType.application("x-tika-ooxml-protected")),
+ POWERPOINT("ppt", MediaType.application("vnd.ms-powerpoint")),
+ PUBLISHER("pub", MediaType.application("x-mspublisher")),
+ PROJECT("mpp", MediaType.application("vnd.ms-project")),
+ VISIO("vsd", MediaType.application("vnd.visio")),
+ WORKS("wps", MediaType.application("vnd.ms-works")),
+ XLR("xlr", MediaType.application("x-tika-msworks-spreadsheet")),
+ OUTLOOK("msg", MediaType.application("vnd.ms-outlook")),
+ SOLIDWORKS_PART("sldprt", MediaType.application("sldworks")),
+ SOLIDWORKS_ASSEMBLY("sldasm", MediaType.application("sldworks")),
+ SOLIDWORKS_DRAWING("slddrw", MediaType.application("sldworks"));
+
+ private final String extension;
+ private final MediaType type;
+
+ POIFSDocumentType(String extension, MediaType type) {
+ this.extension = extension;
+ this.type = type;
+ }
+
+ public static POIFSDocumentType detectType(POIFSFileSystem fs) {
+ return detectType(fs.getRoot());
+ }
+
+ public static POIFSDocumentType detectType(NPOIFSFileSystem fs) {
+ return detectType(fs.getRoot());
+ }
+
+ public static POIFSDocumentType detectType(DirectoryEntry node) {
+ Set<String> names = new HashSet<String>();
+ for (Entry entry : node) {
+ names.add(entry.getName());
+ }
+ MediaType type = POIFSContainerDetector.detect(names, node);
+ for (POIFSDocumentType poifsType : values()) {
+ if (type.equals(poifsType.type)) {
+ return poifsType;
+ }
+ }
+ return UNKNOWN;
+ }
+
+ public String getExtension() {
+ return extension;
+ }
+
+ public MediaType getType() {
+ return type;
+ }
+ }
+
}
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java Fri May 29 14:36:21 2015
@@ -37,53 +37,28 @@ import org.xml.sax.SAXException;
/**
* A POI-powered Tika Parser for very old versions of Excel, from
- * pre-OLE2 days, such as Excel 4.
+ * pre-OLE2 days, such as Excel 4.
*/
public class OldExcelParser extends AbstractParser {
- private static final long serialVersionUID = 4611820730372823452L;
-
- private static final Set<MediaType> SUPPORTED_TYPES =
- Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
- MediaType.application("vnd.ms-excel.sheet.4"),
- MediaType.application("vnd.ms-excel.workspace.4"),
- MediaType.application("vnd.ms-excel.sheet.3"),
- MediaType.application("vnd.ms-excel.workspace.3"),
- MediaType.application("vnd.ms-excel.sheet.2")
- )));
+ private static final long serialVersionUID = 4611820730372823452L;
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return SUPPORTED_TYPES;
- }
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+ MediaType.application("vnd.ms-excel.sheet.4"),
+ MediaType.application("vnd.ms-excel.workspace.4"),
+ MediaType.application("vnd.ms-excel.sheet.3"),
+ MediaType.application("vnd.ms-excel.workspace.3"),
+ MediaType.application("vnd.ms-excel.sheet.2")
+ )));
- /**
- * Extracts properties and text from an MS Document input stream
- */
- public void parse(
- InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
- // Open the POI provided extractor
- OldExcelExtractor extractor = new OldExcelExtractor(stream);
-
- // We can't do anything about metadata, as these old formats
- // didn't have any stored with them
-
- // Set the content type
- // TODO Get the version and type, to set as the Content Type
-
- // Have the text extracted and given to our Content Handler
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
- parse(extractor, xhtml);
- }
-
- protected static void parse(OldExcelExtractor extractor,
- XHTMLContentHandler xhtml) throws TikaException, IOException, SAXException {
+ protected static void parse(OldExcelExtractor extractor,
+ XHTMLContentHandler xhtml) throws TikaException, IOException, SAXException {
// Get the whole text, as a single string
String text = extractor.getText();
-
+
// Split and output
xhtml.startDocument();
-
+
String line;
BufferedReader reader = new BufferedReader(new StringReader(text));
while ((line = reader.readLine()) != null) {
@@ -91,7 +66,32 @@ public class OldExcelParser extends Abst
xhtml.characters(line);
xhtml.endElement("p");
}
-
+
xhtml.endDocument();
}
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ /**
+ * Extracts properties and text from an MS Document input stream
+ */
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ // Open the POI provided extractor
+ OldExcelExtractor extractor = new OldExcelExtractor(stream);
+
+ // We can't do anything about metadata, as these old formats
+ // didn't have any stored with them
+
+ // Set the content type
+ // TODO Get the version and type, to set as the Content Type
+
+ // Have the text extracted and given to our Content Handler
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ parse(extractor, xhtml);
+ }
}
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java Fri May 29 14:36:21 2015
@@ -60,7 +60,7 @@ public class OutlookExtractor extends Ab
public OutlookExtractor(DirectoryNode root, ParseContext context) throws TikaException {
super(context);
-
+
try {
this.msg = new MAPIMessage(root);
} catch (IOException e) {
@@ -71,185 +71,187 @@ public class OutlookExtractor extends Ab
public void parse(XHTMLContentHandler xhtml, Metadata metadata)
throws TikaException, SAXException, IOException {
try {
- msg.setReturnNullOnMissingChunk(true);
-
- // If the message contains strings that aren't stored
- // as Unicode, try to sort out an encoding for them
- if(msg.has7BitEncodingStrings()) {
- if(msg.getHeaders() != null) {
- // There's normally something in the headers
- msg.guess7BitEncoding();
- } else {
- // Nothing in the header, try encoding detection
- // on the message body
- StringChunk text = msg.getMainChunks().textBodyChunk;
- if(text != null) {
- CharsetDetector detector = new CharsetDetector();
- detector.setText( text.getRawValue() );
- CharsetMatch match = detector.detect();
- if(match.getConfidence() > 35) {
- msg.set7BitEncoding( match.getName() );
+ msg.setReturnNullOnMissingChunk(true);
+
+ // If the message contains strings that aren't stored
+ // as Unicode, try to sort out an encoding for them
+ if (msg.has7BitEncodingStrings()) {
+ if (msg.getHeaders() != null) {
+ // There's normally something in the headers
+ msg.guess7BitEncoding();
+ } else {
+ // Nothing in the header, try encoding detection
+ // on the message body
+ StringChunk text = msg.getMainChunks().textBodyChunk;
+ if (text != null) {
+ CharsetDetector detector = new CharsetDetector();
+ detector.setText(text.getRawValue());
+ CharsetMatch match = detector.detect();
+ if (match.getConfidence() > 35) {
+ msg.set7BitEncoding(match.getName());
+ }
}
- }
- }
- }
-
- // Start with the metadata
- String subject = msg.getSubject();
- String from = msg.getDisplayFrom();
-
- metadata.set(TikaCoreProperties.CREATOR, from);
- metadata.set(Metadata.MESSAGE_FROM, from);
- metadata.set(Metadata.MESSAGE_TO, msg.getDisplayTo());
- metadata.set(Metadata.MESSAGE_CC, msg.getDisplayCC());
- metadata.set(Metadata.MESSAGE_BCC, msg.getDisplayBCC());
-
- metadata.set(TikaCoreProperties.TITLE, subject);
- // TODO: Move to description in Tika 2.0
- metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION,
- msg.getConversationTopic());
-
- try {
- for(String recipientAddress : msg.getRecipientEmailAddressList()) {
- if(recipientAddress != null)
- metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, recipientAddress);
- }
- } catch(ChunkNotFoundException he) {} // Will be fixed in POI 3.7 Final
-
- // Date - try two ways to find it
- // First try via the proper chunk
- if(msg.getMessageDate() != null) {
- metadata.set(TikaCoreProperties.CREATED, msg.getMessageDate().getTime());
- metadata.set(TikaCoreProperties.MODIFIED, msg.getMessageDate().getTime());
- } else {
- try {
- // Failing that try via the raw headers
- String[] headers = msg.getHeaders();
- if(headers != null && headers.length > 0) {
- for(String header: headers) {
- if(header.toLowerCase(Locale.ROOT).startsWith("date:")) {
- String date = header.substring(header.indexOf(':')+1).trim();
-
- // See if we can parse it as a normal mail date
- try {
- Date d = MboxParser.parseDate(date);
- metadata.set(TikaCoreProperties.CREATED, d);
- metadata.set(TikaCoreProperties.MODIFIED, d);
- } catch(ParseException e) {
- // Store it as-is, and hope for the best...
- metadata.set(TikaCoreProperties.CREATED, date);
- metadata.set(TikaCoreProperties.MODIFIED, date);
+ }
+ }
+
+ // Start with the metadata
+ String subject = msg.getSubject();
+ String from = msg.getDisplayFrom();
+
+ metadata.set(TikaCoreProperties.CREATOR, from);
+ metadata.set(Metadata.MESSAGE_FROM, from);
+ metadata.set(Metadata.MESSAGE_TO, msg.getDisplayTo());
+ metadata.set(Metadata.MESSAGE_CC, msg.getDisplayCC());
+ metadata.set(Metadata.MESSAGE_BCC, msg.getDisplayBCC());
+
+ metadata.set(TikaCoreProperties.TITLE, subject);
+ // TODO: Move to description in Tika 2.0
+ metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION,
+ msg.getConversationTopic());
+
+ try {
+ for (String recipientAddress : msg.getRecipientEmailAddressList()) {
+ if (recipientAddress != null)
+ metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, recipientAddress);
+ }
+ } catch (ChunkNotFoundException he) {
+ } // Will be fixed in POI 3.7 Final
+
+ // Date - try two ways to find it
+ // First try via the proper chunk
+ if (msg.getMessageDate() != null) {
+ metadata.set(TikaCoreProperties.CREATED, msg.getMessageDate().getTime());
+ metadata.set(TikaCoreProperties.MODIFIED, msg.getMessageDate().getTime());
+ } else {
+ try {
+ // Failing that try via the raw headers
+ String[] headers = msg.getHeaders();
+ if (headers != null && headers.length > 0) {
+ for (String header : headers) {
+ if (header.toLowerCase(Locale.ROOT).startsWith("date:")) {
+ String date = header.substring(header.indexOf(':') + 1).trim();
+
+ // See if we can parse it as a normal mail date
+ try {
+ Date d = MboxParser.parseDate(date);
+ metadata.set(TikaCoreProperties.CREATED, d);
+ metadata.set(TikaCoreProperties.MODIFIED, d);
+ } catch (ParseException e) {
+ // Store it as-is, and hope for the best...
+ metadata.set(TikaCoreProperties.CREATED, date);
+ metadata.set(TikaCoreProperties.MODIFIED, date);
+ }
+ break;
}
- break;
}
- }
- }
- } catch(ChunkNotFoundException he) {
- // We can't find the date, sorry...
- }
- }
-
-
- xhtml.element("h1", subject);
-
- // Output the from and to details in text, as you
- // often want them in text form for searching
- xhtml.startElement("dl");
- if (from!=null) {
- header(xhtml, "From", from);
- }
- header(xhtml, "To", msg.getDisplayTo());
- header(xhtml, "Cc", msg.getDisplayCC());
- header(xhtml, "Bcc", msg.getDisplayBCC());
- try {
- header(xhtml, "Recipients", msg.getRecipientEmailAddress());
- } catch(ChunkNotFoundException e) {}
- xhtml.endElement("dl");
-
- // Get the message body. Preference order is: html, rtf, text
- Chunk htmlChunk = null;
- Chunk rtfChunk = null;
- Chunk textChunk = null;
- for(Chunk chunk : msg.getMainChunks().getChunks()) {
- if(chunk.getChunkId() == MAPIProperty.BODY_HTML.id) {
- htmlChunk = chunk;
- }
- if(chunk.getChunkId() == MAPIProperty.RTF_COMPRESSED.id) {
- rtfChunk = chunk;
- }
- if(chunk.getChunkId() == MAPIProperty.BODY.id) {
- textChunk = chunk;
- }
- }
-
- boolean doneBody = false;
- xhtml.startElement("div", "class", "message-body");
- if(htmlChunk != null) {
- byte[] data = null;
- if(htmlChunk instanceof ByteChunk) {
- data = ((ByteChunk)htmlChunk).getValue();
- } else if(htmlChunk instanceof StringChunk) {
- data = ((StringChunk)htmlChunk).getRawValue();
- }
- if(data != null) {
- HtmlParser htmlParser = new HtmlParser();
- htmlParser.parse(
- new ByteArrayInputStream(data),
- new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
- new Metadata(), new ParseContext()
- );
- doneBody = true;
- }
- }
- if(rtfChunk != null && !doneBody) {
- ByteChunk chunk = (ByteChunk)rtfChunk;
- MAPIRtfAttribute rtf = new MAPIRtfAttribute(
- MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), chunk.getValue()
- );
- RTFParser rtfParser = new RTFParser();
- rtfParser.parse(
- new ByteArrayInputStream(rtf.getData()),
- new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
- new Metadata(), new ParseContext());
- doneBody = true;
- }
- if(textChunk != null && !doneBody) {
- xhtml.element("p", ((StringChunk)textChunk).getValue());
- }
- xhtml.endElement("div");
-
- // Process the attachments
- for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
- xhtml.startElement("div", "class", "attachment-entry");
-
- String filename = null;
- if (attachment.attachLongFileName != null) {
- filename = attachment.attachLongFileName.getValue();
- } else if (attachment.attachFileName != null) {
- filename = attachment.attachFileName.getValue();
- }
- if (filename != null && filename.length() > 0) {
- xhtml.element("h1", filename);
- }
-
- if(attachment.attachData != null) {
- handleEmbeddedResource(
- TikaInputStream.get(attachment.attachData.getValue()),
- filename, null,
- null, xhtml, true
- );
- }
- if(attachment.attachmentDirectory != null) {
- handleEmbeddedOfficeDoc(
- attachment.attachmentDirectory.getDirectory(),
- xhtml
- );
- }
-
- xhtml.endElement("div");
- }
- } catch(ChunkNotFoundException e) {
- throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk", e);
+ }
+ } catch (ChunkNotFoundException he) {
+ // We can't find the date, sorry...
+ }
+ }
+
+
+ xhtml.element("h1", subject);
+
+ // Output the from and to details in text, as you
+ // often want them in text form for searching
+ xhtml.startElement("dl");
+ if (from != null) {
+ header(xhtml, "From", from);
+ }
+ header(xhtml, "To", msg.getDisplayTo());
+ header(xhtml, "Cc", msg.getDisplayCC());
+ header(xhtml, "Bcc", msg.getDisplayBCC());
+ try {
+ header(xhtml, "Recipients", msg.getRecipientEmailAddress());
+ } catch (ChunkNotFoundException e) {
+ }
+ xhtml.endElement("dl");
+
+ // Get the message body. Preference order is: html, rtf, text
+ Chunk htmlChunk = null;
+ Chunk rtfChunk = null;
+ Chunk textChunk = null;
+ for (Chunk chunk : msg.getMainChunks().getChunks()) {
+ if (chunk.getChunkId() == MAPIProperty.BODY_HTML.id) {
+ htmlChunk = chunk;
+ }
+ if (chunk.getChunkId() == MAPIProperty.RTF_COMPRESSED.id) {
+ rtfChunk = chunk;
+ }
+ if (chunk.getChunkId() == MAPIProperty.BODY.id) {
+ textChunk = chunk;
+ }
+ }
+
+ boolean doneBody = false;
+ xhtml.startElement("div", "class", "message-body");
+ if (htmlChunk != null) {
+ byte[] data = null;
+ if (htmlChunk instanceof ByteChunk) {
+ data = ((ByteChunk) htmlChunk).getValue();
+ } else if (htmlChunk instanceof StringChunk) {
+ data = ((StringChunk) htmlChunk).getRawValue();
+ }
+ if (data != null) {
+ HtmlParser htmlParser = new HtmlParser();
+ htmlParser.parse(
+ new ByteArrayInputStream(data),
+ new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
+ new Metadata(), new ParseContext()
+ );
+ doneBody = true;
+ }
+ }
+ if (rtfChunk != null && !doneBody) {
+ ByteChunk chunk = (ByteChunk) rtfChunk;
+ MAPIRtfAttribute rtf = new MAPIRtfAttribute(
+ MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), chunk.getValue()
+ );
+ RTFParser rtfParser = new RTFParser();
+ rtfParser.parse(
+ new ByteArrayInputStream(rtf.getData()),
+ new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
+ new Metadata(), new ParseContext());
+ doneBody = true;
+ }
+ if (textChunk != null && !doneBody) {
+ xhtml.element("p", ((StringChunk) textChunk).getValue());
+ }
+ xhtml.endElement("div");
+
+ // Process the attachments
+ for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
+ xhtml.startElement("div", "class", "attachment-entry");
+
+ String filename = null;
+ if (attachment.attachLongFileName != null) {
+ filename = attachment.attachLongFileName.getValue();
+ } else if (attachment.attachFileName != null) {
+ filename = attachment.attachFileName.getValue();
+ }
+ if (filename != null && filename.length() > 0) {
+ xhtml.element("h1", filename);
+ }
+
+ if (attachment.attachData != null) {
+ handleEmbeddedResource(
+ TikaInputStream.get(attachment.attachData.getValue()),
+ filename, null,
+ null, xhtml, true
+ );
+ }
+ if (attachment.attachmentDirectory != null) {
+ handleEmbeddedOfficeDoc(
+ attachment.attachmentDirectory.getDirectory(),
+ xhtml
+ );
+ }
+
+ xhtml.endElement("div");
+ }
+ } catch (ChunkNotFoundException e) {
+ throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk", e);
}
}
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java Fri May 29 14:36:21 2015
@@ -40,162 +40,135 @@ import org.apache.tika.mime.MediaType;
/**
* A detector that works on a POIFS OLE2 document
- * to figure out exactly what the file is.
+ * to figure out exactly what the file is.
* This should work for all OLE2 documents, whether
- * they are ones supported by POI or not.
+ * they are ones supported by POI or not.
*/
public class POIFSContainerDetector implements Detector {
- /** Serial version UID */
- private static final long serialVersionUID = -3028021741663605293L;
-
- /** An ASCII String "StarImpress" */
- private static final byte [] STAR_IMPRESS = new byte [] {
- 0x53, 0x74, 0x61, 0x72, 0x49, 0x6d, 0x70, 0x72, 0x65, 0x73, 0x73
- };
-
- /** An ASCII String "StarDraw" */
- private static final byte [] STAR_DRAW = new byte [] {
- 0x53, 0x74, 0x61, 0x72, 0x44, 0x72, 0x61, 0x77
- };
-
- /** An ASCII String "Quill96" for Works Files */
- private static final byte [] WORKS_QUILL96 = new byte[] {
- 0x51, 0x75, 0x69, 0x6c, 0x6c, 0x39, 0x36
- };
-
- /** The OLE base file format */
+ /**
+ * The OLE base file format
+ */
public static final MediaType OLE = application("x-tika-msoffice");
-
- /** The protected OOXML base file format */
+ /**
+ * The protected OOXML base file format
+ */
public static final MediaType OOXML_PROTECTED = application("x-tika-ooxml-protected");
-
- /** General embedded document type within an OLE2 container */
+ /**
+ * General embedded document type within an OLE2 container
+ */
public static final MediaType GENERAL_EMBEDDED = application("x-tika-msoffice-embedded");
-
- /** An OLE10 Native embedded document within another OLE2 document */
+ /**
+ * An OLE10 Native embedded document within another OLE2 document
+ */
public static final MediaType OLE10_NATIVE =
new MediaType(GENERAL_EMBEDDED, "format", "ole10_native");
-
- /** Some other kind of embedded document, in a CompObj container within another OLE2 document */
+ /**
+ * Some other kind of embedded document, in a CompObj container within another OLE2 document
+ */
public static final MediaType COMP_OBJ =
new MediaType(GENERAL_EMBEDDED, "format", "comp_obj");
-
- /** Microsoft Excel */
+ /**
+ * Microsoft Excel
+ */
public static final MediaType XLS = application("vnd.ms-excel");
-
- /** Microsoft Word */
+ /**
+ * Microsoft Word
+ */
public static final MediaType DOC = application("msword");
-
- /** Microsoft PowerPoint */
+ /**
+ * Microsoft PowerPoint
+ */
public static final MediaType PPT = application("vnd.ms-powerpoint");
-
- /** Microsoft Publisher */
+ /**
+ * Microsoft Publisher
+ */
public static final MediaType PUB = application("x-mspublisher");
-
- /** Microsoft Visio */
+ /**
+ * Microsoft Visio
+ */
public static final MediaType VSD = application("vnd.visio");
-
- /** Microsoft Works */
+ /**
+ * Microsoft Works
+ */
public static final MediaType WPS = application("vnd.ms-works");
-
- /** Microsoft Works Spreadsheet 7.0 */
+ /**
+ * Microsoft Works Spreadsheet 7.0
+ */
public static final MediaType XLR = application("x-tika-msworks-spreadsheet");
-
- /** Microsoft Outlook */
+ /**
+ * Microsoft Outlook
+ */
public static final MediaType MSG = application("vnd.ms-outlook");
-
- /** Microsoft Project */
+ /**
+ * Microsoft Project
+ */
public static final MediaType MPP = application("vnd.ms-project");
-
- /** StarOffice Calc */
+ /**
+ * StarOffice Calc
+ */
public static final MediaType SDC = application("vnd.stardivision.calc");
-
- /** StarOffice Draw */
+ /**
+ * StarOffice Draw
+ */
public static final MediaType SDA = application("vnd.stardivision.draw");
-
- /** StarOffice Impress */
+ /**
+ * StarOffice Impress
+ */
public static final MediaType SDD = application("vnd.stardivision.impress");
-
- /** StarOffice Writer */
+ /**
+ * StarOffice Writer
+ */
public static final MediaType SDW = application("vnd.stardivision.writer");
-
- /** SolidWorks CAD file */
+ /**
+ * SolidWorks CAD file
+ */
public static final MediaType SLDWORKS = application("sldworks");
-
- /** Regexp for matching the MPP Project Data stream */
+ /**
+ * Serial version UID
+ */
+ private static final long serialVersionUID = -3028021741663605293L;
+ /**
+ * An ASCII String "StarImpress"
+ */
+ private static final byte[] STAR_IMPRESS = new byte[]{
+ 0x53, 0x74, 0x61, 0x72, 0x49, 0x6d, 0x70, 0x72, 0x65, 0x73, 0x73
+ };
+ /**
+ * An ASCII String "StarDraw"
+ */
+ private static final byte[] STAR_DRAW = new byte[]{
+ 0x53, 0x74, 0x61, 0x72, 0x44, 0x72, 0x61, 0x77
+ };
+ /**
+ * An ASCII String "Quill96" for Works Files
+ */
+ private static final byte[] WORKS_QUILL96 = new byte[]{
+ 0x51, 0x75, 0x69, 0x6c, 0x6c, 0x39, 0x36
+ };
+ /**
+ * Regexp for matching the MPP Project Data stream
+ */
private static final Pattern mppDataMatch = Pattern.compile("\\s\\s\\s\\d+");
- public MediaType detect(InputStream input, Metadata metadata)
- throws IOException {
- // Check if we have access to the document
- if (input == null) {
- return MediaType.OCTET_STREAM;
- }
-
- // If this is a TikaInputStream wrapping an already
- // parsed NPOIFileSystem/DirectoryNode, just get the
- // names from the root:
- TikaInputStream tis = TikaInputStream.cast(input);
- Set<String> names = null;
- if (tis != null) {
- Object container = tis.getOpenContainer();
- if (container instanceof NPOIFSFileSystem) {
- names = getTopLevelNames(((NPOIFSFileSystem) container).getRoot());
- } else if (container instanceof DirectoryNode) {
- names = getTopLevelNames((DirectoryNode) container);
- }
- }
-
- if (names == null) {
- // Check if the document starts with the OLE header
- input.mark(8);
- try {
- if (input.read() != 0xd0 || input.read() != 0xcf
- || input.read() != 0x11 || input.read() != 0xe0
- || input.read() != 0xa1 || input.read() != 0xb1
- || input.read() != 0x1a || input.read() != 0xe1) {
- return MediaType.OCTET_STREAM;
- }
- } finally {
- input.reset();
- }
- }
-
- // We can only detect the exact type when given a TikaInputStream
- if (names == null && tis != null) {
- // Look for known top level entry names to detect the document type
- names = getTopLevelNames(tis);
- }
-
- // Detect based on the names (as available)
- if (tis != null &&
- tis.getOpenContainer() != null &&
- tis.getOpenContainer() instanceof NPOIFSFileSystem) {
- return detect(names, ((NPOIFSFileSystem)tis.getOpenContainer()).getRoot());
- } else {
- return detect(names, null);
- }
- }
-
/**
* Internal detection of the specific kind of OLE2 document, based on the
* names of the top level streams within the file.
- *
+ *
* @deprecated Use {@link #detect(Set, DirectoryEntry)} and pass the root
- * entry of the filesystem whose type is to be detected, as a
- * second argument.
+ * entry of the filesystem whose type is to be detected, as a
+ * second argument.
*/
protected static MediaType detect(Set<String> names) {
return detect(names, null);
}
-
+
/**
* Internal detection of the specific kind of OLE2 document, based on the
* names of the top-level streams within the file. In some cases the
* detection may need access to the root {@link DirectoryEntry} of that file
* for best results. The entry can be given as a second, optional argument.
- *
+ *
* @param names
* @param root
* @return
@@ -227,20 +200,20 @@ public class POIFSContainerDetector impl
// This check has to be before names.contains("Workbook")
// Works 7.0 spreadsheet files contain both
// we want to avoid classifying this as Excel
- return XLR;
+ return XLR;
} else if (names.contains("Workbook") || names.contains("WORKBOOK")) {
return XLS;
} else if (names.contains("Book")) {
- // Excel 95 or older, we won't be able to parse this....
- return XLS;
- } else if (names.contains("EncryptedPackage") &&
+ // Excel 95 or older, we won't be able to parse this....
+ return XLS;
+ } else if (names.contains("EncryptedPackage") &&
names.contains("EncryptionInfo") &&
names.contains("\u0006DataSpaces")) {
// This is a protected OOXML document, which is an OLE2 file
// with an Encrypted Stream which holds the OOXML data
// Without decrypting the stream, we can't tell what kind of
// OOXML file we have. Return a general OOXML Protected type,
- // and hope the name based detection can guess the rest!
+ // and hope the name based detection can guess the rest!
return OOXML_PROTECTED;
} else if (names.contains("EncryptedPackage")) {
return OLE;
@@ -263,33 +236,33 @@ public class POIFSContainerDetector impl
} else if (names.contains("Contents") && names.contains("\u0003ObjInfo")) {
return COMP_OBJ;
} else if (names.contains("CONTENTS") && names.contains("\u0001CompObj")) {
- // CompObj is a general kind of OLE2 embedding, but this may be an old Works file
- // If we have the Directory, check
- if (root != null) {
- MediaType type = processCompObjFormatType(root);
- if (type == WPS) {
- return WPS;
- } else {
- // Assume it's a general CompObj embedded resource
- return COMP_OBJ;
- }
- } else {
- // Assume it's a general CompObj embedded resource
- return COMP_OBJ;
- }
+ // CompObj is a general kind of OLE2 embedding, but this may be an old Works file
+ // If we have the Directory, check
+ if (root != null) {
+ MediaType type = processCompObjFormatType(root);
+ if (type == WPS) {
+ return WPS;
+ } else {
+ // Assume it's a general CompObj embedded resource
+ return COMP_OBJ;
+ }
+ } else {
+ // Assume it's a general CompObj embedded resource
+ return COMP_OBJ;
+ }
} else if (names.contains("CONTENTS")) {
- // CONTENTS without SPELLING nor CompObj normally means some sort
- // of embedded non-office file inside an OLE2 document
- // This is most commonly triggered on nested directories
- return OLE;
+ // CONTENTS without SPELLING nor CompObj normally means some sort
+ // of embedded non-office file inside an OLE2 document
+ // This is most commonly triggered on nested directories
+ return OLE;
} else if (names.contains("\u0001CompObj") &&
- (names.contains("Props") || names.contains("Props9") || names.contains("Props12"))) {
- // Could be Project, look for common name patterns
- for (String name : names) {
- if (mppDataMatch.matcher(name).matches()) {
- return MPP;
- }
- }
+ (names.contains("Props") || names.contains("Props9") || names.contains("Props12"))) {
+ // Could be Project, look for common name patterns
+ for (String name : names) {
+ if (mppDataMatch.matcher(name).matches()) {
+ return MPP;
+ }
+ }
} else if (names.contains("PerfectOffice_MAIN")) {
if (names.contains("SlideShow")) {
return MediaType.application("x-corelpresentations"); // .shw
@@ -313,36 +286,36 @@ public class POIFSContainerDetector impl
/**
* Is this one of the kinds of formats which uses CompObj to
- * store all of their data, eg Star Draw, Star Impress or
- * (older) Works?
+ * store all of their data, eg Star Draw, Star Impress or
+ * (older) Works?
* If not, it's likely an embedded resource
*/
private static MediaType processCompObjFormatType(DirectoryEntry root) {
try {
Entry e = root.getEntry("\u0001CompObj");
if (e != null && e.isDocumentEntry()) {
- DocumentNode dn = (DocumentNode)e;
+ DocumentNode dn = (DocumentNode) e;
DocumentInputStream stream = new DocumentInputStream(dn);
- byte [] bytes = IOUtils.toByteArray(stream);
+ byte[] bytes = IOUtils.toByteArray(stream);
/*
* This array contains a string with a normal ASCII name of the
* application used to create this file. We want to search for that
* name.
*/
- if ( arrayContains(bytes, STAR_DRAW) ) {
+ if (arrayContains(bytes, STAR_DRAW)) {
return SDA;
} else if (arrayContains(bytes, STAR_IMPRESS)) {
return SDD;
} else if (arrayContains(bytes, WORKS_QUILL96)) {
- return WPS;
+ return WPS;
}
- }
+ }
} catch (Exception e) {
/*
* "root.getEntry" can throw FileNotFoundException. The code inside
* "if" can throw IOExceptions. Theoretically. Practically no
* exceptions will likely ever appear.
- *
+ *
* Swallow all of them. If any occur, we just assume that we can't
* distinguish between Draw and Impress and return something safe:
* x-tika-msoffice
@@ -350,10 +323,10 @@ public class POIFSContainerDetector impl
}
return OLE;
}
-
+
// poor man's search for byte arrays, replace with some library call if
// you know one without adding new dependencies
- private static boolean arrayContains(byte [] larger, byte [] smaller) {
+ private static boolean arrayContains(byte[] larger, byte[] smaller) {
int largerCounter = 0;
int smallerCounter = 0;
while (largerCounter < larger.length) {
@@ -365,7 +338,7 @@ public class POIFSContainerDetector impl
}
} else {
largerCounter = largerCounter - smallerCounter + 1;
- smallerCounter=0;
+ smallerCounter = 0;
}
}
return false;
@@ -401,4 +374,56 @@ public class POIFSContainerDetector impl
}
return names;
}
+
+ public MediaType detect(InputStream input, Metadata metadata)
+ throws IOException {
+ // Check if we have access to the document
+ if (input == null) {
+ return MediaType.OCTET_STREAM;
+ }
+
+ // If this is a TikaInputStream wrapping an already
+ // parsed NPOIFileSystem/DirectoryNode, just get the
+ // names from the root:
+ TikaInputStream tis = TikaInputStream.cast(input);
+ Set<String> names = null;
+ if (tis != null) {
+ Object container = tis.getOpenContainer();
+ if (container instanceof NPOIFSFileSystem) {
+ names = getTopLevelNames(((NPOIFSFileSystem) container).getRoot());
+ } else if (container instanceof DirectoryNode) {
+ names = getTopLevelNames((DirectoryNode) container);
+ }
+ }
+
+ if (names == null) {
+ // Check if the document starts with the OLE header
+ input.mark(8);
+ try {
+ if (input.read() != 0xd0 || input.read() != 0xcf
+ || input.read() != 0x11 || input.read() != 0xe0
+ || input.read() != 0xa1 || input.read() != 0xb1
+ || input.read() != 0x1a || input.read() != 0xe1) {
+ return MediaType.OCTET_STREAM;
+ }
+ } finally {
+ input.reset();
+ }
+ }
+
+ // We can only detect the exact type when given a TikaInputStream
+ if (names == null && tis != null) {
+ // Look for known top level entry names to detect the document type
+ names = getTopLevelNames(tis);
+ }
+
+ // Detect based on the names (as available)
+ if (tis != null &&
+ tis.getOpenContainer() != null &&
+ tis.getOpenContainer() instanceof NPOIFSFileSystem) {
+ return detect(names, ((NPOIFSFileSystem) tis.getOpenContainer()).getRoot());
+ } else {
+ return detect(names, null);
+ }
+ }
}
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java Fri May 29 14:36:21 2015
@@ -50,10 +50,10 @@ public class SummaryExtractor {
private static final Log logger = LogFactory.getLog(AbstractPOIFSExtractor.class);
private static final String SUMMARY_INFORMATION =
- SummaryInformation.DEFAULT_STREAM_NAME;
+ SummaryInformation.DEFAULT_STREAM_NAME;
private static final String DOCUMENT_SUMMARY_INFORMATION =
- DocumentSummaryInformation.DEFAULT_STREAM_NAME;
+ DocumentSummaryInformation.DEFAULT_STREAM_NAME;
private final Metadata metadata;
@@ -77,9 +77,9 @@ public class SummaryExtractor {
throws IOException, TikaException {
try {
DocumentEntry entry =
- (DocumentEntry) root.getEntry(entryName);
+ (DocumentEntry) root.getEntry(entryName);
PropertySet properties =
- new PropertySet(new DocumentInputStream(entry));
+ new PropertySet(new DocumentInputStream(entry));
if (properties.isSummaryInformation()) {
parse(new SummaryInformation(properties));
}
@@ -115,7 +115,7 @@ public class SummaryExtractor {
set(TikaCoreProperties.PRINT_DATE, summary.getLastPrinted());
set(Metadata.EDIT_TIME, summary.getEditTime());
set(OfficeOpenXMLExtended.DOC_SECURITY, summary.getSecurity());
-
+
// New style counts
set(Office.WORD_COUNT, summary.getWordCount());
set(Office.CHARACTER_COUNT, summary.getCharCount());
@@ -123,7 +123,7 @@ public class SummaryExtractor {
if (summary.getPageCount() > 0) {
metadata.set(PagedText.N_PAGES, summary.getPageCount());
}
-
+
// Old style, Tika 1.0 properties
// TODO Remove these in Tika 2.0
set(Metadata.TEMPLATE, summary.getTemplate());
@@ -140,7 +140,7 @@ public class SummaryExtractor {
set(OfficeOpenXMLExtended.MANAGER, summary.getManager());
set(TikaCoreProperties.LANGUAGE, getLanguage(summary));
set(OfficeOpenXMLCore.CATEGORY, summary.getCategory());
-
+
// New style counts
set(Office.SLIDE_COUNT, summary.getSlideCount());
if (summary.getSlideCount() > 0) {
@@ -152,7 +152,7 @@ public class SummaryExtractor {
set(Metadata.MANAGER, summary.getManager());
set(MSOffice.SLIDE_COUNT, summary.getSlideCount());
set(Metadata.CATEGORY, summary.getCategory());
-
+
parse(summary.getCustomProperties());
}
@@ -169,6 +169,7 @@ public class SummaryExtractor {
/**
* Attempt to parse custom document properties and add to the collection of metadata
+ *
* @param customProperties
*/
private void parse(CustomProperties customProperties) {
@@ -179,23 +180,23 @@ public class SummaryExtractor {
// Get, convert and save property value
Object value = customProperties.get(name);
- if (value instanceof String){
- set(key, (String)value);
+ if (value instanceof String) {
+ set(key, (String) value);
} else if (value instanceof Date) {
Property prop = Property.externalDate(key);
- metadata.set(prop, (Date)value);
+ metadata.set(prop, (Date) value);
} else if (value instanceof Boolean) {
Property prop = Property.externalBoolean(key);
metadata.set(prop, value.toString());
} else if (value instanceof Long) {
Property prop = Property.externalInteger(key);
- metadata.set(prop, ((Long)value).intValue());
+ metadata.set(prop, ((Long) value).intValue());
} else if (value instanceof Double) {
Property prop = Property.externalReal(key);
- metadata.set(prop, (Double)value);
+ metadata.set(prop, (Double) value);
} else if (value instanceof Integer) {
Property prop = Property.externalInteger(key);
- metadata.set(prop, ((Integer)value).intValue());
+ metadata.set(prop, ((Integer) value).intValue());
}
}
}
@@ -206,7 +207,7 @@ public class SummaryExtractor {
metadata.set(name, value);
}
}
-
+
private void set(Property property, String value) {
if (value != null) {
metadata.set(property, value);
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java Fri May 29 14:36:21 2015
@@ -43,17 +43,17 @@ import org.xml.sax.SAXException;
/**
* A POI-powered Tika Parser for TNEF (Transport Neutral
- * Encoding Format) messages, aka winmail.dat
+ * Encoding Format) messages, aka winmail.dat
*/
public class TNEFParser extends AbstractParser {
- private static final long serialVersionUID = 4611820730372823452L;
-
- private static final Set<MediaType> SUPPORTED_TYPES =
- Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
- MediaType.application("vnd.ms-tnef"),
- MediaType.application("ms-tnef"),
- MediaType.application("x-tnef")
- )));
+ private static final long serialVersionUID = 4611820730372823452L;
+
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+ MediaType.application("vnd.ms-tnef"),
+ MediaType.application("ms-tnef"),
+ MediaType.application("x-tnef")
+ )));
public Set<MediaType> getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
@@ -66,70 +66,70 @@ public class TNEFParser extends Abstract
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
-
- // We work by recursing, so get the appropriate bits
- EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);
- EmbeddedDocumentExtractor embeddedExtractor;
- if (ex==null) {
- embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context);
- } else {
- embeddedExtractor = ex;
- }
-
- // Ask POI to process the file for us
- HMEFMessage msg = new HMEFMessage(stream);
-
- // Set the message subject if known
- String subject = msg.getSubject();
- if(subject != null && subject.length() > 0) {
- // TODO: Move to title in Tika 2.0
- metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_TITLE, subject);
- }
-
- // Recurse into the message body RTF
- MAPIAttribute attr = msg.getMessageMAPIAttribute(MAPIProperty.RTF_COMPRESSED);
- if(attr != null && attr instanceof MAPIRtfAttribute) {
- MAPIRtfAttribute rtf = (MAPIRtfAttribute)attr;
- handleEmbedded(
- "message.rtf", "application/rtf",
- rtf.getData(),
- embeddedExtractor, handler
- );
- }
-
- // Recurse into each attachment in turn
- for(Attachment attachment : msg.getAttachments()) {
- String name = attachment.getLongFilename();
- if(name == null || name.length() == 0) {
- name = attachment.getFilename();
- }
- if(name == null || name.length() == 0) {
- String ext = attachment.getExtension();
- if(ext != null) {
- name = "unknown" + ext;
- }
- }
- handleEmbedded(
- name, null, attachment.getContents(),
- embeddedExtractor, handler
- );
- }
+
+ // We work by recursing, so get the appropriate bits
+ EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);
+ EmbeddedDocumentExtractor embeddedExtractor;
+ if (ex == null) {
+ embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context);
+ } else {
+ embeddedExtractor = ex;
+ }
+
+ // Ask POI to process the file for us
+ HMEFMessage msg = new HMEFMessage(stream);
+
+ // Set the message subject if known
+ String subject = msg.getSubject();
+ if (subject != null && subject.length() > 0) {
+ // TODO: Move to title in Tika 2.0
+ metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_TITLE, subject);
+ }
+
+ // Recurse into the message body RTF
+ MAPIAttribute attr = msg.getMessageMAPIAttribute(MAPIProperty.RTF_COMPRESSED);
+ if (attr != null && attr instanceof MAPIRtfAttribute) {
+ MAPIRtfAttribute rtf = (MAPIRtfAttribute) attr;
+ handleEmbedded(
+ "message.rtf", "application/rtf",
+ rtf.getData(),
+ embeddedExtractor, handler
+ );
+ }
+
+ // Recurse into each attachment in turn
+ for (Attachment attachment : msg.getAttachments()) {
+ String name = attachment.getLongFilename();
+ if (name == null || name.length() == 0) {
+ name = attachment.getFilename();
+ }
+ if (name == null || name.length() == 0) {
+ String ext = attachment.getExtension();
+ if (ext != null) {
+ name = "unknown" + ext;
+ }
+ }
+ handleEmbedded(
+ name, null, attachment.getContents(),
+ embeddedExtractor, handler
+ );
+ }
}
-
+
private void handleEmbedded(String name, String type, byte[] contents,
- EmbeddedDocumentExtractor embeddedExtractor, ContentHandler handler)
- throws IOException, SAXException, TikaException {
- Metadata metadata = new Metadata();
- if(name != null)
- metadata.set(Metadata.RESOURCE_NAME_KEY, name);
- if(type != null)
- metadata.set(Metadata.CONTENT_TYPE, type);
-
- if (embeddedExtractor.shouldParseEmbedded(metadata)) {
- embeddedExtractor.parseEmbedded(
- TikaInputStream.get(contents),
- new EmbeddedContentHandler(handler),
- metadata, false);
- }
+ EmbeddedDocumentExtractor embeddedExtractor, ContentHandler handler)
+ throws IOException, SAXException, TikaException {
+ Metadata metadata = new Metadata();
+ if (name != null)
+ metadata.set(Metadata.RESOURCE_NAME_KEY, name);
+ if (type != null)
+ metadata.set(Metadata.CONTENT_TYPE, type);
+
+ if (embeddedExtractor.shouldParseEmbedded(metadata)) {
+ embeddedExtractor.parseEmbedded(
+ TikaInputStream.get(contents),
+ new EmbeddedContentHandler(handler),
+ metadata, false);
+ }
}
}