You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/01/26 18:25:19 UTC
svn commit: r903329 - in /lucene/tika/trunk:
tika-core/src/main/java/org/apache/tika/metadata/
tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/
tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/
tika-parsers/src/test/resources/test-documents/
Author: jukka
Date: Tue Jan 26 17:25:18 2010
New Revision: 903329
URL: http://svn.apache.org/viewvc?rev=903329&view=rev
Log:
TIKA-364: [PATCH] Metadata mark for xlsx documents with protected sheets
Patch by Maxim Valyanskiy
Added:
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/protected.xlsx (with props)
Modified:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaMetadataKeys.java
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaMetadataKeys.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaMetadataKeys.java?rev=903329&r1=903328&r2=903329&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaMetadataKeys.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaMetadataKeys.java Tue Jan 26 17:25:18 2010
@@ -23,4 +23,5 @@
String RESOURCE_NAME_KEY = "resourceName";
+ String PROTECTED = "protected";
}
Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java?rev=903329&r1=903328&r2=903329&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java Tue Jan 26 17:25:18 2010
@@ -32,6 +32,9 @@
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaMetadataKeys;
+import org.apache.tika.exception.TikaException;
import org.apache.xmlbeans.XmlException;
import org.xml.sax.SAXException;
@@ -42,10 +45,14 @@
*/
private final DataFormatter formatter = new DataFormatter();
+ private final XSSFExcelExtractor extractor;
+ private static final String TYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
public XSSFExcelExtractorDecorator(
XSSFExcelExtractor extractor, Locale locale) {
- super(extractor, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
+ super(extractor, TYPE);
+
+ this.extractor = extractor;
}
/**
@@ -129,4 +136,26 @@
xhtml.element("p", content);
}
}
+
+ @Override
+ public MetadataExtractor getMetadataExtractor() {
+ return new MetadataExtractor(extractor, TYPE) {
+ @Override
+ public void extract(Metadata metadata) throws TikaException {
+ super.extract(metadata);
+
+ metadata.set(TikaMetadataKeys.PROTECTED, "false");
+
+ XSSFWorkbook document = (XSSFWorkbook) extractor.getDocument();
+
+ for (int i = 0; i < document.getNumberOfSheets(); i++) {
+ XSSFSheet sheet = document.getSheetAt(i);
+
+ if (sheet.getProtect()) {
+ metadata.set(TikaMetadataKeys.PROTECTED, "true");
+ }
+ }
+ }
+ };
+ }
}
Modified: lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=903329&r1=903328&r2=903329&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (original)
+++ lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Tue Jan 26 17:25:18 2010
@@ -22,6 +22,7 @@
import junit.framework.TestCase;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
@@ -67,6 +68,7 @@
assertFalse(content.contains("9.0"));
assertTrue(content.contains("196"));
assertFalse(content.contains("196.0"));
+ assertEquals("false", metadata.get(TikaMetadataKeys.PROTECTED));
} finally {
input.close();
}
@@ -190,4 +192,25 @@
}
}
+ public void testProtectedExcel() throws Exception {
+ InputStream input = OOXMLParserTest.class
+ .getResourceAsStream("/test-documents/protected.xlsx");
+
+ Parser parser = new AutoDetectParser();
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+
+ try {
+ parser.parse(input, handler, metadata);
+
+ assertEquals(
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+ metadata.get(Metadata.CONTENT_TYPE));
+
+ assertEquals("true", metadata.get(TikaMetadataKeys.PROTECTED));
+ } finally {
+ input.close();
+ }
+ }
+
}
\ No newline at end of file
Added: lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/protected.xlsx
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/protected.xlsx?rev=903329&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/protected.xlsx
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream