You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/01/26 18:25:19 UTC

svn commit: r903329 - in /lucene/tika/trunk: tika-core/src/main/java/org/apache/tika/metadata/ tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/ tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/ tika-parsers/src/test/resources/test-documents/

Author: jukka
Date: Tue Jan 26 17:25:18 2010
New Revision: 903329

URL: http://svn.apache.org/viewvc?rev=903329&view=rev
Log:
TIKA-364: [PATCH] Metadata mark for xlsx documents with protected sheets

Patch by Maxim Valyanskiy

Added:
    lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/protected.xlsx   (with props)
Modified:
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaMetadataKeys.java
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
    lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java

Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaMetadataKeys.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaMetadataKeys.java?rev=903329&r1=903328&r2=903329&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaMetadataKeys.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaMetadataKeys.java Tue Jan 26 17:25:18 2010
@@ -23,4 +23,5 @@
 
     String RESOURCE_NAME_KEY = "resourceName";
 
+    String PROTECTED = "protected";
 }

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java?rev=903329&r1=903328&r2=903329&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java Tue Jan 26 17:25:18 2010
@@ -32,6 +32,9 @@
 import org.apache.poi.xssf.usermodel.XSSFSheet;
 import org.apache.poi.xssf.usermodel.XSSFWorkbook;
 import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaMetadataKeys;
+import org.apache.tika.exception.TikaException;
 import org.apache.xmlbeans.XmlException;
 import org.xml.sax.SAXException;
 
@@ -42,10 +45,14 @@
      */
 	private final DataFormatter formatter = new DataFormatter();
 
+    private final XSSFExcelExtractor extractor;
+    private static final String TYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
 
     public XSSFExcelExtractorDecorator(
             XSSFExcelExtractor extractor, Locale locale) {
-        super(extractor, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
+        super(extractor, TYPE);
+
+        this.extractor = extractor;
     }
 
     /**
@@ -129,4 +136,26 @@
             xhtml.element("p", content);
         }
     }
+
+    @Override
+    public MetadataExtractor getMetadataExtractor() {
+        return new MetadataExtractor(extractor, TYPE) {
+            @Override
+            public void extract(Metadata metadata) throws TikaException {
+                super.extract(metadata);
+
+                metadata.set(TikaMetadataKeys.PROTECTED, "false");
+
+                XSSFWorkbook document = (XSSFWorkbook) extractor.getDocument();
+
+                for (int i = 0; i < document.getNumberOfSheets(); i++) {
+                    XSSFSheet sheet = document.getSheetAt(i);
+
+                    if (sheet.getProtect()) {
+                        metadata.set(TikaMetadataKeys.PROTECTED, "true");
+                    }
+                }
+            }
+        };
+    }
 }

Modified: lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=903329&r1=903328&r2=903329&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (original)
+++ lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Tue Jan 26 17:25:18 2010
@@ -22,6 +22,7 @@
 import junit.framework.TestCase;
 
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaMetadataKeys;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.BodyContentHandler;
 import org.xml.sax.ContentHandler;
@@ -67,6 +68,7 @@
             assertFalse(content.contains("9.0"));
             assertTrue(content.contains("196"));
             assertFalse(content.contains("196.0"));
+            assertEquals("false", metadata.get(TikaMetadataKeys.PROTECTED));
         } finally {
             input.close();
         }
@@ -190,4 +192,25 @@
         }
     }
 
+    public void testProtectedExcel() throws Exception {
+        InputStream input = OOXMLParserTest.class
+                .getResourceAsStream("/test-documents/protected.xlsx");
+
+        Parser parser = new AutoDetectParser();
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+
+        try {
+            parser.parse(input, handler, metadata);
+
+            assertEquals(
+                    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+                    metadata.get(Metadata.CONTENT_TYPE));
+
+            assertEquals("true", metadata.get(TikaMetadataKeys.PROTECTED));
+        } finally {
+            input.close();
+        }
+    }
+
 }
\ No newline at end of file

Added: lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/protected.xlsx
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/protected.xlsx?rev=903329&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/protected.xlsx
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream