You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@poi.apache.org by vi...@apache.org on 2015/10/19 08:26:58 UTC

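svn commit: r1709361 - in /poi/trunk: src/integrationtest/org/apache/poi/ src/integrationtest/org/apache/poi/stress/ src/ooxml/java/org/apache/poi/extractor/ src/ooxml/java/org/apache/poi/xdgf/extractor/ src/ooxml/java/org/apache/poi/xdgf/usermodel/ src/ooxml/java/org/apache/poi/xdgf/usermodel/shape/ src/ooxml/testcases/org/apache/poi/extractor/ src/ooxml/testcases/org/apache/poi/xdgf/extractor/ test-data/diagram/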

Author: virtuald
Date: Mon Oct 19 06:26:57 2015
New Revision: 1709361

URL: http://svn.apache.org/viewvc?rev=1709361&view=rev
Log:
Add Visio OOXML text extractor + tests

Added:
    poi/trunk/src/ooxml/java/org/apache/poi/xdgf/extractor/
    poi/trunk/src/ooxml/java/org/apache/poi/xdgf/extractor/XDGFVisioExtractor.java
    poi/trunk/src/ooxml/java/org/apache/poi/xdgf/usermodel/shape/ShapeTextVisitor.java
    poi/trunk/src/ooxml/testcases/org/apache/poi/xdgf/extractor/
    poi/trunk/src/ooxml/testcases/org/apache/poi/xdgf/extractor/TestXDGFVisioExtractor.java
    poi/trunk/test-data/diagram/test_text_extraction.vsdx
Modified:
    poi/trunk/src/integrationtest/org/apache/poi/TestAllFiles.java
    poi/trunk/src/integrationtest/org/apache/poi/stress/XDGFFileHandler.java
    poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
    poi/trunk/src/ooxml/java/org/apache/poi/xdgf/usermodel/XDGFDocument.java
    poi/trunk/src/ooxml/java/org/apache/poi/xdgf/usermodel/XmlVisioDocument.java
    poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java

Modified: poi/trunk/src/integrationtest/org/apache/poi/TestAllFiles.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/integrationtest/org/apache/poi/TestAllFiles.java?rev=1709361&r1=1709360&r2=1709361&view=diff
==============================================================================
--- poi/trunk/src/integrationtest/org/apache/poi/TestAllFiles.java (original)
+++ poi/trunk/src/integrationtest/org/apache/poi/TestAllFiles.java Mon Oct 19 06:26:57 2015
@@ -105,7 +105,7 @@ public class TestAllFiles {
         // Visio - binary
         HANDLERS.put(".vsd", new HDGFFileHandler());
         
-        // Visio - ooxml (currently unsupported)
+        // Visio - ooxml
         HANDLERS.put(".vsdm", new XDGFFileHandler());
         HANDLERS.put(".vsdx", new XDGFFileHandler());
         HANDLERS.put(".vssm", new XDGFFileHandler());

Modified: poi/trunk/src/integrationtest/org/apache/poi/stress/XDGFFileHandler.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/integrationtest/org/apache/poi/stress/XDGFFileHandler.java?rev=1709361&r1=1709360&r2=1709361&view=diff
==============================================================================
--- poi/trunk/src/integrationtest/org/apache/poi/stress/XDGFFileHandler.java (original)
+++ poi/trunk/src/integrationtest/org/apache/poi/stress/XDGFFileHandler.java Mon Oct 19 06:26:57 2015
@@ -16,19 +16,11 @@
 ==================================================================== */
 package org.apache.poi.stress;
 
-import java.io.File;
-import java.io.IOException;
 import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.List;
 
-import org.apache.poi.POIXMLDocument;
-import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
 import org.apache.poi.openxml4j.opc.OPCPackage;
 import org.apache.poi.openxml4j.opc.PackageAccess;
-import org.apache.poi.openxml4j.opc.PackagePart;
-import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
-import org.apache.poi.util.PackageHelper;
+import org.apache.poi.xdgf.usermodel.XmlVisioDocument;
 import org.junit.Test;
 
 public class XDGFFileHandler extends AbstractFileHandler {
@@ -37,39 +29,19 @@ public class XDGFFileHandler extends Abs
         // ignore password protected files
         if (POIXMLDocumentHandler.isEncrypted(stream)) return;
 
-        TestXDGFXMLDocument doc = new TestXDGFXMLDocument(stream);
+        XmlVisioDocument doc = new XmlVisioDocument(stream);
         new POIXMLDocumentHandler().handlePOIXMLDocument(doc);
     }
-
-    @Override
-    public void handleExtracting(File file) throws Exception {
-        // TODO: extraction/actual operations not supported yet
-    }
-
+    
     // a test-case to test this locally without executing the full TestAllFiles
     @Test
     public void test() throws Exception {
         OPCPackage pkg = OPCPackage.open("test-data/diagram/test.vsdx", PackageAccess.READ);
         try {
-            TestXDGFXMLDocument doc = new TestXDGFXMLDocument(pkg);
+            XmlVisioDocument doc = new XmlVisioDocument(pkg);
             new POIXMLDocumentHandler().handlePOIXMLDocument(doc);
         } finally {
             pkg.close();
         }
     }
-
-    // TODO: Get rid of this when full visio ooxml support is added
-    private final static class TestXDGFXMLDocument extends POIXMLDocument {
-        public TestXDGFXMLDocument(OPCPackage pkg) {
-            super(pkg, PackageRelationshipTypes.VISIO_CORE_DOCUMENT);
-        }
-
-        public TestXDGFXMLDocument(InputStream is) throws IOException {
-            this(PackageHelper.open(is));
-        }
-
-        public List<PackagePart> getAllEmbedds() throws OpenXML4JException {
-            return new ArrayList<PackagePart>();
-        }
-    }
 }
\ No newline at end of file

Modified: poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java?rev=1709361&r1=1709360&r2=1709361&view=diff
==============================================================================
--- poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java (original)
+++ poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java Mon Oct 19 06:26:57 2015
@@ -55,6 +55,7 @@ import org.apache.poi.poifs.filesystem.N
 import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
 import org.apache.poi.poifs.filesystem.OfficeXmlFileException;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.xdgf.extractor.XDGFVisioExtractor;
 import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
 import org.apache.poi.xslf.usermodel.XSLFRelation;
 import org.apache.poi.xslf.usermodel.XSLFSlideShow;
@@ -172,11 +173,9 @@ public class ExtractorFactory {
        }
        if (core.size() == 0) {
            // Could it be a visio one?
-           PackageRelationshipCollection visio =
-                   pkg.getRelationshipsByType(VISIO_DOCUMENT_REL);
-           if (visio.size() == 1) {
-               throw new IllegalArgumentException("Text extraction not supported for Visio OOXML files");
-           }
+           core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL);
+           if (core.size() == 1)
+               return new XDGFVisioExtractor(pkg);
        }
        
        // Should just be a single core document, complain if not

Added: poi/trunk/src/ooxml/java/org/apache/poi/xdgf/extractor/XDGFVisioExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/ooxml/java/org/apache/poi/xdgf/extractor/XDGFVisioExtractor.java?rev=1709361&view=auto
==============================================================================
--- poi/trunk/src/ooxml/java/org/apache/poi/xdgf/extractor/XDGFVisioExtractor.java (added)
+++ poi/trunk/src/ooxml/java/org/apache/poi/xdgf/extractor/XDGFVisioExtractor.java Mon Oct 19 06:26:57 2015
@@ -0,0 +1,51 @@
+package org.apache.poi.xdgf.extractor;
+
+import java.io.IOException;
+
+import org.apache.poi.POIXMLDocument;
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.poi.xdgf.usermodel.XDGFPage;
+import org.apache.poi.xdgf.usermodel.XmlVisioDocument;
+import org.apache.poi.xdgf.usermodel.shape.ShapeTextVisitor;
+
+/**
+ * Helper class to extract text from an OOXML Visio File
+ */
+public class XDGFVisioExtractor extends POIXMLTextExtractor {
+
+    protected final XmlVisioDocument document;
+    
+    public XDGFVisioExtractor(XmlVisioDocument document) {
+        super(document);
+        this.document = document;
+    }
+
+    public XDGFVisioExtractor(OPCPackage openPackage) throws IOException {
+        this(new XmlVisioDocument(openPackage));
+    }
+
+    public String getText() {
+        ShapeTextVisitor visitor = new ShapeTextVisitor();
+        
+        for (XDGFPage page: document.getPages()) {
+            page.getContent().visitShapes(visitor);
+        }
+        
+        return visitor.getText().toString();
+    }
+    
+    public static void main(String [] args) throws IOException {
+        if (args.length < 1) {
+            System.err.println("Use:");
+            System.err.println("  XDGFVisioExtractor <filename.vsdx>");
+            System.exit(1);
+        }
+        POIXMLTextExtractor extractor =
+                new XDGFVisioExtractor(POIXMLDocument.openPackage(
+                        args[0]
+                ));
+        System.out.println(extractor.getText());
+        extractor.close();
+    }
+}

Modified: poi/trunk/src/ooxml/java/org/apache/poi/xdgf/usermodel/XDGFDocument.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/ooxml/java/org/apache/poi/xdgf/usermodel/XDGFDocument.java?rev=1709361&r1=1709360&r2=1709361&view=diff
==============================================================================
--- poi/trunk/src/ooxml/java/org/apache/poi/xdgf/usermodel/XDGFDocument.java (original)
+++ poi/trunk/src/ooxml/java/org/apache/poi/xdgf/usermodel/XDGFDocument.java Mon Oct 19 06:26:57 2015
@@ -29,6 +29,9 @@ import com.microsoft.schemas.office.visi
 
 /**
  * Represents the root document: /visio/document.xml
+ * 
+ * You are probably looking for {@link XmlVisioDocument} instead; this
+ * class only contains metadata about the root document in the OOXML package.
  */
 public class XDGFDocument {
 

Modified: poi/trunk/src/ooxml/java/org/apache/poi/xdgf/usermodel/XmlVisioDocument.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/ooxml/java/org/apache/poi/xdgf/usermodel/XmlVisioDocument.java?rev=1709361&r1=1709360&r2=1709361&view=diff
==============================================================================
--- poi/trunk/src/ooxml/java/org/apache/poi/xdgf/usermodel/XmlVisioDocument.java (original)
+++ poi/trunk/src/ooxml/java/org/apache/poi/xdgf/usermodel/XmlVisioDocument.java Mon Oct 19 06:26:57 2015
@@ -19,6 +19,7 @@ package org.apache.poi.xdgf.usermodel;
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.ArrayList;
 import java.util.Collection;
 import java.util.List;
 
@@ -107,15 +108,21 @@ public class XmlVisioDocument extends PO
         _pages.onDocumentRead();
     }
 
+    /**
+     * Not currently implemented
+     */
     @Override
     public List<PackagePart> getAllEmbedds() throws OpenXML4JException {
-        throw new UnsupportedOperationException("Not implemented");
+        return new ArrayList<PackagePart>();
     }
 
     //
     // Useful public API goes here
     //
     
+    /**
+     * @return pages ordered by page number
+     */
     public Collection<XDGFPage> getPages() {
         return _pages.getPageList();
     }

Added: poi/trunk/src/ooxml/java/org/apache/poi/xdgf/usermodel/shape/ShapeTextVisitor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/ooxml/java/org/apache/poi/xdgf/usermodel/shape/ShapeTextVisitor.java?rev=1709361&view=auto
==============================================================================
--- poi/trunk/src/ooxml/java/org/apache/poi/xdgf/usermodel/shape/ShapeTextVisitor.java (added)
+++ poi/trunk/src/ooxml/java/org/apache/poi/xdgf/usermodel/shape/ShapeTextVisitor.java Mon Oct 19 06:26:57 2015
@@ -0,0 +1,41 @@
+package org.apache.poi.xdgf.usermodel.shape;
+
+import java.awt.geom.AffineTransform;
+
+import org.apache.poi.xdgf.usermodel.XDGFShape;
+
+/**
+ * Only visits text nodes, accumulates text content into a string
+ * 
+ * The text is returned in arbitrary order, without regard to
+ * the location of the text on the page. This may change in the
+ * future.
+ */
+public class ShapeTextVisitor extends ShapeVisitor {
+
+    protected StringBuilder text = new StringBuilder();
+    
+    public static class TextAcceptor implements ShapeVisitorAcceptor {
+        public boolean accept(XDGFShape shape) {
+            return shape.hasText();
+        }
+    }
+    
+    protected ShapeVisitorAcceptor getAcceptor() {
+        return new TextAcceptor();
+    }
+
+    public void visit(XDGFShape shape, AffineTransform globalTransform,
+            int level) {
+        text.append(shape.getText().getTextContent().trim());
+        text.append('\n');
+    }
+
+    /**
+     * Call this after visitation has completed
+     */
+    public String getText() {
+        return text.toString();
+    }
+
+}

Modified: poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java?rev=1709361&r1=1709360&r2=1709361&view=diff
==============================================================================
--- poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java (original)
+++ poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java Mon Oct 19 06:26:57 2015
@@ -44,6 +44,7 @@ import org.apache.poi.openxml4j.exceptio
 import org.apache.poi.openxml4j.opc.OPCPackage;
 import org.apache.poi.openxml4j.opc.PackageAccess;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.xdgf.extractor.XDGFVisioExtractor;
 import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
 import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
 import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
@@ -271,12 +272,13 @@ public class TestExtractorFactory {
                 ExtractorFactory.createExtractor(vsd).getText().length() > 50
         );
         // Visio - vsdx
-        try {
-            ExtractorFactory.createExtractor(vsdx);
-            fail();
-        } catch(IllegalArgumentException e) {
-            // Good
-        }
+        assertTrue(
+                ExtractorFactory.createExtractor(vsdx)
+                instanceof XDGFVisioExtractor
+        );
+        assertTrue(
+                ExtractorFactory.createExtractor(vsdx).getText().length() > 20
+        );
 
         // Publisher
         assertTrue(
@@ -391,13 +393,15 @@ public class TestExtractorFactory {
                 ExtractorFactory.createExtractor(new FileInputStream(vsd)).getText().length() > 50
         );
         // Visio - vsdx
-        try {
-            ExtractorFactory.createExtractor(new FileInputStream(vsdx));
-            fail();
-        } catch(IllegalArgumentException e) {
-            // Good
-        }
+        assertTrue(
+                ExtractorFactory.createExtractor(new FileInputStream(vsdx))
+                instanceof XDGFVisioExtractor
+        );
+        assertTrue(
+                ExtractorFactory.createExtractor(new FileInputStream(vsdx)).getText().length() > 20
+        );
 
+        
         // Publisher
         assertTrue(
                 ExtractorFactory.createExtractor(new FileInputStream(pub))
@@ -551,6 +555,15 @@ public class TestExtractorFactory {
                 extractor.getText().length() > 120
         );
         extractor.close();
+        
+        // Visio
+        assertTrue(
+                ExtractorFactory.createExtractor(OPCPackage.open(vsdx.toString()))
+                instanceof XDGFVisioExtractor
+        );
+        assertTrue(
+                extractor.getText().length() > 20
+        );
 
         // Text
         try {

Added: poi/trunk/src/ooxml/testcases/org/apache/poi/xdgf/extractor/TestXDGFVisioExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/ooxml/testcases/org/apache/poi/xdgf/extractor/TestXDGFVisioExtractor.java?rev=1709361&view=auto
==============================================================================
--- poi/trunk/src/ooxml/testcases/org/apache/poi/xdgf/extractor/TestXDGFVisioExtractor.java (added)
+++ poi/trunk/src/ooxml/testcases/org/apache/poi/xdgf/extractor/TestXDGFVisioExtractor.java Mon Oct 19 06:26:57 2015
@@ -0,0 +1,39 @@
+package org.apache.poi.xdgf.extractor;
+
+import java.io.IOException;
+
+import org.apache.poi.POIDataSamples;
+import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.poi.xdgf.usermodel.XmlVisioDocument;
+
+import junit.framework.TestCase;
+
+public class TestXDGFVisioExtractor extends TestCase {
+
+    private POIDataSamples diagrams;
+    private OPCPackage pkg;
+    private XmlVisioDocument xml;
+
+    protected void setUp() throws Exception {
+        diagrams = POIDataSamples.getDiagramInstance();
+        
+        pkg = OPCPackage.open(diagrams.openResourceAsStream("test_text_extraction.vsdx"));
+        xml = new XmlVisioDocument(pkg);
+    }
+
+    public void testGetSimpleText() throws IOException {
+        new XDGFVisioExtractor(xml).close();
+        new XDGFVisioExtractor(pkg).close();
+        
+        XDGFVisioExtractor extractor = new XDGFVisioExtractor(xml);
+        extractor.getText();
+        
+        String text = extractor.getText();
+        assertTrue(text.length() > 0);
+        
+        assertEquals("Text here\nText there\nText, text, everywhere!\nRouter here\n",
+                     text);
+        
+        extractor.close();
+    }
+}

Added: poi/trunk/test-data/diagram/test_text_extraction.vsdx
URL: http://svn.apache.org/viewvc/poi/trunk/test-data/diagram/test_text_extraction.vsdx?rev=1709361&view=auto
==============================================================================
Binary files poi/trunk/test-data/diagram/test_text_extraction.vsdx (added) and poi/trunk/test-data/diagram/test_text_extraction.vsdx Mon Oct 19 06:26:57 2015 differ



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@poi.apache.org
For additional commands, e-mail: commits-help@poi.apache.org