You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@poi.apache.org by ta...@apache.org on 2017/08/30 16:29:53 UTC
svn commit: r1806712 - in /poi: site/src/documentation/content/xdocs/ trunk/src/ooxml/java/org/apache/poi/xwpf/extractor/ trunk/src/ooxml/java/org/apache/poi/xwpf/usermodel/ trunk/src/ooxml/testcases/org/apache/poi/xwpf/extractor/ trunk/test-data/docum...

Author: tallison
Date: Wed Aug 30 16:29:52 2017
New Revision: 1806712

URL: http://svn.apache.org/viewvc?rev=1806712&view=rev
Log:
61470 -- add extraction of content within ruby (phonetic guide) elements; allow users to choose whether phonetic strings are concatenated onto the base text.  Default is to concatenate.

Added:
    poi/trunk/test-data/document/61470.docx   (with props)
Modified:
    poi/site/src/documentation/content/xdocs/status.xml
    poi/trunk/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java
    poi/trunk/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFRun.java
    poi/trunk/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java

Modified: poi/site/src/documentation/content/xdocs/status.xml
URL: http://svn.apache.org/viewvc/poi/site/src/documentation/content/xdocs/status.xml?rev=1806712&r1=1806711&r2=1806712&view=diff
==============================================================================
--- poi/site/src/documentation/content/xdocs/status.xml (original)
+++ poi/site/src/documentation/content/xdocs/status.xml Wed Aug 30 16:29:52 2017
@@ -56,8 +56,11 @@
           when referring to both H??F and X??F formats.
     -->
 
-    <!-- release version="3.18-beta1" date="2017-11-??">
-    </release -->
+    <release version="3.18-beta1" date="2017-11-??">
+      <actions>
+        <action dev="PD" type="fix" fixes-bug="61470" module="XWPF">Handle ruby (phonetic) elements in XWPFRun</action>
+      </actions>
+    </release>
 
     <release version="3.17" date="2017-08-27">
       <summary>

Modified: poi/trunk/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java?rev=1806712&r1=1806711&r2=1806712&view=diff
==============================================================================
--- poi/trunk/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java (original)
+++ poi/trunk/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java Wed Aug 30 16:29:52 2017
@@ -33,6 +33,7 @@ import org.apache.poi.xwpf.usermodel.XWP
 import org.apache.poi.xwpf.usermodel.XWPFHyperlinkRun;
 import org.apache.poi.xwpf.usermodel.XWPFParagraph;
 import org.apache.poi.xwpf.usermodel.XWPFRelation;
+import org.apache.poi.xwpf.usermodel.XWPFRun;
 import org.apache.poi.xwpf.usermodel.XWPFSDT;
 import org.apache.poi.xwpf.usermodel.XWPFSDTCell;
 import org.apache.poi.xwpf.usermodel.XWPFTable;
@@ -53,6 +54,7 @@ public class XWPFWordExtractor extends P
 
     private XWPFDocument document;
     private boolean fetchHyperlinks = false;
+    private boolean concatenatePhoneticRuns = true;
 
     public XWPFWordExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException {
         this(new XWPFDocument(container));
@@ -86,6 +88,14 @@ public class XWPFWordExtractor extends P
         fetchHyperlinks = fetch;
     }
 
+    /**
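+     * Sets whether phonetic (ruby) strings are concatenated onto the base text of each run during extraction.  Default is <code>true</code>
+     * @param concatenatePhoneticRuns <code>true</code> to append each run's phonetic string after its text, <code>false</code> to extract the run text only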
+     */
+    public void setConcatenatePhoneticRuns(boolean concatenatePhoneticRuns) {
+        this.concatenatePhoneticRuns = concatenatePhoneticRuns;
+    }
+
     public String getText() {
         StringBuffer text = new StringBuffer();
         XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy();
@@ -130,7 +140,11 @@ public class XWPFWordExtractor extends P
 
 
         for (IRunElement run : paragraph.getRuns()) {
-            text.append(run);
+            if (! concatenatePhoneticRuns && run instanceof XWPFRun) {
+                text.append(((XWPFRun)run).text());
+            } else {
+                text.append(run);
+            }
             if (run instanceof XWPFHyperlinkRun && fetchHyperlinks) {
                 XWPFHyperlink link = ((XWPFHyperlinkRun) run).getHyperlink(document);
                 if (link != null)

Modified: poi/trunk/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFRun.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFRun.java?rev=1806712&r1=1806711&r2=1806712&view=diff
==============================================================================
--- poi/trunk/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFRun.java (original)
+++ poi/trunk/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFRun.java Wed Aug 30 16:29:52 2017
@@ -68,6 +68,8 @@ import org.openxmlformats.schemas.wordpr
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPTab;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRPr;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRuby;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRubyContent;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSignedHpsMeasure;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSignedTwipsMeasure;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;
@@ -1042,10 +1044,15 @@ public class XWPFRun implements ISDTCont
     }
 
     /**
-     * Returns the string version of the text
+     * Returns the string version of the text and the phonetic string
      */
     public String toString() {
-        return text();
+        String phonetic = getPhonetic();
+        if (phonetic.length() > 0) {
+            return text() +" ("+phonetic.toString()+")";
+        } else {
+            return text();
+        }
     }
 
     /**
@@ -1061,71 +1068,139 @@ public class XWPFRun implements ISDTCont
         c.selectPath("./*");
         while (c.toNextSelection()) {
             XmlObject o = c.getObject();
-            if (o instanceof CTText) {
+            if (o instanceof CTRuby) {
+                handleRuby(o, text, false);
+                continue;
+            }
+            _getText(o, text);
+        }
+        c.dispose();
+        return text.toString();
+
+    }
+
+    /**
+     *
+     * @return the phonetic (ruby) string associated with this run or an empty String if none exists
+     */
+    public String getPhonetic() {
+        StringBuffer text = new StringBuffer();
+
+        // Grab only the phonetic (rt) content of any ruby elements in the run
+        // Do so in a way that preserves the ordering
+        XmlCursor c = run.newCursor();
+        c.selectPath("./*");
+        while (c.toNextSelection()) {
+            XmlObject o = c.getObject();
+            if (o instanceof CTRuby) {
+                handleRuby(o, text, true);
+            }
+        }
+        c.dispose();
+        return text.toString();
+    }
+
+    /**
+     *
+     * @param rubyObj rubyobject
+     * @param text buffer to which to append the content
+     * @param extractPhonetic extract the phonetic (rt) component or the base component
+     */
+    private void handleRuby(XmlObject rubyObj, StringBuffer text, boolean extractPhonetic) {
+        XmlCursor c = rubyObj.newCursor();
+
+        //according to the spec, a ruby object
+        //has the phonetic (rt) first, then the actual text (base)
+        //second.
+
+        c.selectPath(".//*");
+        boolean inRT = false;
+        boolean inBase = false;
+        while (c.toNextSelection()) {
+            XmlObject o = c.getObject();
+            if (o instanceof CTRubyContent) {
                 String tagName = o.getDomNode().getNodeName();
-                // Field Codes (w:instrText, defined in spec sec. 17.16.23)
-                //  come up as instances of CTText, but we don't want them
-                //  in the normal text output
-                if (!"w:instrText".equals(tagName)) {
-                    text.append(((CTText) o).getStringValue());
+                if ("w:rt".equals(tagName)) {
+                    inRT = true;
+                } else if ("w:rubyBase".equals(tagName)) {
+                    inRT = false;
+                    inBase = true;
+                }
+            } else {
+                if (extractPhonetic && inRT) {
+                    _getText(o, text);
+                } else if (! extractPhonetic && inBase) {
+                    _getText(o, text);
                 }
             }
+        }
+        c.dispose();
+    }
+
+    private void _getText(XmlObject o, StringBuffer text) {
+
+        if (o instanceof CTText) {
+            String tagName = o.getDomNode().getNodeName();
+            // Field Codes (w:instrText, defined in spec sec. 17.16.23)
+            //  come up as instances of CTText, but we don't want them
+            //  in the normal text output
+            if (!"w:instrText".equals(tagName)) {
+                text.append(((CTText) o).getStringValue());
+            }
+        }
 
-            // Complex type evaluation (currently only for extraction of check boxes)
-            if (o instanceof CTFldChar) {
-                CTFldChar ctfldChar = ((CTFldChar) o);
-                if (ctfldChar.getFldCharType() == STFldCharType.BEGIN) {
-                    if (ctfldChar.getFfData() != null) {
-                        for (CTFFCheckBox checkBox : ctfldChar.getFfData().getCheckBoxList()) {
-                            if (checkBox.getDefault() != null && checkBox.getDefault().getVal() == STOnOff.X_1) {
-                                text.append("|X|");
-                            } else {
-                                text.append("|_|");
-                            }
+        // Complex type evaluation (currently only for extraction of check boxes)
+        if (o instanceof CTFldChar) {
+            CTFldChar ctfldChar = ((CTFldChar) o);
+            if (ctfldChar.getFldCharType() == STFldCharType.BEGIN) {
+                if (ctfldChar.getFfData() != null) {
+                    for (CTFFCheckBox checkBox : ctfldChar.getFfData().getCheckBoxList()) {
+                        if (checkBox.getDefault() != null && checkBox.getDefault().getVal() == STOnOff.X_1) {
+                            text.append("|X|");
+                        } else {
+                            text.append("|_|");
                         }
                     }
                 }
             }
+        }
 
-            if (o instanceof CTPTab) {
+        if (o instanceof CTPTab) {
+            text.append("\t");
+        }
+        if (o instanceof CTBr) {
+            text.append("\n");
+        }
+        if (o instanceof CTEmpty) {
+            // Some inline text elements get returned not as
+            //  themselves, but as CTEmpty, owing to some odd
+            //  definitions around line 5642 of the XSDs
+            // This bit works around it, and replicates the above
+            //  rules for that case
+            String tagName = o.getDomNode().getNodeName();
+            if ("w:tab".equals(tagName) || "tab".equals(tagName)) {
                 text.append("\t");
             }
-            if (o instanceof CTBr) {
+            if ("w:br".equals(tagName) || "br".equals(tagName)) {
                 text.append("\n");
             }
-            if (o instanceof CTEmpty) {
-                // Some inline text elements get returned not as
-                //  themselves, but as CTEmpty, owing to some odd
-                //  definitions around line 5642 of the XSDs
-                // This bit works around it, and replicates the above
-                //  rules for that case
-                String tagName = o.getDomNode().getNodeName();
-                if ("w:tab".equals(tagName) || "tab".equals(tagName)) {
-                    text.append("\t");
-                }
-                if ("w:br".equals(tagName) || "br".equals(tagName)) {
-                    text.append("\n");
-                }
-                if ("w:cr".equals(tagName) || "cr".equals(tagName)) {
-                    text.append("\n");
-                }
-            }
-            if (o instanceof CTFtnEdnRef) {
-                CTFtnEdnRef ftn = (CTFtnEdnRef) o;
-                String footnoteRef = ftn.getDomNode().getLocalName().equals("footnoteReference") ?
-                        "[footnoteRef:" + ftn.getId().intValue() + "]" : "[endnoteRef:" + ftn.getId().intValue() + "]";
-                text.append(footnoteRef);
+            if ("w:cr".equals(tagName) || "cr".equals(tagName)) {
+                text.append("\n");
             }
         }
+        if (o instanceof CTFtnEdnRef) {
+            CTFtnEdnRef ftn = (CTFtnEdnRef) o;
+            String footnoteRef = ftn.getDomNode().getLocalName().equals("footnoteReference") ?
+                    "[footnoteRef:" + ftn.getId().intValue() + "]" : "[endnoteRef:" + ftn.getId().intValue() + "]";
+            text.append(footnoteRef);
+        }
 
-        c.dispose();
 
         // Any picture text?
         if (pictureText != null && pictureText.length() > 0) {
             text.append("\n").append(pictureText);
         }
 
-        return text.toString();
     }
 
     /**

Modified: poi/trunk/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java?rev=1806712&r1=1806711&r2=1806712&view=diff
==============================================================================
--- poi/trunk/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java (original)
+++ poi/trunk/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java Wed Aug 30 16:29:52 2017
@@ -421,4 +421,16 @@ public class TestXWPFWordExtractor exten
                 extractor.getText());
         extractor.close();
     }
+
+    public void testPhonetic() throws IOException {
+        XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("61470.docx");
+        XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
+        //expect: baseText (phoneticText)
+        assertEquals("\u6771\u4EAC (\u3068\u3046\u304D\u3087\u3046)", extractor.getText().trim());
+        extractor.close();
+        extractor = new XWPFWordExtractor(doc);
+        extractor.setConcatenatePhoneticRuns(false);
+        assertEquals("\u6771\u4EAC", extractor.getText().trim());
+    }
+
 }

Added: poi/trunk/test-data/document/61470.docx
URL: http://svn.apache.org/viewvc/poi/trunk/test-data/document/61470.docx?rev=1806712&view=auto
==============================================================================
Binary file - no diff available.

Propchange: poi/trunk/test-data/document/61470.docx
------------------------------------------------------------------------------
--- svn:mime-type (added)
+++ svn:mime-type Wed Aug 30 16:29:52 2017
@@ -0,0 +1 @@
+application/vnd.openxmlformats-officedocument.wordprocessingml.document



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@poi.apache.org
For additional commands, e-mail: commits-help@poi.apache.org