You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@poi.apache.org by ta...@apache.org on 2017/08/30 16:29:53 UTC
svn commit: r1806712 - in /poi: site/src/documentation/content/xdocs/
trunk/src/ooxml/java/org/apache/poi/xwpf/extractor/
trunk/src/ooxml/java/org/apache/poi/xwpf/usermodel/
trunk/src/ooxml/testcases/org/apache/poi/xwpf/extractor/
trunk/test-data/docum...
Author: tallison
Date: Wed Aug 30 16:29:52 2017
New Revision: 1806712
URL: http://svn.apache.org/viewvc?rev=1806712&view=rev
Log:
61470 -- add extraction of content within ruby elements; allow users to concatenate or not concatenate phonetic strings. Default is to concatenate.
Added:
poi/trunk/test-data/document/61470.docx (with props)
Modified:
poi/site/src/documentation/content/xdocs/status.xml
poi/trunk/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java
poi/trunk/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFRun.java
poi/trunk/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java
Modified: poi/site/src/documentation/content/xdocs/status.xml
URL: http://svn.apache.org/viewvc/poi/site/src/documentation/content/xdocs/status.xml?rev=1806712&r1=1806711&r2=1806712&view=diff
==============================================================================
--- poi/site/src/documentation/content/xdocs/status.xml (original)
+++ poi/site/src/documentation/content/xdocs/status.xml Wed Aug 30 16:29:52 2017
@@ -56,8 +56,11 @@
when referring to both H??F and X??F formats.
-->
- <!-- release version="3.18-beta1" date="2017-11-??">
- </release -->
+ <release version="3.18-beta1" date="2017-11-??">
+ <actions>
+ <action dev="PD" type="fix" fixes-bug="61470" module="XWPF">Handle ruby (phonetic) elements in XWPFRun</action>
+ </actions>
+ </release>
<release version="3.17" date="2017-08-27">
<summary>
Modified: poi/trunk/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java?rev=1806712&r1=1806711&r2=1806712&view=diff
==============================================================================
--- poi/trunk/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java (original)
+++ poi/trunk/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java Wed Aug 30 16:29:52 2017
@@ -33,6 +33,7 @@ import org.apache.poi.xwpf.usermodel.XWP
import org.apache.poi.xwpf.usermodel.XWPFHyperlinkRun;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFRelation;
+import org.apache.poi.xwpf.usermodel.XWPFRun;
import org.apache.poi.xwpf.usermodel.XWPFSDT;
import org.apache.poi.xwpf.usermodel.XWPFSDTCell;
import org.apache.poi.xwpf.usermodel.XWPFTable;
@@ -53,6 +54,7 @@ public class XWPFWordExtractor extends P
private XWPFDocument document;
private boolean fetchHyperlinks = false;
+ private boolean concatenatePhoneticRuns = true;
public XWPFWordExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException {
this(new XWPFDocument(container));
@@ -86,6 +88,14 @@ public class XWPFWordExtractor extends P
fetchHyperlinks = fetch;
}
+ /**
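+ * Sets whether phonetic (ruby) text is appended to the base text during extraction. Default is <code>true</code>.
+ * @param concatenatePhoneticRuns <code>true</code> to append the phonetic text in parentheses after the base text, <code>false</code> to extract the base text only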
+ */
+ public void setConcatenatePhoneticRuns(boolean concatenatePhoneticRuns) {
+ this.concatenatePhoneticRuns = concatenatePhoneticRuns;
+ }
+
public String getText() {
StringBuffer text = new StringBuffer();
XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy();
@@ -130,7 +140,11 @@ public class XWPFWordExtractor extends P
for (IRunElement run : paragraph.getRuns()) {
- text.append(run);
+ if (! concatenatePhoneticRuns && run instanceof XWPFRun) {
+ text.append(((XWPFRun)run).text());
+ } else {
+ text.append(run);
+ }
if (run instanceof XWPFHyperlinkRun && fetchHyperlinks) {
XWPFHyperlink link = ((XWPFHyperlinkRun) run).getHyperlink(document);
if (link != null)
Modified: poi/trunk/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFRun.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFRun.java?rev=1806712&r1=1806711&r2=1806712&view=diff
==============================================================================
--- poi/trunk/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFRun.java (original)
+++ poi/trunk/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFRun.java Wed Aug 30 16:29:52 2017
@@ -68,6 +68,8 @@ import org.openxmlformats.schemas.wordpr
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPTab;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRPr;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRuby;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRubyContent;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSignedHpsMeasure;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSignedTwipsMeasure;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;
@@ -1042,10 +1044,15 @@ public class XWPFRun implements ISDTCont
}
/**
- * Returns the string version of the text
+ * Returns the text of this run, followed by its phonetic string in the form "text (phonetic)" when a phonetic string exists; otherwise the text alone
*/
public String toString() {
- return text();
+ String phonetic = getPhonetic();
+ if (phonetic.length() > 0) {
+ return text() +" ("+phonetic.toString()+")";
+ } else {
+ return text();
+ }
}
/**
@@ -1061,71 +1068,139 @@ public class XWPFRun implements ISDTCont
c.selectPath("./*");
while (c.toNextSelection()) {
XmlObject o = c.getObject();
- if (o instanceof CTText) {
+ if (o instanceof CTRuby) {
+ handleRuby(o, text, false);
+ continue;
+ }
+ _getText(o, text);
+ }
+ c.dispose();
+ return text.toString();
+
+ }
+
+ /**
+ *
+ * @return the phonetic (ruby) string associated with this run or an empty String if none exists
+ */
+ public String getPhonetic() {
+ StringBuffer text = new StringBuffer();
+
+ // Walk the direct children of the run, in order, and collect
+ // only the phonetic (rt) text of any ruby elements found
+ XmlCursor c = run.newCursor();
+ c.selectPath("./*");
+ while (c.toNextSelection()) {
+ XmlObject o = c.getObject();
+ if (o instanceof CTRuby) {
+ handleRuby(o, text, true);
+ }
+ }
+ c.dispose();
+ return text.toString();
+ }
+
+ /**
+ *
+ * @param rubyObj rubyobject
+ * @param text buffer to which to append the content
+ * @param extractPhonetic extract the phonetic (rt) component or the base component
+ */
+ private void handleRuby(XmlObject rubyObj, StringBuffer text, boolean extractPhonetic) {
+ XmlCursor c = rubyObj.newCursor();
+
+ //according to the spec, a ruby object
+ //has the phonetic (rt) first, then the actual text (base)
+ //second.
+
+ c.selectPath(".//*");
+ boolean inRT = false;
+ boolean inBase = false;
+ while (c.toNextSelection()) {
+ XmlObject o = c.getObject();
+ if (o instanceof CTRubyContent) {
String tagName = o.getDomNode().getNodeName();
- // Field Codes (w:instrText, defined in spec sec. 17.16.23)
- // come up as instances of CTText, but we don't want them
- // in the normal text output
- if (!"w:instrText".equals(tagName)) {
- text.append(((CTText) o).getStringValue());
+ if ("w:rt".equals(tagName)) {
+ inRT = true;
+ } else if ("w:rubyBase".equals(tagName)) {
+ inRT = false;
+ inBase = true;
+ }
+ } else {
+ if (extractPhonetic && inRT) {
+ _getText(o, text);
+ } else if (! extractPhonetic && inBase) {
+ _getText(o, text);
}
}
+ }
+ c.dispose();
+ }
+
+ private void _getText(XmlObject o, StringBuffer text) {
+
+ if (o instanceof CTText) {
+ String tagName = o.getDomNode().getNodeName();
+ // Field Codes (w:instrText, defined in spec sec. 17.16.23)
+ // come up as instances of CTText, but we don't want them
+ // in the normal text output
+ if (!"w:instrText".equals(tagName)) {
+ text.append(((CTText) o).getStringValue());
+ }
+ }
- // Complex type evaluation (currently only for extraction of check boxes)
- if (o instanceof CTFldChar) {
- CTFldChar ctfldChar = ((CTFldChar) o);
- if (ctfldChar.getFldCharType() == STFldCharType.BEGIN) {
- if (ctfldChar.getFfData() != null) {
- for (CTFFCheckBox checkBox : ctfldChar.getFfData().getCheckBoxList()) {
- if (checkBox.getDefault() != null && checkBox.getDefault().getVal() == STOnOff.X_1) {
- text.append("|X|");
- } else {
- text.append("|_|");
- }
+ // Complex type evaluation (currently only for extraction of check boxes)
+ if (o instanceof CTFldChar) {
+ CTFldChar ctfldChar = ((CTFldChar) o);
+ if (ctfldChar.getFldCharType() == STFldCharType.BEGIN) {
+ if (ctfldChar.getFfData() != null) {
+ for (CTFFCheckBox checkBox : ctfldChar.getFfData().getCheckBoxList()) {
+ if (checkBox.getDefault() != null && checkBox.getDefault().getVal() == STOnOff.X_1) {
+ text.append("|X|");
+ } else {
+ text.append("|_|");
}
}
}
}
+ }
- if (o instanceof CTPTab) {
+ if (o instanceof CTPTab) {
+ text.append("\t");
+ }
+ if (o instanceof CTBr) {
+ text.append("\n");
+ }
+ if (o instanceof CTEmpty) {
+ // Some inline text elements get returned not as
+ // themselves, but as CTEmpty, owing to some odd
+ // definitions around line 5642 of the XSDs
+ // This bit works around it, and replicates the above
+ // rules for that case
+ String tagName = o.getDomNode().getNodeName();
+ if ("w:tab".equals(tagName) || "tab".equals(tagName)) {
text.append("\t");
}
- if (o instanceof CTBr) {
+ if ("w:br".equals(tagName) || "br".equals(tagName)) {
text.append("\n");
}
- if (o instanceof CTEmpty) {
- // Some inline text elements get returned not as
- // themselves, but as CTEmpty, owing to some odd
- // definitions around line 5642 of the XSDs
- // This bit works around it, and replicates the above
- // rules for that case
- String tagName = o.getDomNode().getNodeName();
- if ("w:tab".equals(tagName) || "tab".equals(tagName)) {
- text.append("\t");
- }
- if ("w:br".equals(tagName) || "br".equals(tagName)) {
- text.append("\n");
- }
- if ("w:cr".equals(tagName) || "cr".equals(tagName)) {
- text.append("\n");
- }
- }
- if (o instanceof CTFtnEdnRef) {
- CTFtnEdnRef ftn = (CTFtnEdnRef) o;
- String footnoteRef = ftn.getDomNode().getLocalName().equals("footnoteReference") ?
- "[footnoteRef:" + ftn.getId().intValue() + "]" : "[endnoteRef:" + ftn.getId().intValue() + "]";
- text.append(footnoteRef);
+ if ("w:cr".equals(tagName) || "cr".equals(tagName)) {
+ text.append("\n");
}
}
+ if (o instanceof CTFtnEdnRef) {
+ CTFtnEdnRef ftn = (CTFtnEdnRef) o;
+ String footnoteRef = ftn.getDomNode().getLocalName().equals("footnoteReference") ?
+ "[footnoteRef:" + ftn.getId().intValue() + "]" : "[endnoteRef:" + ftn.getId().intValue() + "]";
+ text.append(footnoteRef);
+ }
- c.dispose();
// Any picture text?
if (pictureText != null && pictureText.length() > 0) {
text.append("\n").append(pictureText);
}
- return text.toString();
}
/**
Modified: poi/trunk/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java?rev=1806712&r1=1806711&r2=1806712&view=diff
==============================================================================
--- poi/trunk/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java (original)
+++ poi/trunk/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java Wed Aug 30 16:29:52 2017
@@ -421,4 +421,16 @@ public class TestXWPFWordExtractor exten
extractor.getText());
extractor.close();
}
+
+ public void testPhonetic() throws IOException {
+ XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("61470.docx");
+ XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
+ //expect: baseText (phoneticText)
+ assertEquals("\u6771\u4EAC (\u3068\u3046\u304D\u3087\u3046)", extractor.getText().trim());
+ extractor.close();
+ extractor = new XWPFWordExtractor(doc);
+ extractor.setConcatenatePhoneticRuns(false);
+ assertEquals("\u6771\u4EAC", extractor.getText().trim());
+ }
+
}
Added: poi/trunk/test-data/document/61470.docx
URL: http://svn.apache.org/viewvc/poi/trunk/test-data/document/61470.docx?rev=1806712&view=auto
==============================================================================
Binary file - no diff available.
Propchange: poi/trunk/test-data/document/61470.docx
------------------------------------------------------------------------------
--- svn:mime-type (added)
+++ svn:mime-type Wed Aug 30 16:29:52 2017
@@ -0,0 +1 @@
+application/vnd.openxmlformats-officedocument.wordprocessingml.document
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@poi.apache.org
For additional commands, e-mail: commits-help@poi.apache.org