You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@poi.apache.org by ye...@apache.org on 2009/06/27 12:39:51 UTC
svn commit: r788949 - in /poi/trunk/src: documentation/content/xdocs/ scratchpad/src/org/apache/poi/hwpf/ scratchpad/src/org/apache/poi/hwpf/extractor/ scratchpad/testcases/org/apache/poi/hwpf/data/ scratchpad/testcases/org/apache/poi/hwpf/extractor/

Author: yegor
Date: Sat Jun 27 10:39:51 2009
New Revision: 788949

URL: http://svn.apache.org/viewvc?rev=788949&view=rev
Log:
Support for text extraction of footnotes, endnotes and comments in HWPF, see Bugzilla 47400

Added:
    poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/data/footnote.doc   (with props)
Modified:
    poi/trunk/src/documentation/content/xdocs/status.xml
    poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
    poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java

Modified: poi/trunk/src/documentation/content/xdocs/status.xml
URL: http://svn.apache.org/viewvc/poi/trunk/src/documentation/content/xdocs/status.xml?rev=788949&r1=788948&r2=788949&view=diff
==============================================================================
--- poi/trunk/src/documentation/content/xdocs/status.xml (original)
+++ poi/trunk/src/documentation/content/xdocs/status.xml Sat Jun 27 10:39:51 2009
@@ -33,6 +33,7 @@
 
     <changes>
         <release version="3.5-beta7" date="2009-??-??">
+           <action dev="POI-DEVELOPERS" type="add">47400 - Support for text extraction of footnotes, endnotes and comments in HWPF</action>
            <action dev="POI-DEVELOPERS" type="fix">47415 - Fixed PageSettingsBlock to allow multiple PLS records</action>
            <action dev="POI-DEVELOPERS" type="fix">47412 - Fixed concurrency issue with EscherProperties.initProps()</action>
            <action dev="POI-DEVELOPERS" type="fix">47143 - Fixed OOM in HSSFWorkbook#getAllPictures when reading .xls files containing metafiles</action>

Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java?rev=788949&r1=788948&r2=788949&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java Sat Jun 27 10:39:51 2009
@@ -345,6 +345,28 @@
   }
 
   /**
+   * Returns the range which covers all the Endnotes.
+  */
+  public Range getEndnoteRange() {
+          return new Range(
+                          _cpSplit.getEndNoteStart(),
+                          _cpSplit.getEndNoteEnd(),
+                          this
+      );
+  }
+
+  /**
+   * Returns the range which covers all the Comments.
+  */
+  public Range getCommentsRange() {
+          return new Range(
+                          _cpSplit.getCommentsStart(),
+                          _cpSplit.getCommentsEnd(),
+                          this
+      );
+  }
+
+  /**
    * Returns the range which covers all "Header Stories".
    * A header story contains a header, footer, end note
    *  separators and footnote separators.

Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java?rev=788949&r1=788948&r2=788949&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java Sat Jun 27 10:39:51 2009
@@ -22,6 +22,7 @@
 import java.io.FileInputStream;
 import java.io.UnsupportedEncodingException;
 import java.util.Iterator;
+import java.util.Arrays;
 
 import org.apache.poi.POIOLE2TextExtractor;
 import org.apache.poi.hwpf.HWPFDocument;
@@ -95,34 +96,58 @@
 	 * Get the text from the word file, as an array with one String
 	 *  per paragraph
 	 */
-	public String[] getParagraphText() {
-		String[] ret;
+        public String[] getParagraphText() {
+                String[] ret;
 
-		// Extract using the model code
-		try {
-	    	Range r = doc.getRange();
-
-			ret = new String[r.numParagraphs()];
-			for(int i=0; i<ret.length; i++) {
-				Paragraph p = r.getParagraph(i);
-				ret[i] = p.text();
-
-				// Fix the line ending
-				if(ret[i].endsWith("\r")) {
-					ret[i] = ret[i] + "\n";
-				}
-			}
-		} catch(Exception e) {
-			// Something's up with turning the text pieces into paragraphs
-			// Fall back to ripping out the text pieces
-			ret = new String[1];
-			ret[0] = getTextFromPieces();
-		}
-
-		return ret;
-	}
+                // Extract using the model code
+                try {
+                        Range r = doc.getRange();
+
+                        ret = getParagraphText(r);
+                } catch (Exception e) {
+                        // Something's up with turning the text pieces into paragraphs
+                        // Fall back to ripping out the text pieces
+                        ret = new String[1];
+                        ret[0] = getTextFromPieces();
+                }
+
+                return ret;
+        }
+
+        public String[] getFootnoteText() {
+                Range r = doc.getFootnoteRange();
+
+                return getParagraphText(r);
+        }
+
+        public String[] getEndnoteText() {
+                Range r = doc.getEndnoteRange();
+
+                return getParagraphText(r);
+        }
+
+        public String[] getCommentsText() {
+                Range r = doc.getCommentsRange();
+
+                return getParagraphText(r);
+        }
+
+        private String[] getParagraphText(Range r) {
+                String[] ret;
+                ret = new String[r.numParagraphs()];
+                for (int i = 0; i < ret.length; i++) {
+                        Paragraph p = r.getParagraph(i);
+                        ret[i] = p.text();
+
+                        // Fix the line ending
+                        if (ret[i].endsWith("\r")) {
+                                ret[i] = ret[i] + "\n";
+                        }
+                }
+                return ret;
+        }
 
-	/**
+        /**
 	 * Add the header/footer text, if it's not empty
 	 */
 	private void appendHeaderFooter(String text, StringBuffer out) {

Added: poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/data/footnote.doc
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/data/footnote.doc?rev=788949&view=auto
==============================================================================
Binary file - no diff available.

Propchange: poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/data/footnote.doc
------------------------------------------------------------------------------
    svn:executable = *

Propchange: poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/data/footnote.doc
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Modified: poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java?rev=788949&r1=788948&r2=788949&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java (original)
+++ poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java Sat Jun 27 10:39:51 2009
@@ -60,6 +60,8 @@
 	private String filename4;
 	// With unicode header and footer
 	private String filename5;
+        // With footnote
+        private String filename6;
 
     protected void setUp() throws Exception {
 		String dirname = System.getProperty("HWPF.testdata.path");
@@ -70,6 +72,7 @@
 		filename3 = pdirname + "/excel_with_embeded.xls";
 		filename4 = dirname + "/ThreeColHeadFoot.doc";
 		filename5 = dirname + "/HeaderFooterUnicode.doc";
+                filename6 = dirname + "/footnote.doc";
 
 		extractor = new WordExtractor(new FileInputStream(filename));
 		extractor2 = new WordExtractor(new FileInputStream(filename2));
@@ -226,4 +229,49 @@
     			text.indexOf("The footer, with") > -1
     	);
     }
+
+    public void testFootnote() throws Exception {
+        HWPFDocument doc = new HWPFDocument(
+                new FileInputStream(filename6)
+        );
+        extractor = new WordExtractor(doc);
+
+        String[] text = extractor.getFootnoteText();
+        StringBuffer b = new StringBuffer();
+        for (int i=0; i<text.length; i++) {
+            b.append(text[i]);
+        }
+
+        assertTrue(b.toString().contains("TestFootnote"));
+    }
+
+    public void testEndnote() throws Exception {
+        HWPFDocument doc = new HWPFDocument(
+                new FileInputStream(filename6)
+        );
+        extractor = new WordExtractor(doc);
+
+        String[] text = extractor.getEndnoteText();
+        StringBuffer b = new StringBuffer();
+        for (int i=0; i<text.length; i++) {
+            b.append(text[i]);
+        }
+
+        assertTrue(b.toString().contains("TestEndnote"));
+    }
+
+    public void testComments() throws Exception {
+        HWPFDocument doc = new HWPFDocument(
+                new FileInputStream(filename6)
+        );
+        extractor = new WordExtractor(doc);
+
+        String[] text = extractor.getCommentsText();
+        StringBuffer b = new StringBuffer();
+        for (int i=0; i<text.length; i++) {
+            b.append(text[i]);
+        }
+
+        assertTrue(b.toString().contains("TestComment"));
+    }
 }



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@poi.apache.org
For additional commands, e-mail: commits-help@poi.apache.org