You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@poi.apache.org by ce...@apache.org on 2015/03/22 14:33:43 UTC

svn commit: r1668367 - in /poi/trunk: src/integrationtest/org/apache/poi/stress/ src/scratchpad/src/org/apache/poi/hdf/extractor/ src/scratchpad/testcases/org/apache/poi/hdf/extractor/ test-data/document/

Author: centic
Date: Sun Mar 22 13:33:43 2015
New Revision: 1668367

URL: http://svn.apache.org/r1668367
Log:
Bug 47304: use fixed encoding when extracting text in WordDocument

Added:
    poi/trunk/test-data/document/47304.doc
Modified:
    poi/trunk/src/integrationtest/org/apache/poi/stress/HWPFFileHandler.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hdf/extractor/WordDocument.java
    poi/trunk/src/scratchpad/testcases/org/apache/poi/hdf/extractor/TestWordDocument.java

Modified: poi/trunk/src/integrationtest/org/apache/poi/stress/HWPFFileHandler.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/integrationtest/org/apache/poi/stress/HWPFFileHandler.java?rev=1668367&r1=1668366&r2=1668367&view=diff
==============================================================================
--- poi/trunk/src/integrationtest/org/apache/poi/stress/HWPFFileHandler.java (original)
+++ poi/trunk/src/integrationtest/org/apache/poi/stress/HWPFFileHandler.java Sun Mar 22 13:33:43 2015
@@ -18,12 +18,21 @@ package org.apache.poi.stress;
 
 import static org.junit.Assert.assertNotNull;
 
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
 import java.io.FileInputStream;
+import java.io.IOException;
 import java.io.InputStream;
+import java.io.PrintWriter;
+import java.io.StringWriter;
 
+import org.apache.poi.hdf.extractor.WordDocument;
 import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.extractor.WordExtractor;
 import org.junit.Test;
 
+@SuppressWarnings("deprecation")
 public class HWPFFileHandler extends POIFSFileHandler {
 	@Override
 	public void handleFile(InputStream stream) throws Exception {
@@ -33,16 +42,53 @@ public class HWPFFileHandler extends POI
 		assertNotNull(doc.getEndnotes());
 		
 		handlePOIDocument(doc);
+		
+		// fails for many documents, but is deprecated anyway... 
+		// handleWordDocument(doc);
+	}
+
+	protected void handleWordDocument(HWPFDocument doc) throws IOException {
+		ByteArrayOutputStream outStream = new ByteArrayOutputStream();
+		doc.write(outStream);
+
+		WordDocument wordDoc = new WordDocument(new ByteArrayInputStream(outStream.toByteArray()));
+        
+        StringWriter docTextWriter = new StringWriter();
+        PrintWriter out = new PrintWriter(docTextWriter);
+        try {
+        	wordDoc.writeAllText(out);
+        } finally {
+        	out.close();
+        }
+        docTextWriter.close();
 	}
 
+
+
 	// a test-case to test this locally without executing the full TestAllFiles
 	@Test
 	public void test() throws Exception {
-		InputStream stream = new FileInputStream("test-data/document/HeaderFooterUnicode.doc");
+		File file = new File("test-data/document/47304.doc");
+
+		InputStream stream = new FileInputStream(file);
 		try {
 			handleFile(stream);
 		} finally {
 			stream.close();
 		}
+		
+		handleExtracting(file);
+		
+		stream = new FileInputStream(file);
+		try {
+			WordExtractor extractor = new WordExtractor(stream);
+			try {
+				assertNotNull(extractor.getText());
+			} finally {
+				extractor.close();
+			}
+		} finally {
+			stream.close();
+		}
 	}
 }
\ No newline at end of file

Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hdf/extractor/WordDocument.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hdf/extractor/WordDocument.java?rev=1668367&r1=1668366&r2=1668367&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hdf/extractor/WordDocument.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hdf/extractor/WordDocument.java Sun Mar 22 13:33:43 2015
@@ -177,7 +177,7 @@ public final class WordDocument {
       }
       else
       {
-	String sText = new String(_header, start, end-start);
+	String sText = new String(_header, start, end-start, "windows-1252");
 	out.write(sText);
       }
     }

Modified: poi/trunk/src/scratchpad/testcases/org/apache/poi/hdf/extractor/TestWordDocument.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/testcases/org/apache/poi/hdf/extractor/TestWordDocument.java?rev=1668367&r1=1668366&r2=1668367&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/testcases/org/apache/poi/hdf/extractor/TestWordDocument.java (original)
+++ poi/trunk/src/scratchpad/testcases/org/apache/poi/hdf/extractor/TestWordDocument.java Sun Mar 22 13:33:43 2015
@@ -17,6 +17,15 @@
 
 package org.apache.poi.hdf.extractor;
 
+import static org.junit.Assert.*;
+
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.io.StringWriter;
+
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.HWPFTestDataSamples;
+import org.apache.poi.hwpf.extractor.WordExtractor;
 import org.junit.Test;
 
 
@@ -31,4 +40,31 @@ public class TestWordDocument {
         //WordDocument.main(new String[] {"test-data/document/Word6.doc", "/tmp/test.doc"});
         WordDocument.main(new String[] {"test-data/document/53446.doc", "/tmp/test.doc"});
     }
+
+    @SuppressWarnings("deprecation")
+	@Test
+    public void test47304() throws IOException {
+    	HWPFDocument doc = HWPFTestDataSamples.openSampleFile("47304.doc");
+    	assertNotNull(doc);
+    	
+    	WordExtractor extractor = new WordExtractor(doc);
+        String text = extractor.getText();
+        //System.out.println(text);
+        assertTrue("Had: " + text, text.contains("Just  a \u201Ctest\u201D"));
+        extractor.close();
+        
+		WordDocument wordDoc = new WordDocument("test-data/document/47304.doc");
+        
+        StringWriter docTextWriter = new StringWriter();
+        PrintWriter out = new PrintWriter(docTextWriter);
+        try {
+        	wordDoc.writeAllText(out);
+        } finally {
+        	out.close();
+        }
+        docTextWriter.close();
+
+        //System.out.println(docTextWriter.toString());
+        assertTrue("Had: " + docTextWriter.toString(), docTextWriter.toString().contains("Just  a \u201Ctest\u201D"));
+    }
 }

Added: poi/trunk/test-data/document/47304.doc
URL: http://svn.apache.org/viewvc/poi/trunk/test-data/document/47304.doc?rev=1668367&view=auto
==============================================================================
Binary files poi/trunk/test-data/document/47304.doc (added) and poi/trunk/test-data/document/47304.doc Sun Mar 22 13:33:43 2015 differ



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@poi.apache.org
For additional commands, e-mail: commits-help@poi.apache.org