You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@poi.apache.org by ni...@apache.org on 2014/11/30 17:21:39 UTC

svn commit: r1642561 - in /poi/trunk/src: java/org/apache/poi/hssf/extractor/ java/org/apache/poi/hssf/record/ testcases/org/apache/poi/hssf/extractor/

Author: nick
Date: Sun Nov 30 16:21:39 2014
New Revision: 1642561

URL: http://svn.apache.org/r1642561
Log:
Track the codepage in old Excel files, to be able to correctly decode the 8-bit strings in them

Modified:
    poi/trunk/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java
    poi/trunk/src/java/org/apache/poi/hssf/record/CodepageRecord.java
    poi/trunk/src/java/org/apache/poi/hssf/record/OldLabelRecord.java
    poi/trunk/src/java/org/apache/poi/hssf/record/OldStringRecord.java
    poi/trunk/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java

Modified: poi/trunk/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java?rev=1642561&r1=1642560&r2=1642561&view=diff
==============================================================================
--- poi/trunk/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java (original)
+++ poi/trunk/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java Sun Nov 30 16:21:39 2014
@@ -25,6 +25,7 @@ import java.io.IOException;
 import java.io.InputStream;
 
 import org.apache.poi.hssf.record.BOFRecord;
+import org.apache.poi.hssf.record.CodepageRecord;
 import org.apache.poi.hssf.record.FormulaRecord;
 import org.apache.poi.hssf.record.NumberRecord;
 import org.apache.poi.hssf.record.OldFormulaRecord;
@@ -110,6 +111,8 @@ public class OldExcelExtractor {
      *  for these old file formats
      */
     public String getText() {
+        StringBuffer text = new StringBuffer();
+        
         // Work out what version we're dealing with
         int bofSid = ris.getNextSid();
         switch (bofSid) {
@@ -128,8 +131,10 @@ public class OldExcelExtractor {
             default:
                 throw new IllegalArgumentException("File does not begin with a BOF, found sid of " + bofSid); 
         }
+        
+        // To track formats and encodings
+        CodepageRecord codepage = null;
 
-        StringBuffer text = new StringBuffer();
         while (ris.hasNextRecord()) {
             int sid = ris.getNextSid();
             ris.nextRecord();
@@ -139,6 +144,7 @@ public class OldExcelExtractor {
                 case OldLabelRecord.biff2_sid:
                 case OldLabelRecord.biff345_sid:
                     OldLabelRecord lr = new OldLabelRecord(ris);
+                    lr.setCodePage(codepage);
                     text.append(lr.getValue());
                     text.append('\n');
                     break;
@@ -146,6 +152,7 @@ public class OldExcelExtractor {
                 case OldStringRecord.biff2_sid:
                 case OldStringRecord.biff345_sid:
                     OldStringRecord sr = new OldStringRecord(ris);
+                    sr.setCodePage(codepage);
                     text.append(sr.getString());
                     text.append('\n');
                     break;
@@ -175,6 +182,10 @@ public class OldExcelExtractor {
                     handleNumericCell(text, rr.getRKNumber());
                     break;
                     
+                case CodepageRecord.sid:
+                    codepage = new CodepageRecord(ris);
+                    break;
+                    
                 default:
                     ris.readFully(new byte[ris.remaining()]);
             }

Modified: poi/trunk/src/java/org/apache/poi/hssf/record/CodepageRecord.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/java/org/apache/poi/hssf/record/CodepageRecord.java?rev=1642561&r1=1642560&r2=1642561&view=diff
==============================================================================
--- poi/trunk/src/java/org/apache/poi/hssf/record/CodepageRecord.java (original)
+++ poi/trunk/src/java/org/apache/poi/hssf/record/CodepageRecord.java Sun Nov 30 16:21:39 2014
@@ -19,13 +19,15 @@
 
 package org.apache.poi.hssf.record;
 
+import org.apache.poi.util.CodePageUtil;
 import org.apache.poi.util.LittleEndianOutput;
 
 /**
- * Title: Codepage Record<P>
- * Description:  the default characterset. for the workbook<P>
- * REFERENCE:  PG 293 Microsoft Excel 97 Developer's Kit (ISBN: 1-57231-498-2)<P>
- * @author Andrew C. Oliver (acoliver at apache dot org)
+ * Title: Codepage Record
+ * <p>Description:  the default character set for the workbook</p>
+ * <p>REFERENCE:  PG 293 Microsoft Excel 97 Developer's Kit (ISBN: 1-57231-498-2)</p>
+ * <p>Use {@link CodePageUtil} to turn these values into Java code pages
+ *  to encode/decode strings.</p>
  * @version 2.0-pre
  */
 
@@ -36,11 +38,10 @@ public final class CodepageRecord
     private short             field_1_codepage;   // = 0;
 
     /**
-     * the likely correct value for CODEPAGE (at least for US versions).  We could use
-     * some help with international versions (which we do not have access to documentation
-     * for)
+     * Excel 97+ (Biff 8) should always store strings as UTF-16LE or
+     *  compressed versions of that. As such, this should always be
+     *  0x4b0 = UTF_16, except for files coming from older versions.
      */
-
     public final static short CODEPAGE = ( short ) 0x4b0;
 
     public CodepageRecord()

Modified: poi/trunk/src/java/org/apache/poi/hssf/record/OldLabelRecord.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/java/org/apache/poi/hssf/record/OldLabelRecord.java?rev=1642561&r1=1642560&r2=1642561&view=diff
==============================================================================
--- poi/trunk/src/java/org/apache/poi/hssf/record/OldLabelRecord.java (original)
+++ poi/trunk/src/java/org/apache/poi/hssf/record/OldLabelRecord.java Sun Nov 30 16:21:39 2014
@@ -32,9 +32,9 @@ public final class OldLabelRecord extend
     public final static short biff2_sid = 0x0004;
     public final static short biff345_sid = 0x0204;
 
-    private short     field_4_string_len;
-    private byte[]    field_5_bytes;
-    //private XXXXX   codepage; // TODO Implement for this and OldStringRecord
+    private short          field_4_string_len;
+    private byte[]         field_5_bytes;
+    private CodepageRecord codepage;
 
     /**
      * @param in the RecordInputstream to read the record from
@@ -61,6 +61,10 @@ public final class OldLabelRecord extend
         }
     }
 
+    public void setCodePage(CodepageRecord codepage) {
+        this.codepage = codepage;
+    }
+    
     /**
      * get the number of characters this string contains
      * @return number of characters
@@ -75,8 +79,7 @@ public final class OldLabelRecord extend
      */
     public String getValue()
     {
-        // We really need the codepage here to do this right...
-        return new String(field_5_bytes);
+        return OldStringRecord.getString(field_5_bytes, codepage);
     }
 
     /**

Modified: poi/trunk/src/java/org/apache/poi/hssf/record/OldStringRecord.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/java/org/apache/poi/hssf/record/OldStringRecord.java?rev=1642561&r1=1642560&r2=1642561&view=diff
==============================================================================
--- poi/trunk/src/java/org/apache/poi/hssf/record/OldStringRecord.java (original)
+++ poi/trunk/src/java/org/apache/poi/hssf/record/OldStringRecord.java Sun Nov 30 16:21:39 2014
@@ -17,6 +17,10 @@
 
 package org.apache.poi.hssf.record;
 
+import java.io.UnsupportedEncodingException;
+
+import org.apache.poi.util.CodePageUtil;
+
 
 /**
  * Biff2 - Biff 4 Label Record (0x0007 / 0x0207) - read only support for 
@@ -29,7 +33,7 @@ public final class OldStringRecord {
     private short             sid;
     private short             field_1_string_len;
     private byte[]            field_2_bytes;
-    //private XXXXX           codepage; // TODO Implement for this and OldLabelRecord
+    private CodepageRecord    codepage;
 
     /**
      * @param in the RecordInputstream to read the record from
@@ -55,14 +59,30 @@ public final class OldStringRecord {
     public short getSid() {
         return sid;
     }
+    
+    public void setCodePage(CodepageRecord codepage) {
+        this.codepage = codepage;
+    }
 
     /**
      * @return The string represented by this record.
      */
     public String getString()
     {
-        // We really need the codepage here to do this right...
-        return new String(field_2_bytes);
+        return getString(field_2_bytes, codepage);
+    }
+    
+    protected static String getString(byte[] data, CodepageRecord codepage) {
+        int cp = CodePageUtil.CP_ISO_8859_1;
+        if (codepage != null) {
+            cp = codepage.getCodepage();
+        }
+        try {
+            return CodePageUtil.getStringFromCodePage(data, cp);
+        } catch (UnsupportedEncodingException uee) {
+            // Hope the system default is ok...
+            return new String(data);
+        }
     }
 
     public String toString()

Modified: poi/trunk/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java?rev=1642561&r1=1642560&r2=1642561&view=diff
==============================================================================
--- poi/trunk/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java (original)
+++ poi/trunk/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java Sun Nov 30 16:21:39 2014
@@ -81,8 +81,7 @@ public final class TestOldExcelExtractor
         // More complicated strings
         assertContains(text, "$100,000 or more");
         assertContains(text, "S corporation returns, Form 1120S [10,15]");
-        // TODO Get these quotes working correctly
-//        assertContains(text, "individual income tax return \u201Cshort forms.\u201D");
+        assertContains(text, "individual income tax return \u201Cshort forms.\u201D");
         
         // Formula based strings
         // TODO Find some then test



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@poi.apache.org
For additional commands, e-mail: commits-help@poi.apache.org