You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@poi.apache.org by ni...@apache.org on 2014/11/30 17:21:39 UTC
svn commit: r1642561 - in /poi/trunk/src:
java/org/apache/poi/hssf/extractor/ java/org/apache/poi/hssf/record/
testcases/org/apache/poi/hssf/extractor/
Author: nick
Date: Sun Nov 30 16:21:39 2014
New Revision: 1642561
URL: http://svn.apache.org/r1642561
Log:
Track the codepage in old Excel files, so that the 8-bit strings in them can be decoded correctly
Modified:
poi/trunk/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java
poi/trunk/src/java/org/apache/poi/hssf/record/CodepageRecord.java
poi/trunk/src/java/org/apache/poi/hssf/record/OldLabelRecord.java
poi/trunk/src/java/org/apache/poi/hssf/record/OldStringRecord.java
poi/trunk/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java
Modified: poi/trunk/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java?rev=1642561&r1=1642560&r2=1642561&view=diff
==============================================================================
--- poi/trunk/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java (original)
+++ poi/trunk/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java Sun Nov 30 16:21:39 2014
@@ -25,6 +25,7 @@ import java.io.IOException;
import java.io.InputStream;
import org.apache.poi.hssf.record.BOFRecord;
+import org.apache.poi.hssf.record.CodepageRecord;
import org.apache.poi.hssf.record.FormulaRecord;
import org.apache.poi.hssf.record.NumberRecord;
import org.apache.poi.hssf.record.OldFormulaRecord;
@@ -110,6 +111,8 @@ public class OldExcelExtractor {
* for these old file formats
*/
public String getText() {
+ StringBuffer text = new StringBuffer();
+
// Work out what version we're dealing with
int bofSid = ris.getNextSid();
switch (bofSid) {
@@ -128,8 +131,10 @@ public class OldExcelExtractor {
default:
throw new IllegalArgumentException("File does not begin with a BOF, found sid of " + bofSid);
}
+
+ // To track formats and encodings
+ CodepageRecord codepage = null;
- StringBuffer text = new StringBuffer();
while (ris.hasNextRecord()) {
int sid = ris.getNextSid();
ris.nextRecord();
@@ -139,6 +144,7 @@ public class OldExcelExtractor {
case OldLabelRecord.biff2_sid:
case OldLabelRecord.biff345_sid:
OldLabelRecord lr = new OldLabelRecord(ris);
+ lr.setCodePage(codepage);
text.append(lr.getValue());
text.append('\n');
break;
@@ -146,6 +152,7 @@ public class OldExcelExtractor {
case OldStringRecord.biff2_sid:
case OldStringRecord.biff345_sid:
OldStringRecord sr = new OldStringRecord(ris);
+ sr.setCodePage(codepage);
text.append(sr.getString());
text.append('\n');
break;
@@ -175,6 +182,10 @@ public class OldExcelExtractor {
handleNumericCell(text, rr.getRKNumber());
break;
+ case CodepageRecord.sid:
+ codepage = new CodepageRecord(ris);
+ break;
+
default:
ris.readFully(new byte[ris.remaining()]);
}
Modified: poi/trunk/src/java/org/apache/poi/hssf/record/CodepageRecord.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/java/org/apache/poi/hssf/record/CodepageRecord.java?rev=1642561&r1=1642560&r2=1642561&view=diff
==============================================================================
--- poi/trunk/src/java/org/apache/poi/hssf/record/CodepageRecord.java (original)
+++ poi/trunk/src/java/org/apache/poi/hssf/record/CodepageRecord.java Sun Nov 30 16:21:39 2014
@@ -19,13 +19,15 @@
package org.apache.poi.hssf.record;
+import org.apache.poi.util.CodePageUtil;
import org.apache.poi.util.LittleEndianOutput;
/**
- * Title: Codepage Record<P>
- * Description: the default characterset. for the workbook<P>
- * REFERENCE: PG 293 Microsoft Excel 97 Developer's Kit (ISBN: 1-57231-498-2)<P>
- * @author Andrew C. Oliver (acoliver at apache dot org)
+ * Title: Codepage Record
+ * <p>Description: the default characterset. for the workbook</p>
+ * <p>REFERENCE: PG 293 Microsoft Excel 97 Developer's Kit (ISBN: 1-57231-498-2)</p>
+ * <p>Use {@link CodePageUtil} to turn these values into Java code pages
+ * to encode/decode strings.</p>
* @version 2.0-pre
*/
@@ -36,11 +38,10 @@ public final class CodepageRecord
private short field_1_codepage; // = 0;
/**
- * the likely correct value for CODEPAGE (at least for US versions). We could use
- * some help with international versions (which we do not have access to documentation
- * for)
+ * Excel 97+ (Biff 8) should always store strings as UTF-16LE or
+ * compressed versions of that. As such, this should always be
+ * 0x4b0 = UTF_16, except for files coming from older versions.
*/
-
public final static short CODEPAGE = ( short ) 0x4b0;
public CodepageRecord()
Modified: poi/trunk/src/java/org/apache/poi/hssf/record/OldLabelRecord.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/java/org/apache/poi/hssf/record/OldLabelRecord.java?rev=1642561&r1=1642560&r2=1642561&view=diff
==============================================================================
--- poi/trunk/src/java/org/apache/poi/hssf/record/OldLabelRecord.java (original)
+++ poi/trunk/src/java/org/apache/poi/hssf/record/OldLabelRecord.java Sun Nov 30 16:21:39 2014
@@ -32,9 +32,9 @@ public final class OldLabelRecord extend
public final static short biff2_sid = 0x0004;
public final static short biff345_sid = 0x0204;
- private short field_4_string_len;
- private byte[] field_5_bytes;
- //private XXXXX codepage; // TODO Implement for this and OldStringRecord
+ private short field_4_string_len;
+ private byte[] field_5_bytes;
+ private CodepageRecord codepage;
/**
* @param in the RecordInputstream to read the record from
@@ -61,6 +61,10 @@ public final class OldLabelRecord extend
}
}
+ public void setCodePage(CodepageRecord codepage) {
+ this.codepage = codepage;
+ }
+
/**
* get the number of characters this string contains
* @return number of characters
@@ -75,8 +79,7 @@ public final class OldLabelRecord extend
*/
public String getValue()
{
- // We really need the codepage here to do this right...
- return new String(field_5_bytes);
+ return OldStringRecord.getString(field_5_bytes, codepage);
}
/**
Modified: poi/trunk/src/java/org/apache/poi/hssf/record/OldStringRecord.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/java/org/apache/poi/hssf/record/OldStringRecord.java?rev=1642561&r1=1642560&r2=1642561&view=diff
==============================================================================
--- poi/trunk/src/java/org/apache/poi/hssf/record/OldStringRecord.java (original)
+++ poi/trunk/src/java/org/apache/poi/hssf/record/OldStringRecord.java Sun Nov 30 16:21:39 2014
@@ -17,6 +17,10 @@
package org.apache.poi.hssf.record;
+import java.io.UnsupportedEncodingException;
+
+import org.apache.poi.util.CodePageUtil;
+
/**
* Biff2 - Biff 4 Label Record (0x0007 / 0x0207) - read only support for
@@ -29,7 +33,7 @@ public final class OldStringRecord {
private short sid;
private short field_1_string_len;
private byte[] field_2_bytes;
- //private XXXXX codepage; // TODO Implement for this and OldLabelRecord
+ private CodepageRecord codepage;
/**
* @param in the RecordInputstream to read the record from
@@ -55,14 +59,30 @@ public final class OldStringRecord {
public short getSid() {
return sid;
}
+
+ public void setCodePage(CodepageRecord codepage) {
+ this.codepage = codepage;
+ }
/**
* @return The string represented by this record.
*/
public String getString()
{
- // We really need the codepage here to do this right...
- return new String(field_2_bytes);
+ return getString(field_2_bytes, codepage);
+ }
+
+ protected static String getString(byte[] data, CodepageRecord codepage) {
+ int cp = CodePageUtil.CP_ISO_8859_1;
+ if (codepage != null) {
+ cp = codepage.getCodepage();
+ }
+ try {
+ return CodePageUtil.getStringFromCodePage(data, cp);
+ } catch (UnsupportedEncodingException uee) {
+ // Hope the system default is ok...
+ return new String(data);
+ }
}
public String toString()
Modified: poi/trunk/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java?rev=1642561&r1=1642560&r2=1642561&view=diff
==============================================================================
--- poi/trunk/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java (original)
+++ poi/trunk/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java Sun Nov 30 16:21:39 2014
@@ -81,8 +81,7 @@ public final class TestOldExcelExtractor
// More complicated strings
assertContains(text, "$100,000 or more");
assertContains(text, "S corporation returns, Form 1120S [10,15]");
- // TODO Get these quotes working correctly
-// assertContains(text, "individual income tax return \u201Cshort forms.\u201D");
+ assertContains(text, "individual income tax return \u201Cshort forms.\u201D");
// Formula based strings
// TODO Find some then test
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@poi.apache.org
For additional commands, e-mail: commits-help@poi.apache.org