You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@poi.apache.org by ni...@apache.org on 2014/11/30 01:48:17 UTC
svn commit: r1642491 - in /poi/trunk/src:
java/org/apache/poi/hssf/extractor/ java/org/apache/poi/hssf/record/
testcases/org/apache/poi/hssf/extractor/
Author: nick
Date: Sun Nov 30 00:48:17 2014
New Revision: 1642491
URL: http://svn.apache.org/r1642491
Log:
Further Excel 4 text extractor support, for TIKA-1490
Added:
poi/trunk/src/java/org/apache/poi/hssf/record/OldStringRecord.java
Modified:
poi/trunk/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java
poi/trunk/src/java/org/apache/poi/hssf/record/FormulaRecord.java
poi/trunk/src/java/org/apache/poi/hssf/record/OldLabelRecord.java
poi/trunk/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java
Modified: poi/trunk/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java?rev=1642491&r1=1642490&r2=1642491&view=diff
==============================================================================
--- poi/trunk/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java (original)
+++ poi/trunk/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java Sun Nov 30 00:48:17 2014
@@ -22,9 +22,13 @@ import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
-import org.apache.poi.hssf.record.LabelRecord;
+import org.apache.poi.hssf.record.FormulaRecord;
+import org.apache.poi.hssf.record.NumberRecord;
import org.apache.poi.hssf.record.OldLabelRecord;
+import org.apache.poi.hssf.record.OldStringRecord;
+import org.apache.poi.hssf.record.RKRecord;
import org.apache.poi.hssf.record.RecordInputStream;
+import org.apache.poi.ss.usermodel.Cell;
/**
* A text extractor for very old (pre-OLE2) Excel files,
@@ -76,20 +80,44 @@ public class OldExcelExtractor {
ris.nextRecord();
switch (sid) {
- case LabelRecord.sid:
+ // label - 5.63 - TODO Needs codepages
+ case OldLabelRecord.biff2_sid:
+ case OldLabelRecord.biff345_sid:
OldLabelRecord lr = new OldLabelRecord(ris);
text.append(lr.getValue());
text.append('\n');
break;
+ // string - 5.102 - TODO Needs codepages
+ case OldStringRecord.biff2_sid:
+ case OldStringRecord.biff345_sid:
+ OldStringRecord sr = new OldStringRecord(ris);
+ text.append(sr.getString());
+ text.append('\n');
+ break;
+ // number - 5.71 - TODO Needs format strings
+ case NumberRecord.sid:
+ NumberRecord nr = new NumberRecord(ris);
+ text.append(nr.getValue());
+ text.append('\n');
+ break;
+/*
+ case OldFormulaRecord.sid:
+ FormulaRecord fr = new FormulaRecord(ris);
+System.out.println(fr.getCachedResultType());
+ if (fr.getCachedResultType() == Cell.CELL_TYPE_NUMERIC) {
+ text.append(fr.getValue());
+ text.append('\n');
+ }
+*/
+ case RKRecord.sid:
+ RKRecord rr = new RKRecord(ris);
+ text.append(rr.getRKNumber());
+ text.append('\n');
+ break;
default:
ris.readFully(new byte[ris.remaining()]);
+ // text.append(" = " + ris.getSid() + " = \n");
}
-
- // label - 5.63 - TODO Needs codepages
- // number - 5.71
- // rk - 5.87
- // string - 5.102
-
}
return text.toString();
Modified: poi/trunk/src/java/org/apache/poi/hssf/record/FormulaRecord.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/java/org/apache/poi/hssf/record/FormulaRecord.java?rev=1642491&r1=1642490&r2=1642491&view=diff
==============================================================================
--- poi/trunk/src/java/org/apache/poi/hssf/record/FormulaRecord.java (original)
+++ poi/trunk/src/java/org/apache/poi/hssf/record/FormulaRecord.java Sun Nov 30 00:48:17 2014
@@ -36,6 +36,7 @@ import org.apache.poi.util.LittleEndianO
public final class FormulaRecord extends CellRecord {
public static final short sid = 0x0006; // docs say 406...because of a bug Microsoft support site article #Q184647)
+ public static final short olderSid = 0x0406; // older biff versions do manage 406!
private static int FIXED_SIZE = 14; // double + short + int
private static final BitField alwaysCalc = BitFieldFactory.getInstance(0x0001);
Modified: poi/trunk/src/java/org/apache/poi/hssf/record/OldLabelRecord.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/java/org/apache/poi/hssf/record/OldLabelRecord.java?rev=1642491&r1=1642490&r2=1642491&view=diff
==============================================================================
--- poi/trunk/src/java/org/apache/poi/hssf/record/OldLabelRecord.java (original)
+++ poi/trunk/src/java/org/apache/poi/hssf/record/OldLabelRecord.java Sun Nov 30 00:48:17 2014
@@ -39,7 +39,7 @@ public final class OldLabelRecord extend
private short field_3_xf_index; // Biff 3+
private short field_4_string_len;
private byte[] field_5_bytes;
- //private XXXXX codepage; // TODO
+ //private XXXXX codepage; // TODO Implement for this and OldStringRecord
/**
* @param in the RecordInputstream to read the record from
Added: poi/trunk/src/java/org/apache/poi/hssf/record/OldStringRecord.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/java/org/apache/poi/hssf/record/OldStringRecord.java?rev=1642491&view=auto
==============================================================================
--- poi/trunk/src/java/org/apache/poi/hssf/record/OldStringRecord.java (added)
+++ poi/trunk/src/java/org/apache/poi/hssf/record/OldStringRecord.java Sun Nov 30 00:48:17 2014
@@ -0,0 +1,78 @@
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+
+package org.apache.poi.hssf.record;
+
+
+/**
+ * Biff2 - Biff 4 String Record (0x0007 / 0x0207) - read only support for
+ * formula string results.
+ */
+public final class OldStringRecord {
+ public final static short biff2_sid = 0x0007;
+ public final static short biff345_sid = 0x0207;
+
+ private short sid;
+ private short field_1_string_len;
+ private byte[] field_2_bytes;
+ //private XXXXX codepage; // TODO Implement for this and OldLabelRecord
+
+ /**
+ * @param in the RecordInputStream to read the record from
+ */
+ public OldStringRecord(RecordInputStream in) {
+ sid = in.getSid();
+
+ if (in.getSid() == biff2_sid) {
+ field_1_string_len = (short)in.readUByte();
+ } else {
+ field_1_string_len = in.readShort();
+ }
+
+ // Can only decode properly later when you know the codepage
+ field_2_bytes = new byte[field_1_string_len];
+ in.read(field_2_bytes, 0, field_1_string_len);
+ }
+
+ public boolean isBiff2() {
+ return sid == biff2_sid;
+ }
+
+ public short getSid() {
+ return sid;
+ }
+
+ /**
+ * @return The string represented by this record.
+ */
+ public String getString()
+ {
+ // We really need the codepage here to do this right...
+ return new String(field_2_bytes);
+ }
+
+ public String toString()
+ {
+ StringBuffer buffer = new StringBuffer();
+
+ buffer.append("[OLD STRING]\n");
+ buffer.append(" .string = ")
+ .append(getString()).append("\n");
+ buffer.append("[/OLD STRING]\n");
+ return buffer.toString();
+ }
+}
Modified: poi/trunk/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java?rev=1642491&r1=1642490&r2=1642491&view=diff
==============================================================================
--- poi/trunk/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java (original)
+++ poi/trunk/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java Sun Nov 30 00:48:17 2014
@@ -46,7 +46,45 @@ public final class TestOldExcelExtractor
// Check we find a few words we expect in there
assertTrue(text, text.contains("Size"));
assertTrue(text, text.contains("Returns"));
+
+ // Check we find a few numbers we expect in there
+ assertTrue(text, text.contains("11"));
+ assertTrue(text, text.contains("784"));
}
- // TODO Rest of the tests
+ public void testStrings() {
+ OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls");
+ String text = extractor.getText();
+
+ // Simple strings
+ assertTrue(text, text.contains("Table 10 -- Examination Coverage:"));
+ assertTrue(text, text.contains("Recommended and Average Recommended Additional Tax After"));
+ assertTrue(text, text.contains("Individual income tax returns, total"));
+
+ // More complicated strings
+ assertTrue(text, text.contains("$100,000 or more"));
+ assertTrue(text, text.contains("S corporation returns, Form 1120S [10,15]"));
+ // TODO Get these quotes working correctly
+// assertTrue(text, text.contains("individual income tax return “short forms.”"));
+
+ // Formula based strings
+ // TODO Find some then test
+ }
+
+ public void testFormattedNumbers() {
+ OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls");
+ String text = extractor.getText();
+
+ // Simple numbers
+ assertTrue(text, text.contains("151"));
+ assertTrue(text, text.contains("784"));
+
+ // Numbers which come from formulas
+ // TODO
+// assertTrue(text, text.contains("0.40"));
+// assertTrue(text, text.contains("624"));
+
+ // Formatted numbers
+ // TODO
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@poi.apache.org
For additional commands, e-mail: commits-help@poi.apache.org