You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@poi.apache.org by ni...@apache.org on 2014/11/30 01:48:17 UTC

svn commit: r1642491 - in /poi/trunk/src: java/org/apache/poi/hssf/extractor/ java/org/apache/poi/hssf/record/ testcases/org/apache/poi/hssf/extractor/

Author: nick
Date: Sun Nov 30 00:48:17 2014
New Revision: 1642491

URL: http://svn.apache.org/r1642491
Log:
Further Excel 4 text extractor support, for TIKA-1490

Added:
    poi/trunk/src/java/org/apache/poi/hssf/record/OldStringRecord.java
Modified:
    poi/trunk/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java
    poi/trunk/src/java/org/apache/poi/hssf/record/FormulaRecord.java
    poi/trunk/src/java/org/apache/poi/hssf/record/OldLabelRecord.java
    poi/trunk/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java

Modified: poi/trunk/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java?rev=1642491&r1=1642490&r2=1642491&view=diff
==============================================================================
--- poi/trunk/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java (original)
+++ poi/trunk/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java Sun Nov 30 00:48:17 2014
@@ -22,9 +22,13 @@ import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 
-import org.apache.poi.hssf.record.LabelRecord;
+import org.apache.poi.hssf.record.FormulaRecord;
+import org.apache.poi.hssf.record.NumberRecord;
 import org.apache.poi.hssf.record.OldLabelRecord;
+import org.apache.poi.hssf.record.OldStringRecord;
+import org.apache.poi.hssf.record.RKRecord;
 import org.apache.poi.hssf.record.RecordInputStream;
+import org.apache.poi.ss.usermodel.Cell;
 
 /**
  * A text extractor for very old (pre-OLE2) Excel files,
@@ -76,20 +80,44 @@ public class OldExcelExtractor {
             ris.nextRecord();
 
             switch (sid) {
-                case LabelRecord.sid:
+                // label - 5.63 - TODO Needs codepages
+                case OldLabelRecord.biff2_sid:
+                case OldLabelRecord.biff345_sid:
                     OldLabelRecord lr = new OldLabelRecord(ris);
                     text.append(lr.getValue());
                     text.append('\n');
                     break;
+                // string - 5.102 - TODO Needs codepages
+                case OldStringRecord.biff2_sid:
+                case OldStringRecord.biff345_sid:
+                    OldStringRecord sr = new OldStringRecord(ris);
+                    text.append(sr.getString());
+                    text.append('\n');
+                    break;
+                // number - 5.71 - TODO Needs format strings
+                case NumberRecord.sid:
+                    NumberRecord nr = new NumberRecord(ris);
+                    text.append(nr.getValue());
+                    text.append('\n');
+                    break;
+/*                    
+                case OldFormulaRecord.sid:
+                    FormulaRecord fr = new FormulaRecord(ris);
+System.out.println(fr.getCachedResultType());                    
+                    if (fr.getCachedResultType() == Cell.CELL_TYPE_NUMERIC) {
+                        text.append(fr.getValue());
+                        text.append('\n');
+                    }
+*/
+                case RKRecord.sid:
+                    RKRecord rr = new RKRecord(ris);
+                    text.append(rr.getRKNumber());
+                    text.append('\n');
+                    break;
                 default:
                     ris.readFully(new byte[ris.remaining()]);
+      //              text.append(" = " + ris.getSid() + " = \n");
             }
-
-            // label - 5.63 - TODO Needs codepages
-            // number - 5.71
-            // rk - 5.87
-            // string - 5.102
-
         }
 
         return text.toString();

Modified: poi/trunk/src/java/org/apache/poi/hssf/record/FormulaRecord.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/java/org/apache/poi/hssf/record/FormulaRecord.java?rev=1642491&r1=1642490&r2=1642491&view=diff
==============================================================================
--- poi/trunk/src/java/org/apache/poi/hssf/record/FormulaRecord.java (original)
+++ poi/trunk/src/java/org/apache/poi/hssf/record/FormulaRecord.java Sun Nov 30 00:48:17 2014
@@ -36,6 +36,7 @@ import org.apache.poi.util.LittleEndianO
 public final class FormulaRecord extends CellRecord {
 
 	public static final short sid = 0x0006;   // docs say 406...because of a bug Microsoft support site article #Q184647)
+	public static final short olderSid = 0x0406; // older biff versions do manage 406! 
 	private static int FIXED_SIZE = 14; // double + short + int
 
 	private static final BitField alwaysCalc = BitFieldFactory.getInstance(0x0001);

Modified: poi/trunk/src/java/org/apache/poi/hssf/record/OldLabelRecord.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/java/org/apache/poi/hssf/record/OldLabelRecord.java?rev=1642491&r1=1642490&r2=1642491&view=diff
==============================================================================
--- poi/trunk/src/java/org/apache/poi/hssf/record/OldLabelRecord.java (original)
+++ poi/trunk/src/java/org/apache/poi/hssf/record/OldLabelRecord.java Sun Nov 30 00:48:17 2014
@@ -39,7 +39,7 @@ public final class OldLabelRecord extend
     private short             field_3_xf_index;   // Biff 3+
     private short             field_4_string_len;
     private byte[]            field_5_bytes;
-    //private XXXXX           codepage; // TODO
+    //private XXXXX           codepage; // TODO Implement for this and OldStringRecord
 
     /**
      * @param in the RecordInputstream to read the record from

Added: poi/trunk/src/java/org/apache/poi/hssf/record/OldStringRecord.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/java/org/apache/poi/hssf/record/OldStringRecord.java?rev=1642491&view=auto
==============================================================================
--- poi/trunk/src/java/org/apache/poi/hssf/record/OldStringRecord.java (added)
+++ poi/trunk/src/java/org/apache/poi/hssf/record/OldStringRecord.java Sun Nov 30 00:48:17 2014
@@ -0,0 +1,78 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+
+package org.apache.poi.hssf.record;
+
+
+/**
+ * Biff2 - Biff 4 String Record (0x0007 / 0x0207) - read only support for 
+ *  formula string results.
+ */
+public final class OldStringRecord {
+    public final static short biff2_sid = 0x0007;
+    public final static short biff345_sid = 0x0207;
+
+    private short             sid;
+    private short             field_1_string_len;
+    private byte[]            field_2_bytes;
+    //private XXXXX           codepage; // TODO Implement for this and OldLabelRecord
+
+    /**
+     * @param in the RecordInputstream to read the record from
+     */
+    public OldStringRecord(RecordInputStream in) {
+        sid = in.getSid();
+        
+        if (in.getSid() == biff2_sid) {
+            field_1_string_len  = (short)in.readUByte();
+        } else {
+            field_1_string_len   = in.readShort();
+        }
+
+        // Can only decode properly later when you know the codepage
+        field_2_bytes = new byte[field_1_string_len];
+        in.read(field_2_bytes, 0, field_1_string_len);
+    }
+
+    public boolean isBiff2() {
+        return sid == biff2_sid;
+    }
+
+    public short getSid() {
+        return sid;
+    }
+
+    /**
+     * @return The string represented by this record.
+     */
+    public String getString()
+    {
+        // We really need the codepage here to do this right...
+        return new String(field_2_bytes);
+    }
+
+    public String toString()
+    {
+        StringBuffer buffer = new StringBuffer();
+
+        buffer.append("[OLD STRING]\n");
+        buffer.append("    .string            = ")
+            .append(getString()).append("\n");
+        buffer.append("[/OLD STRING]\n");
+        return buffer.toString();
+    }
+}

Modified: poi/trunk/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java?rev=1642491&r1=1642490&r2=1642491&view=diff
==============================================================================
--- poi/trunk/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java (original)
+++ poi/trunk/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java Sun Nov 30 00:48:17 2014
@@ -46,7 +46,45 @@ public final class TestOldExcelExtractor
         // Check we find a few words we expect in there
         assertTrue(text, text.contains("Size"));
         assertTrue(text, text.contains("Returns"));
+        
+        // Check we find a few numbers we expect in there
+        assertTrue(text, text.contains("11"));
+        assertTrue(text, text.contains("784"));
     }
 
-    // TODO Rest of the tests
+    public void testStrings() {
+        OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls");
+        String text = extractor.getText();
+
+        // Simple strings
+        assertTrue(text, text.contains("Table 10 -- Examination Coverage:"));
+        assertTrue(text, text.contains("Recommended and Average Recommended Additional Tax After"));
+        assertTrue(text, text.contains("Individual income tax returns, total"));
+        
+        // More complicated strings
+        assertTrue(text, text.contains("$100,000 or more"));
+        assertTrue(text, text.contains("S corporation returns, Form 1120S [10,15]"));
+        // TODO Get these quotes working correctly
+//        assertTrue(text, text.contains("individual income tax return “short forms.”"));
+        
+        // Formula based strings
+        // TODO Find some then test
+    }
+
+    public void testFormattedNumbers() {
+        OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls");
+        String text = extractor.getText();
+
+        // Simple numbers
+        assertTrue(text, text.contains("151"));
+        assertTrue(text, text.contains("784"));
+        
+        // Numbers which come from formulas
+        // TODO
+//        assertTrue(text, text.contains("0.40"));
+//        assertTrue(text, text.contains("624"));
+        
+        // Formatted numbers
+        // TODO
+    }
 }



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@poi.apache.org
For additional commands, e-mail: commits-help@poi.apache.org