You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/07/27 14:16:40 UTC

[tika] 06/30: Some SAS7BDAT metadata and unit testing

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 284965e789ec86364c31d471f16d6732c5e4e41d
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Fri Apr 27 17:34:52 2018 +0100

    Some SAS7BDAT metadata and unit testing
---
 .../org/apache/tika/parser/sas/SAS7BDATParser.java | 56 +++++++++++++++++++++-
 .../apache/tika/parser/sas/SAS7BDATParserTest.java | 35 ++++++++++++--
 2 files changed, 85 insertions(+), 6 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/sas/SAS7BDATParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/sas/SAS7BDATParser.java
index 4944c12..5992e15 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/sas/SAS7BDATParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/sas/SAS7BDATParser.java
@@ -33,6 +33,7 @@ import org.xml.sax.SAXException;
 
 import com.epam.parso.Column;
 import com.epam.parso.DataWriterUtil;
+import com.epam.parso.SasFileProperties;
 import com.epam.parso.SasFileReader;
 import com.epam.parso.impl.SasFileReaderImpl;
 
@@ -63,10 +64,61 @@ public class SAS7BDATParser extends AbstractParser {
         xhtml.startDocument();
         
         SasFileReader sas = new SasFileReaderImpl(stream);
+        SasFileProperties props = sas.getSasFileProperties();
 
-        // TODO Metadata
+        // Record the interesting parts of the file's metadata
+        metadata.set(TikaCoreProperties.TITLE, props.getName());
+        metadata.set(TikaCoreProperties.CREATED, props.getDateCreated());
+        metadata.set(TikaCoreProperties.MODIFIED, props.getDateModified());
 
-        // Output as a table
+        // TODO What about these?
+/*
+u64 - false
+compressionMethod - null
+endianness - 1
+encoding - windows-1252
+sessionEncoding - null
+fileType - DATA
+sasRelease - 9.0101M3
+serverType - XP_PRO
+osName - 
+osType - 
+headerLength - 1024
+pageLength - 8192
+pageCount - 1
+rowLength - 96
+rowCount - 31
+mixPageRowCount - 69
+columnsCount - 5
+*/
+
+        // TODO Should we output more Column info as metadata?
+/*
+5 Columns defined:
+ 1 - A
+  Label: A
+  Format: $58.
+  Size 58 of java.lang.String
+ 2 - B
+  Label: B
+  Format: 
+  Size 8 of java.lang.Number
+ 3 - C
+  Label: C
+  Format: DATE8.
+  Size 8 of java.lang.Number
+ 4 - D
+  Label: D
+  Format: DATETIME17.
+  Size 8 of java.lang.Number
+ 5 - E
+  Label: E
+  Format: 
+  Size 8 of java.lang.Number
+*/
+
+        // Output file contents as a table
+        xhtml.element("h1", props.getName());
         xhtml.startElement("table");
         xhtml.newline();
         
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/sas/SAS7BDATParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/sas/SAS7BDATParserTest.java
index 9f57c95..2f29a13 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/sas/SAS7BDATParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/sas/SAS7BDATParserTest.java
@@ -48,8 +48,20 @@ public class SAS7BDATParserTest extends TikaTest {
         }
 
         assertEquals("application/x-sas-data", metadata.get(Metadata.CONTENT_TYPE));
-        // TODO Test the metadata
-        // TODO Test the contents
+        assertEquals("TESTING", metadata.get(TikaCoreProperties.TITLE));
+
+        // Mon Jan 30 07:31:47 GMT 2017
+        assertEquals("2017-01-30T07:31:47Z", metadata.get(TikaCoreProperties.CREATED));
+        assertEquals("2017-01-30T07:31:47Z", metadata.get(TikaCoreProperties.MODIFIED));
+        
+        // TODO Test the rest of the metadata
+        
+        String content = handler.toString();
+        assertContains("TESTING", content);
+        assertContains("\t3\t", content);
+        assertContains("\t10\t", content);
+        assertContains("\tThis is row", content);
+        assertContains(" of ", content);
     }
     
     @Test
@@ -64,9 +76,24 @@ public class SAS7BDATParserTest extends TikaTest {
         }
 
         assertEquals("application/x-sas-data", metadata.get(Metadata.CONTENT_TYPE));
-        // TODO Test the metadata
-        // TODO Test the contents
+        assertEquals("SHEET1", metadata.get(TikaCoreProperties.TITLE));
+
+        // Fri Mar 06 19:10:19 GMT 2015
+        assertEquals("2015-03-06T19:10:19Z", metadata.get(TikaCoreProperties.CREATED));
+        assertEquals("2015-03-06T19:10:19Z", metadata.get(TikaCoreProperties.MODIFIED));
+        
+        // TODO Test the rest of the metadata
+        
+        String content = handler.toString();
+        assertContains("SHEET1", content);
+        assertContains("A\tB\tC", content);
+        assertContains("Num=0\t", content);
+        assertContains("Num=404242\t", content);
+        assertContains("\t0\t", content);
+        assertContains("\t404242\t", content);
+        assertContains("\t08Feb1904\t", content);
     }
 
     // TODO HTML contents unit test
+    // TODO Column names vs labels, with a different test file
 }