You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/07/27 14:16:40 UTC
[tika] 06/30: Some SAS7BDAT metadata and unit testing
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 284965e789ec86364c31d471f16d6732c5e4e41d
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Fri Apr 27 17:34:52 2018 +0100
Some SAS7BDAT metadata and unit testing
---
.../org/apache/tika/parser/sas/SAS7BDATParser.java | 56 +++++++++++++++++++++-
.../apache/tika/parser/sas/SAS7BDATParserTest.java | 35 ++++++++++++--
2 files changed, 85 insertions(+), 6 deletions(-)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/sas/SAS7BDATParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/sas/SAS7BDATParser.java
index 4944c12..5992e15 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/sas/SAS7BDATParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/sas/SAS7BDATParser.java
@@ -33,6 +33,7 @@ import org.xml.sax.SAXException;
import com.epam.parso.Column;
import com.epam.parso.DataWriterUtil;
+import com.epam.parso.SasFileProperties;
import com.epam.parso.SasFileReader;
import com.epam.parso.impl.SasFileReaderImpl;
@@ -63,10 +64,61 @@ public class SAS7BDATParser extends AbstractParser {
xhtml.startDocument();
SasFileReader sas = new SasFileReaderImpl(stream);
+ SasFileProperties props = sas.getSasFileProperties();
- // TODO Metadata
+ // Record the interesting parts of the file's metadata
+ metadata.set(TikaCoreProperties.TITLE, props.getName());
+ metadata.set(TikaCoreProperties.CREATED, props.getDateCreated());
+ metadata.set(TikaCoreProperties.MODIFIED, props.getDateModified());
- // Output as a table
+ // TODO What about these?
+/*
+u64 - false
+compressionMethod - null
+endianness - 1
+encoding - windows-1252
+sessionEncoding - null
+fileType - DATA
+sasRelease - 9.0101M3
+serverType - XP_PRO
+osName -
+osType -
+headerLength - 1024
+pageLength - 8192
+pageCount - 1
+rowLength - 96
+rowCount - 31
+mixPageRowCount - 69
+columnsCount - 5
+*/
+
+ // TODO Should we output more Column info as metadata?
+/*
+5 Columns defined:
+ 1 - A
+ Label: A
+ Format: $58.
+ Size 58 of java.lang.String
+ 2 - B
+ Label: B
+ Format:
+ Size 8 of java.lang.Number
+ 3 - C
+ Label: C
+ Format: DATE8.
+ Size 8 of java.lang.Number
+ 4 - D
+ Label: D
+ Format: DATETIME17.
+ Size 8 of java.lang.Number
+ 5 - E
+ Label: E
+ Format:
+ Size 8 of java.lang.Number
+*/
+
+ // Output file contents as a table
+ xhtml.element("h1", props.getName());
xhtml.startElement("table");
xhtml.newline();
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/sas/SAS7BDATParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/sas/SAS7BDATParserTest.java
index 9f57c95..2f29a13 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/sas/SAS7BDATParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/sas/SAS7BDATParserTest.java
@@ -48,8 +48,20 @@ public class SAS7BDATParserTest extends TikaTest {
}
assertEquals("application/x-sas-data", metadata.get(Metadata.CONTENT_TYPE));
- // TODO Test the metadata
- // TODO Test the contents
+ assertEquals("TESTING", metadata.get(TikaCoreProperties.TITLE));
+
+ // Mon Jan 30 07:31:47 GMT 2017
+ assertEquals("2017-01-30T07:31:47Z", metadata.get(TikaCoreProperties.CREATED));
+ assertEquals("2017-01-30T07:31:47Z", metadata.get(TikaCoreProperties.MODIFIED));
+
+ // TODO Test the rest of the metadata
+
+ String content = handler.toString();
+ assertContains("TESTING", content);
+ assertContains("\t3\t", content);
+ assertContains("\t10\t", content);
+ assertContains("\tThis is row", content);
+ assertContains(" of ", content);
}
@Test
@@ -64,9 +76,24 @@ public class SAS7BDATParserTest extends TikaTest {
}
assertEquals("application/x-sas-data", metadata.get(Metadata.CONTENT_TYPE));
- // TODO Test the metadata
- // TODO Test the contents
+ assertEquals("SHEET1", metadata.get(TikaCoreProperties.TITLE));
+
+ // Fri Mar 06 19:10:19 GMT 2015
+ assertEquals("2015-03-06T19:10:19Z", metadata.get(TikaCoreProperties.CREATED));
+ assertEquals("2015-03-06T19:10:19Z", metadata.get(TikaCoreProperties.MODIFIED));
+
+ // TODO Test the rest of the metadata
+
+ String content = handler.toString();
+ assertContains("SHEET1", content);
+ assertContains("A\tB\tC", content);
+ assertContains("Num=0\t", content);
+ assertContains("Num=404242\t", content);
+ assertContains("\t0\t", content);
+ assertContains("\t404242\t", content);
+ assertContains("\t08Feb1904\t", content);
}
// TODO HTML contents unit test
+ // TODO Column names vs labels, with a different test file
}