You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@drill.apache.org by cg...@apache.org on 2020/05/06 02:03:37 UTC
[drill] branch master updated: DRILL-7716: Create Format Plugin for
SPSS Files
This is an automated email from the ASF dual-hosted git repository.
cgivre pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/drill.git
The following commit(s) were added to refs/heads/master by this push:
new 5a067da DRILL-7716: Create Format Plugin for SPSS Files
5a067da is described below
commit 5a067daf114ebdc61334989bfcdc249fb335f68d
Author: Charles Givre <cg...@apache.org>
AuthorDate: Tue May 5 20:34:11 2020 -0400
DRILL-7716: Create Format Plugin for SPSS Files
---
.../drill/exec/store/excel/TestExcelFormat.java | 34 +--
.../drill/exec/store/hdf5/TestHDF5Format.java | 40 +---
contrib/format-spss/README.md | 93 +++++++++
contrib/format-spss/pom.xml | 88 ++++++++
.../drill/exec/store/spss/SpssBatchReader.java | 231 +++++++++++++++++++++
.../drill/exec/store/spss/SpssFormatConfig.java | 78 +++++++
.../drill/exec/store/spss/SpssFormatPlugin.java | 85 ++++++++
.../main/resources/bootstrap-format-plugins.json | 37 ++++
.../src/main/resources/drill-module.conf | 23 ++
.../drill/exec/store/spss/TestSpssReader.java | 163 +++++++++++++++
.../src/test/resources/spss/testdata.sav | Bin 0 -> 14629 bytes
.../native/client/src/protobuf/UserBitShared.pb.cc | 16 +-
.../native/client/src/protobuf/UserBitShared.pb.h | 1 +
contrib/pom.xml | 1 +
distribution/pom.xml | 5 +
distribution/src/assemble/component.xml | 1 +
.../java/org/apache/drill/test/ClusterTest.java | 4 +-
.../java/org/apache/drill/test/QueryTestUtil.java | 40 ++++
pom.xml | 4 +-
.../org/apache/drill/exec/proto/UserBitShared.java | 22 +-
protocol/src/main/protobuf/UserBitShared.proto | 1 +
21 files changed, 879 insertions(+), 88 deletions(-)
diff --git a/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java b/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java
index 5700b40..3eed776 100644
--- a/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java
+++ b/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java
@@ -20,41 +20,27 @@ package org.apache.drill.exec.store.excel;
import org.apache.drill.common.exceptions.DrillRuntimeException;
import org.apache.drill.common.types.TypeProtos;
-import org.apache.drill.exec.ExecTest;
import org.apache.drill.exec.record.metadata.TupleMetadata;
import org.apache.drill.exec.rpc.RpcException;
import org.apache.drill.exec.physical.rowSet.RowSet;
import org.apache.drill.exec.physical.rowSet.RowSetBuilder;
-import org.apache.drill.exec.store.dfs.ZipCodec;
import org.apache.drill.test.ClusterFixture;
import org.apache.drill.test.ClusterTest;
import org.apache.drill.test.QueryBuilder;
import org.apache.drill.test.rowSet.RowSetComparison;
import org.apache.drill.exec.record.metadata.SchemaBuilder;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.CommonConfigurationKeys;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.IOUtils;
-import java.io.FileInputStream;
import java.nio.file.Paths;
-import org.apache.hadoop.io.compress.CompressionCodec;
import org.junit.BeforeClass;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.apache.drill.categories.RowSetTests;
-import org.apache.hadoop.io.compress.CompressionCodecFactory;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.fail;
+import static org.apache.drill.test.QueryTestUtil.generateCompressedFile;
@Category(RowSetTests.class)
public class TestExcelFormat extends ClusterTest {
@@ -399,22 +385,4 @@ public class TestExcelFormat extends ClusterTest {
new RowSetComparison(expected).verifyAndClearAll(results);
}
-
- private void generateCompressedFile(String fileName, String codecName, String outFileName) throws IOException {
- FileSystem fs = ExecTest.getLocalFileSystem();
- Configuration conf = fs.getConf();
- conf.set(CommonConfigurationKeys.IO_COMPRESSION_CODECS_KEY, ZipCodec.class.getCanonicalName());
- CompressionCodecFactory factory = new CompressionCodecFactory(conf);
-
- CompressionCodec codec = factory.getCodecByName(codecName);
- assertNotNull(codecName + " is not found", codec);
-
- Path outFile = new Path(dirTestWatcher.getRootDir().getAbsolutePath(), outFileName);
- Path inFile = new Path(dirTestWatcher.getRootDir().getAbsolutePath(), fileName);
-
- try (InputStream inputStream = new FileInputStream(inFile.toUri().toString());
- OutputStream outputStream = codec.createOutputStream(fs.create(outFile))) {
- IOUtils.copyBytes(inputStream, outputStream, fs.getConf(), false);
- }
- }
}
diff --git a/contrib/format-hdf5/src/test/java/org/apache/drill/exec/store/hdf5/TestHDF5Format.java b/contrib/format-hdf5/src/test/java/org/apache/drill/exec/store/hdf5/TestHDF5Format.java
index a34488b..03ec0a9 100644
--- a/contrib/format-hdf5/src/test/java/org/apache/drill/exec/store/hdf5/TestHDF5Format.java
+++ b/contrib/format-hdf5/src/test/java/org/apache/drill/exec/store/hdf5/TestHDF5Format.java
@@ -20,10 +20,8 @@ package org.apache.drill.exec.store.hdf5;
import org.apache.drill.categories.RowSetTests;
import org.apache.drill.common.types.TypeProtos;
-import org.apache.drill.exec.ExecTest;
import org.apache.drill.exec.record.metadata.TupleMetadata;
import org.apache.drill.exec.rpc.RpcException;
-import org.apache.drill.exec.store.dfs.ZipCodec;
import org.apache.drill.test.ClusterFixtureBuilder;
import org.apache.drill.test.ClusterTest;
import org.apache.drill.exec.physical.rowSet.RowSet;
@@ -31,28 +29,17 @@ import org.apache.drill.exec.physical.rowSet.RowSetBuilder;
import org.apache.drill.test.ClusterFixture;
import org.apache.drill.test.rowSet.RowSetComparison;
import org.apache.drill.exec.record.metadata.SchemaBuilder;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.CommonConfigurationKeys;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.IOUtils;
-import org.apache.hadoop.io.compress.CompressionCodec;
-import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.junit.BeforeClass;
import org.junit.Test;
import org.junit.experimental.categories.Category;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
+import static org.apache.drill.test.QueryTestUtil.generateCompressedFile;
@Category(RowSetTests.class)
public class TestHDF5Format extends ClusterTest {
@@ -908,29 +895,4 @@ public class TestHDF5Format extends ClusterTest {
new RowSetComparison(expected).unorderedVerifyAndClearAll(results);
}
-
- /**
- * Generates a compressed file for testing
- * @param fileName the input file to be compressed
- * @param codecName the CODEC to be used for compression
- * @param outFileName the output file name
- * @throws IOException Throws IO exception if the file cannot be found or any other IO error
- */
- private void generateCompressedFile(String fileName, String codecName, String outFileName) throws IOException {
- FileSystem fs = ExecTest.getLocalFileSystem();
- Configuration conf = fs.getConf();
- conf.set(CommonConfigurationKeys.IO_COMPRESSION_CODECS_KEY, ZipCodec.class.getCanonicalName());
- CompressionCodecFactory factory = new CompressionCodecFactory(conf);
-
- CompressionCodec codec = factory.getCodecByName(codecName);
- assertNotNull(codecName + " is not found", codec);
-
- Path outFile = new Path(dirTestWatcher.getRootDir().getAbsolutePath(), outFileName);
- Path inFile = new Path(dirTestWatcher.getRootDir().getAbsolutePath(), fileName);
-
- try (InputStream inputStream = new FileInputStream(inFile.toUri().toString());
- OutputStream outputStream = codec.createOutputStream(fs.create(outFile))) {
- IOUtils.copyBytes(inputStream, outputStream, fs.getConf(), false);
- }
- }
}
diff --git a/contrib/format-spss/README.md b/contrib/format-spss/README.md
new file mode 100644
index 0000000..75aad3f
--- /dev/null
+++ b/contrib/format-spss/README.md
@@ -0,0 +1,93 @@
+# Format Plugin for SPSS (SAV) Files
+This format plugin enables Apache Drill to read and query data files produced by SPSS (Statistical
+Package for the Social Sciences, or Statistical Product and Service Solutions). According to
+Wikipedia (https://en.wikipedia.org/wiki/SPSS):
+ ***
+ SPSS is a widely used program for statistical analysis in social science. It is also used by market
+ researchers, health researchers, survey companies, government, education researchers, marketing
+ organizations, data miners, and others. The original SPSS manual (Nie, Bent & Hull, 1970) has been
+ described as one of "sociology's most influential books" for allowing ordinary researchers to do their
+ own statistical analysis. In addition to statistical analysis, data management (case selection, file
+ reshaping, creating derived data) and data documentation (a metadata dictionary is stored in the
+ datafile) are features of the base software.
+ ***
+
+## Configuration
+To configure Drill to read SPSS files, add the following code to the formats section of your
+file-based storage plugin. This should happen automatically for the default
+ `cp`, `dfs`, and `s3` storage plugins.
+
+Other than the file extensions, there are no variables to configure.
+
+```json
+"spss": {
+ "type": "spss",
+ "extensions": ["sav"]
+ }
+```
+
+## Data Model
+SPSS supports only two data types: numeric and string. Drill maps these to `DOUBLE` and `VARCHAR`,
+respectively. For some numeric columns, however, SPSS also stores value labels that map each number
+to a piece of text, similar to an `enum` field in Java.
+
+For instance, a field called `Survey` might have labels as shown below:
+
+ <table>
+ <tr>
+ <th>Value</th>
+ <th>Text</th>
+ </tr>
+ <tr>
+ <td>1</td>
+ <td>Yes</td>
+ </tr>
+ <tr>
+ <td>2</td>
+ <td>No</td>
+ </tr>
+ <tr>
+ <td>99</td>
+ <td>No Answer</td>
+ </tr>
+ </table>
+
+For situations like this, Drill creates two columns. In the example above you would get a column
+called `Survey`, which holds the numeric value (1, 2 or 99), and a column called `Survey_value`, which
+holds the text label for that number (or null when the number has no label). The results would look something like this:
+
+ <table>
+ <tr>
+ <th>`Survey`</th>
+ <th>`Survey_value`</th>
+ </tr>
+ <tr>
+ <td>1</td>
+ <td>Yes</td>
+ </tr>
+ <tr>
+ <td>1</td>
+ <td>Yes</td>
+ </tr>
+ <tr>
+ <td>1</td>
+ <td>Yes</td>
+ </tr>
+ <tr>
+ <td>2</td>
+ <td>No</td>
+ </tr>
+ <tr>
+ <td>1</td>
+ <td>Yes</td>
+ </tr>
+ <tr>
+ <td>2</td>
+ <td>No</td>
+ </tr>
+ <tr>
+ <td>99</td>
+ <td>No Answer</td>
+ </tr>
+ </table>
+
\ No newline at end of file
diff --git a/contrib/format-spss/pom.xml b/contrib/format-spss/pom.xml
new file mode 100644
index 0000000..20cbef4
--- /dev/null
+++ b/contrib/format-spss/pom.xml
@@ -0,0 +1,88 @@
+<?xml version="1.0"?>
+<!--
+
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <artifactId>drill-contrib-parent</artifactId>
+ <groupId>org.apache.drill.contrib</groupId>
+ <version>1.18.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>drill-format-spss</artifactId>
+ <name>contrib/format-spss</name>
+
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.drill.exec</groupId>
+ <artifactId>drill-java-exec</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>com.bedatadriven.spss</groupId>
+ <artifactId>spss-reader</artifactId>
+ <version>1.3</version>
+ </dependency>
+
+ <!-- Test dependencies -->
+ <dependency>
+ <groupId>org.apache.drill.exec</groupId>
+ <artifactId>drill-java-exec</artifactId>
+ <classifier>tests</classifier>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.drill</groupId>
+ <artifactId>drill-common</artifactId>
+ <classifier>tests</classifier>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+ <build>
+ <plugins>
+ <plugin>
+ <artifactId>maven-resources-plugin</artifactId>
+ <executions>
+ <execution>
+ <id>copy-java-sources</id>
+ <phase>process-sources</phase>
+ <goals>
+ <goal>copy-resources</goal>
+ </goals>
+ <configuration>
+ <outputDirectory>${basedir}/target/classes/org/apache/drill/exec/store/spss
+ </outputDirectory>
+ <resources>
+ <resource>
+ <directory>src/main/java/org/apache/drill/exec/store/spss</directory>
+ <filtering>true</filtering>
+ </resource>
+ </resources>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
+</project>
diff --git a/contrib/format-spss/src/main/java/org/apache/drill/exec/store/spss/SpssBatchReader.java b/contrib/format-spss/src/main/java/org/apache/drill/exec/store/spss/SpssBatchReader.java
new file mode 100644
index 0000000..8d6ba99
--- /dev/null
+++ b/contrib/format-spss/src/main/java/org/apache/drill/exec/store/spss/SpssBatchReader.java
@@ -0,0 +1,231 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.store.spss;
+
+import com.bedatadriven.spss.SpssDataFileReader;
+import com.bedatadriven.spss.SpssVariable;
+import org.apache.drill.common.AutoCloseables;
+import org.apache.drill.common.exceptions.CustomErrorContext;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.common.types.TypeProtos;
+import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework.FileSchemaNegotiator;
+import org.apache.drill.exec.physical.impl.scan.framework.ManagedReader;
+import org.apache.drill.exec.physical.resultSet.ResultSetLoader;
+import org.apache.drill.exec.physical.resultSet.RowSetLoader;
+import org.apache.drill.exec.record.metadata.SchemaBuilder;
+import org.apache.drill.exec.record.metadata.TupleMetadata;
+import org.apache.drill.exec.vector.accessor.ScalarWriter;
+import org.apache.hadoop.mapred.FileSplit;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+public class SpssBatchReader implements ManagedReader<FileSchemaNegotiator> {
+
+ private static final Logger logger = LoggerFactory.getLogger(SpssBatchReader.class);
+
+ private static final String VALUE_LABEL = "_value";
+
+ private FileSplit split;
+
+ private InputStream fsStream;
+
+ private SpssDataFileReader spssReader;
+
+ private RowSetLoader rowWriter;
+
+ private List<SpssVariable> variableList;
+
+ private List<SpssColumnWriter> writerList;
+
+ private CustomErrorContext errorContext;
+
+
+ public static class SpssReaderConfig {
+
+ protected final SpssFormatPlugin plugin;
+
+ public SpssReaderConfig(SpssFormatPlugin plugin) {
+ this.plugin = plugin;
+ }
+ }
+
+  /**
+   * Opens the SPSS file for this split, declares its schema to the scan framework and creates
+   * the per-column writers.
+   *
+   * @param negotiator supplies the file split, file system, parent error context and the
+   *                   result set loader
+   * @return always {@code true}
+   */
+  @Override
+  public boolean open(FileSchemaNegotiator negotiator) {
+    split = negotiator.split();
+    // Capture the error context before openFile(): its catch block passes errorContext to the
+    // UserException builder, and the field would otherwise still be unassigned at that point.
+    errorContext = negotiator.parentErrorContext();
+    openFile(negotiator);
+    negotiator.tableSchema(buildSchema(), true);
+    ResultSetLoader loader = negotiator.build();
+    rowWriter = loader.writer();
+    buildReaderList();
+
+    return true;
+  }
+
+  /**
+   * Loads rows into the current batch until the row writer reports it is full or the file
+   * runs out of cases.
+   *
+   * @return {@code true} if the batch filled up, {@code false} once no further row could be read
+   */
+  @Override
+  public boolean next() {
+    boolean rowRead = true;
+    while (rowRead && !rowWriter.isFull()) {
+      rowRead = processNextRow();
+    }
+    return rowRead;
+  }
+
+  /** Closes the underlying file stream, if one is open, and drops the reference to it. */
+  @Override
+  public void close() {
+    if (fsStream == null) {
+      return;
+    }
+    AutoCloseables.closeSilently(fsStream);
+    fsStream = null;
+  }
+
+  /**
+   * Opens the file named by {@code split} through the negotiator's file system (via
+   * {@code openPossiblyCompressedStream}) and wraps the resulting stream in an
+   * {@link SpssDataFileReader}. Both are kept in fields: {@code fsStream} and {@code spssReader}.
+   *
+   * @param negotiator schema negotiator that supplies the file system
+   * @throws UserException (data read error) if opening the stream or creating the reader raises
+   *                       an {@link IOException}
+   */
+  private void openFile(FileSchemaNegotiator negotiator) {
+    try {
+      fsStream = negotiator.fileSystem().openPossiblyCompressedStream(split.getPath());
+      spssReader = new SpssDataFileReader(fsStream);
+    } catch (IOException e) {
+      // Report the failing path plus the underlying I/O message to the user
+      throw UserException
+        .dataReadError(e)
+        .message("Unable to open SPSS File %s", split.getPath())
+        .addContext(e.getMessage())
+        .addContext(errorContext)
+        .build(logger);
+    }
+  }
+
+  /**
+   * Reads the next case from the SPSS file and, if there is one, writes it as a single row.
+   *
+   * @return {@code true} if a row was written, {@code false} when the reader has no more cases
+   * @throws UserException (data read error) if reading the case raises an {@link IOException}
+   */
+  private boolean processNextRow() {
+    try {
+      boolean hasCase = spssReader.readNextCase();
+      if (hasCase) {
+        rowWriter.start();
+        for (SpssColumnWriter columnWriter : writerList) {
+          columnWriter.load(spssReader);
+        }
+        rowWriter.save();
+      }
+      // No case means the data is exhausted, so the caller should stop reading
+      return hasCase;
+    } catch (IOException e) {
+      throw UserException
+        .dataReadError(e)
+        .message("Error reading SPSS File.")
+        .addContext(errorContext)
+        .build(logger);
+    }
+  }
+
+  /**
+   * Builds the table schema from the variables reported by the SPSS reader, and stores that
+   * variable list in {@code variableList}. Numeric variables become nullable FLOAT8 columns,
+   * followed by a nullable VARCHAR {@code <name>_value} column when the variable has value
+   * labels; every other variable becomes a nullable VARCHAR column.
+   *
+   * @return the schema describing all columns of the file
+   */
+  private TupleMetadata buildSchema() {
+    SchemaBuilder schema = new SchemaBuilder();
+    variableList = spssReader.getVariables();
+
+    for (SpssVariable variable : variableList) {
+      String name = variable.getVariableName();
+
+      if (!variable.isNumeric()) {
+        schema.addNullable(name, TypeProtos.MinorType.VARCHAR);
+        continue;
+      }
+
+      schema.addNullable(name, TypeProtos.MinorType.FLOAT8);
+
+      // A numeric column with lookups gets a companion column for the label text
+      boolean hasLabels = variable.getValueLabels() != null && variable.getValueLabels().size() > 0;
+      if (hasLabels) {
+        schema.addNullable(name + VALUE_LABEL, TypeProtos.MinorType.VARCHAR);
+      }
+    }
+    return schema.buildSchema();
+  }
+
+  /**
+   * Creates one column writer per SPSS variable, in variable order, and stores them in
+   * {@code writerList}. Numeric variables get a {@code NumericSpssColumnWriter}; all others
+   * get a {@code StringSpssColumnWriter}.
+   */
+  private void buildReaderList() {
+    writerList = new ArrayList<>();
+
+    for (SpssVariable variable : variableList) {
+      boolean numeric = variable.isNumeric();
+      int index = variable.getIndex();
+      String name = variable.getVariableName();
+      SpssColumnWriter columnWriter = numeric
+        ? new NumericSpssColumnWriter(index, name, rowWriter, spssReader)
+        : new StringSpssColumnWriter(index, name, rowWriter);
+      writerList.add(columnWriter);
+    }
+  }
+
+ public abstract static class SpssColumnWriter {
+ final String columnName;
+ final ScalarWriter writer;
+ final int columnIndex;
+
+ public SpssColumnWriter(int columnIndex, String columnName, ScalarWriter writer) {
+ this.columnIndex = columnIndex;
+ this.columnName = columnName;
+ this.writer = writer;
+ }
+
+ public abstract void load (SpssDataFileReader reader);
+ }
+
+ public static class StringSpssColumnWriter extends SpssColumnWriter {
+
+ StringSpssColumnWriter (int columnIndex, String columnName, RowSetLoader rowWriter) {
+ super(columnIndex, columnName, rowWriter.scalar(columnName));
+ }
+
+ @Override
+ public void load(SpssDataFileReader reader) {
+ writer.setString(reader.getStringValue(columnIndex));
+ }
+ }
+
+  /**
+   * Column writer for a numeric SPSS variable. Writes the value as a double and, when the
+   * variable has value labels, also writes the matching label text to the companion
+   * {@code <columnName>_value} column.
+   */
+  public static class NumericSpssColumnWriter extends SpssColumnWriter {
+
+    // Writer for the companion label column; stays null when the variable has no value labels
+    ScalarWriter labelWriter;
+    // Value-to-label lookup for this variable; only assigned together with labelWriter
+    Map<Double, String> labels;
+
+    /**
+     * @param columnIndex index of the variable, used to read values from the SPSS reader
+     * @param columnName  name of the variable and of the Drill column it is written to
+     * @param rowWriter   row writer from which the scalar column writers are obtained
+     * @param reader      SPSS reader queried for the variable's value labels
+     */
+    NumericSpssColumnWriter(int columnIndex, String columnName, RowSetLoader rowWriter, SpssDataFileReader reader) {
+      super(columnIndex, columnName, rowWriter.scalar(columnName));
+
+      // NOTE(review): the presence check looks labels up by column name, while the map that is
+      // kept is fetched by column index - confirm both lookups always refer to the same variable.
+      if (reader.getValueLabels(columnName) != null && reader.getValueLabels(columnName).size() != 0) {
+        labelWriter = rowWriter.scalar(columnName + VALUE_LABEL);
+        labels = reader.getValueLabels(columnIndex);
+      }
+    }
+
+    /**
+     * Writes the current case's value for this column, plus its label when a label column exists.
+     *
+     * @param reader SPSS reader positioned on the case to copy
+     */
+    @Override
+    public void load(SpssDataFileReader reader) {
+      double value = reader.getDoubleValue(columnIndex);
+
+      if (labelWriter != null) {
+        String labelValue = labels.get(value);
+        // A value without a matching label yields a null label rather than an error
+        if (labelValue == null) {
+          labelWriter.setNull();
+        } else {
+          labelWriter.setString(labelValue);
+        }
+      }
+      writer.setDouble(value);
+    }
+  }
+}
diff --git a/contrib/format-spss/src/main/java/org/apache/drill/exec/store/spss/SpssFormatConfig.java b/contrib/format-spss/src/main/java/org/apache/drill/exec/store/spss/SpssFormatConfig.java
new file mode 100644
index 0000000..ad35cd8
--- /dev/null
+++ b/contrib/format-spss/src/main/java/org/apache/drill/exec/store/spss/SpssFormatConfig.java
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.store.spss;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonInclude;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.logical.FormatPluginConfig;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.exec.store.spss.SpssBatchReader.SpssReaderConfig;
+
+import java.util.Collections;
+import java.util.List;
+import java.util.Objects;
+
+@JsonTypeName(SpssFormatPlugin.DEFAULT_NAME)
+@JsonInclude(JsonInclude.Include.NON_DEFAULT)
+public class SpssFormatConfig implements FormatPluginConfig {
+ private final List<String> extensions;
+
+  /**
+   * Creates the format configuration. Omitted properties take reasonable defaults.
+   *
+   * @param extensions file extensions handled by this format; when {@code null}, the single
+   *                   extension {@code "sav"} is used, otherwise an immutable copy is stored
+   */
+  @JsonCreator
+  public SpssFormatConfig(@JsonProperty("extensions") List<String> extensions) {
+    this.extensions = extensions == null ? Collections.singletonList("sav") : ImmutableList.copyOf(extensions);
+  }
+
+ @JsonInclude(JsonInclude.Include.NON_DEFAULT)
+ public List<String> getExtensions() {
+ return extensions;
+ }
+
+ public SpssReaderConfig getReaderConfig(SpssFormatPlugin plugin) {
+ return new SpssReaderConfig(plugin);
+ }
+
+ @Override
+ public int hashCode() {
+ return Objects.hash(extensions);
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj) {
+ return true;
+ }
+ if (obj == null || getClass() != obj.getClass()) {
+ return false;
+ }
+ SpssFormatConfig other = (SpssFormatConfig) obj;
+ return Objects.equals(extensions, other.extensions);
+ }
+
+ @Override
+ public String toString() {
+ return new PlanStringBuilder(this)
+ .field("extensions", extensions)
+ .toString();
+ }
+}
diff --git a/contrib/format-spss/src/main/java/org/apache/drill/exec/store/spss/SpssFormatPlugin.java b/contrib/format-spss/src/main/java/org/apache/drill/exec/store/spss/SpssFormatPlugin.java
new file mode 100644
index 0000000..5e780b2
--- /dev/null
+++ b/contrib/format-spss/src/main/java/org/apache/drill/exec/store/spss/SpssFormatPlugin.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.store.spss;
+
+import org.apache.drill.common.logical.StoragePluginConfig;
+import org.apache.drill.common.types.TypeProtos;
+import org.apache.drill.common.types.Types;
+import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework.FileReaderFactory;
+import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework.FileScanBuilder;
+import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework.FileSchemaNegotiator;
+
+import org.apache.drill.exec.physical.impl.scan.framework.ManagedReader;
+import org.apache.drill.exec.proto.UserBitShared;
+import org.apache.drill.exec.server.DrillbitContext;
+import org.apache.drill.exec.server.options.OptionManager;
+import org.apache.drill.exec.store.dfs.easy.EasyFormatPlugin;
+import org.apache.drill.exec.store.dfs.easy.EasySubScan;
+import org.apache.hadoop.conf.Configuration;
+
+
+public class SpssFormatPlugin extends EasyFormatPlugin<SpssFormatConfig> {
+
+ protected static final String DEFAULT_NAME = "spss";
+
+ private static class SpssReaderFactory extends FileReaderFactory {
+
+ @Override
+ public ManagedReader<? extends FileSchemaNegotiator> newReader() {
+ return new SpssBatchReader();
+ }
+ }
+
+ public SpssFormatPlugin(String name, DrillbitContext context,
+ Configuration fsConf, StoragePluginConfig storageConfig,
+ SpssFormatConfig formatConfig) {
+ super(name, easyConfig(fsConf, formatConfig), context, storageConfig, formatConfig);
+ }
+
+  /**
+   * Builds the {@code EasyFormatConfig} passed to the superclass constructor: a read-only,
+   * non-block-splittable, compressible format with project push-down and the enhanced scan
+   * enabled.
+   *
+   * @param fsConf       file system configuration stored in the resulting config
+   * @param pluginConfig SPSS format configuration that supplies the file extensions
+   * @return the populated configuration
+   */
+  private static EasyFormatConfig easyConfig(Configuration fsConf, SpssFormatConfig pluginConfig) {
+    EasyFormatConfig config = new EasyFormatConfig();
+    config.readable = true;
+    config.writable = false;
+    config.blockSplittable = false;
+    config.compressible = true;
+    config.supportsProjectPushdown = true;
+    config.extensions = pluginConfig.getExtensions();
+    config.fsConf = fsConf;
+    config.defaultName = DEFAULT_NAME;
+    config.readerOperatorType = UserBitShared.CoreOperatorType.SPSS_SUB_SCAN_VALUE;
+    config.useEnhancedScan = true;
+    return config;
+  }
+
+ @Override
+ public ManagedReader<? extends FileSchemaNegotiator> newBatchReader(
+ EasySubScan scan, OptionManager options) {
+ return new SpssBatchReader();
+ }
+
+ @Override
+ protected FileScanBuilder frameworkBuilder(OptionManager options, EasySubScan scan) {
+ FileScanBuilder builder = new FileScanBuilder();
+ builder.setReaderFactory(new SpssReaderFactory());
+
+ initScanBuilder(builder, scan);
+ builder.nullType(Types.optional(TypeProtos.MinorType.VARCHAR));
+ return builder;
+ }
+}
diff --git a/contrib/format-spss/src/main/resources/bootstrap-format-plugins.json b/contrib/format-spss/src/main/resources/bootstrap-format-plugins.json
new file mode 100644
index 0000000..7d4f250
--- /dev/null
+++ b/contrib/format-spss/src/main/resources/bootstrap-format-plugins.json
@@ -0,0 +1,37 @@
+{
+ "storage": {
+ "dfs": {
+ "type": "file",
+ "formats": {
+ "spss": {
+ "type": "spss",
+ "extensions": [
+ "sav"
+ ]
+ }
+ }
+ },
+ "cp": {
+ "type": "file",
+ "formats": {
+ "spss": {
+ "type": "spss",
+ "extensions": [
+ "sav"
+ ]
+ }
+ }
+ },
+ "s3": {
+ "type": "file",
+ "formats": {
+ "spss": {
+ "type": "spss",
+ "extensions": [
+ "sav"
+ ]
+ }
+ }
+ }
+ }
+}
diff --git a/contrib/format-spss/src/main/resources/drill-module.conf b/contrib/format-spss/src/main/resources/drill-module.conf
new file mode 100644
index 0000000..2e35ff7
--- /dev/null
+++ b/contrib/format-spss/src/main/resources/drill-module.conf
@@ -0,0 +1,23 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# This file tells Drill to consider this module when class path scanning.
+# This file can also include any supplementary configuration information.
+# This file is in HOCON format, see https://github.com/typesafehub/config/blob/master/HOCON.md for more information.
+
+drill.classpath.scanning.packages += "org.apache.drill.exec.store.spss"
diff --git a/contrib/format-spss/src/test/java/org/apache/drill/exec/store/spss/TestSpssReader.java b/contrib/format-spss/src/test/java/org/apache/drill/exec/store/spss/TestSpssReader.java
new file mode 100644
index 0000000..b54c4f8
--- /dev/null
+++ b/contrib/format-spss/src/test/java/org/apache/drill/exec/store/spss/TestSpssReader.java
@@ -0,0 +1,163 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.store.spss;
+
+import org.apache.drill.categories.RowSetTests;
+import org.apache.drill.common.types.TypeProtos;
+import org.apache.drill.exec.physical.rowSet.RowSet;
+import org.apache.drill.exec.physical.rowSet.RowSetBuilder;
+import org.apache.drill.exec.record.metadata.SchemaBuilder;
+import org.apache.drill.exec.record.metadata.TupleMetadata;
+import org.apache.drill.test.ClusterFixture;
+import org.apache.drill.test.ClusterTest;
+import org.apache.drill.test.QueryBuilder;
+import org.apache.drill.test.rowSet.RowSetComparison;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+import java.nio.file.Paths;
+
+import static org.junit.Assert.assertEquals;
+import static org.apache.drill.test.QueryTestUtil.generateCompressedFile;
+
+ /**
+  * Tests reading SPSS (.sav) files via dfs, using the spss/testdata.sav sample file.
+  */
+ @Category(RowSetTests.class)
+ public class TestSpssReader extends ClusterTest {
+
+   @BeforeClass
+   public static void setup() throws Exception {
+     ClusterTest.startCluster(ClusterFixture.builder(dirTestWatcher));
+
+     // Every test queries spss/testdata.sav through dfs, so the sample data is copied to the
+     // test root; the compressed-file test additionally writes its .zip copy next to it.
+     dirTestWatcher.copyResourceToRoot(Paths.get("spss/"));
+   }
+
+   /** Verifies the full schema and the three rows with d16=4 returned by a star query. */
+   @Test
+   public void testStarQuery() throws Exception {
+     String sql = "SELECT * FROM dfs.`spss/testdata.sav` WHERE d16=4";
+
+     QueryBuilder q = client.queryBuilder().sql(sql);
+     RowSet results = q.rowSet();
+
+     TupleMetadata expectedSchema = new SchemaBuilder()
+       .addNullable("ID", TypeProtos.MinorType.FLOAT8)
+       .addNullable("Urban", TypeProtos.MinorType.FLOAT8)
+       .addNullable("Urban_value", TypeProtos.MinorType.VARCHAR)
+       .addNullable("District", TypeProtos.MinorType.FLOAT8)
+       .addNullable("District_value", TypeProtos.MinorType.VARCHAR)
+       .addNullable("Province", TypeProtos.MinorType.FLOAT8)
+       .addNullable("Province_value", TypeProtos.MinorType.VARCHAR)
+       .addNullable("Interviewer", TypeProtos.MinorType.FLOAT8)
+       .addNullable("Date", TypeProtos.MinorType.FLOAT8)
+       .addNullable("d6_1", TypeProtos.MinorType.FLOAT8)
+       .addNullable("d6_1_Value", TypeProtos.MinorType.VARCHAR)
+       .addNullable("d6_2", TypeProtos.MinorType.FLOAT8)
+       .addNullable("d6_2_Value", TypeProtos.MinorType.VARCHAR)
+       .addNullable("d6_3", TypeProtos.MinorType.FLOAT8)
+       .addNullable("d6_3_Value", TypeProtos.MinorType.VARCHAR)
+       .addNullable("d6_4", TypeProtos.MinorType.FLOAT8)
+       .addNullable("d6_4_Value", TypeProtos.MinorType.VARCHAR)
+       .addNullable("s_1", TypeProtos.MinorType.VARCHAR)
+       .addNullable("d6_5", TypeProtos.MinorType.FLOAT8)
+       .addNullable("d6_5_Value", TypeProtos.MinorType.VARCHAR)
+       .addNullable("d6_6", TypeProtos.MinorType.FLOAT8)
+       .addNullable("d6_6_Value", TypeProtos.MinorType.VARCHAR)
+       .addNullable("d6_7", TypeProtos.MinorType.FLOAT8)
+       .addNullable("d6_7_Value", TypeProtos.MinorType.VARCHAR)
+       .addNullable("q1", TypeProtos.MinorType.FLOAT8)
+       .addNullable("q1_Value", TypeProtos.MinorType.VARCHAR)
+       .addNullable("q2", TypeProtos.MinorType.FLOAT8)
+       .addNullable("q2_Value", TypeProtos.MinorType.VARCHAR)
+       .addNullable("d7a", TypeProtos.MinorType.FLOAT8)
+       .addNullable("d7a_Value", TypeProtos.MinorType.VARCHAR)
+       .addNullable("d7b", TypeProtos.MinorType.FLOAT8)
+       .addNullable("d7b_Value", TypeProtos.MinorType.VARCHAR)
+       .addNullable("d16", TypeProtos.MinorType.FLOAT8)
+       .addNullable("Stratum", TypeProtos.MinorType.FLOAT8)
+       .addNullable("S1_IP", TypeProtos.MinorType.FLOAT8)
+       .addNullable("S2_IP", TypeProtos.MinorType.FLOAT8)
+       .addNullable("Sample_Weight", TypeProtos.MinorType.FLOAT8)
+       .buildSchema();
+
+     RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+       .addRow(47.0, 1.0, "Urban", 101.0, "Kabul", 1.0, "Kabul", 151.0, 1.34557632E10, 1.0, "Yes", 2.0, "No", 2.0, "No", 2.0, "No", "", 2.0, "No", 2.0, "No", 2.0, "No", 1.0, "Good", 2.0, "The same", 5.0, "Housewife (not working outside of the home)", 97.0, "Not Asked", 4.0, 121.0, 0.007463305415042708, 0.006666666666666667, 20098.33333333333)
+       .addRow(53.0, 1.0, "Urban", 101.0, "Kabul", 1.0, "Kabul", 151.0, 1.34557632E10, 1.0, "Yes", 2.0, "No", 2.0, "No", 2.0, "No", "", 2.0, "No", 2.0, "No", 2.0, "No", 1.0, "Good", 2.0, "The same", 5.0, "Housewife (not working outside of the home)", 97.0, "Not Asked", 4.0, 121.0, 0.007463305415042708, 0.006666666666666667, 20098.33333333333)
+       .addRow(66.0, 1.0, "Urban", 101.0, "Kabul", 1.0, "Kabul", 774.0, 1.34556768E10, 2.0, "No", 1.0, "Yes", 1.0, "Yes", 2.0, "No", "", 2.0, "No", 2.0, "No", 2.0, "No", 1.0, "Good", 1.0, "Better", 1.0, "Working full time", 13.0, "Private Business Sole Proprietor", 4.0, 111.0, 0.017389288198469743, 0.006666666666666667, 8626.0)
+       .build();
+
+     assertEquals(3, results.rowCount());
+
+     new RowSetComparison(expected).verifyAndClearAll(results);
+   }
+
+   /** Projects three named columns from the uncompressed sample file. */
+   @Test
+   public void testExplicitQuery() throws Exception {
+     verifyExplicitProjection("SELECT ID, Urban, Urban_value FROM dfs.`spss/testdata.sav` WHERE d16=4");
+   }
+
+   /** Re-runs COUNT(*) from its JSON plan (explainJson, then physical) and expects 25 rows. */
+   @Test
+   public void testSerDe() throws Exception {
+     String sql = "SELECT COUNT(*) FROM dfs.`spss/testdata.sav`";
+     String plan = queryBuilder().sql(sql).explainJson();
+     long cnt = queryBuilder().physical(plan).singletonLong();
+     assertEquals("Counts should match", 25L, cnt);
+   }
+
+   /** Same projection as testExplicitQuery, read from a freshly generated zip copy of the file. */
+   @Test
+   public void testExplicitQueryWithCompressedFile() throws Exception {
+     generateCompressedFile("spss/testdata.sav", "zip", "spss/testdata.sav.zip");
+
+     verifyExplicitProjection("SELECT ID, Urban, Urban_value FROM dfs.`spss/testdata.sav.zip` WHERE d16=4");
+   }
+
+   /**
+    * Runs a query that projects ID, Urban and Urban_value for the rows with d16=4 and
+    * checks the result against the three expected rows.
+    *
+    * @param sql query to execute
+    * @throws Exception if the query cannot be executed
+    */
+   private void verifyExplicitProjection(String sql) throws Exception {
+     RowSet results = client.queryBuilder().sql(sql).rowSet();
+
+     TupleMetadata expectedSchema = new SchemaBuilder()
+       .addNullable("ID", TypeProtos.MinorType.FLOAT8)
+       .addNullable("Urban", TypeProtos.MinorType.FLOAT8)
+       .addNullable("Urban_value", TypeProtos.MinorType.VARCHAR)
+       .buildSchema();
+
+     RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+       .addRow(47.0, 1.0, "Urban")
+       .addRow(53.0, 1.0, "Urban")
+       .addRow(66.0, 1.0, "Urban")
+       .build();
+
+     assertEquals(3, results.rowCount());
+
+     new RowSetComparison(expected).verifyAndClearAll(results);
+   }
+ }
diff --git a/contrib/format-spss/src/test/resources/spss/testdata.sav b/contrib/format-spss/src/test/resources/spss/testdata.sav
new file mode 100644
index 0000000..0351d9f
Binary files /dev/null and b/contrib/format-spss/src/test/resources/spss/testdata.sav differ
diff --git a/contrib/native/client/src/protobuf/UserBitShared.pb.cc b/contrib/native/client/src/protobuf/UserBitShared.pb.cc
index ed13f3e..bac074a 100644
--- a/contrib/native/client/src/protobuf/UserBitShared.pb.cc
+++ b/contrib/native/client/src/protobuf/UserBitShared.pb.cc
@@ -956,7 +956,7 @@ const char descriptor_table_protodef_UserBitShared_2eproto[] PROTOBUF_SECTION_VA
"ATEMENT\020\005*\207\001\n\rFragmentState\022\013\n\007SENDING\020\000"
"\022\027\n\023AWAITING_ALLOCATION\020\001\022\013\n\007RUNNING\020\002\022\014"
"\n\010FINISHED\020\003\022\r\n\tCANCELLED\020\004\022\n\n\006FAILED\020\005\022"
- "\032\n\026CANCELLATION_REQUESTED\020\006*\367\n\n\020CoreOper"
+ "\032\n\026CANCELLATION_REQUESTED\020\006*\212\013\n\020CoreOper"
"atorType\022\021\n\rSINGLE_SENDER\020\000\022\024\n\020BROADCAST"
"_SENDER\020\001\022\n\n\006FILTER\020\002\022\022\n\016HASH_AGGREGATE\020"
"\003\022\r\n\tHASH_JOIN\020\004\022\016\n\nMERGE_JOIN\020\005\022\031\n\025HASH"
@@ -991,11 +991,12 @@ const char descriptor_table_protodef_UserBitShared_2eproto[] PROTOBUF_SECTION_VA
"MERGE\020=\022\021\n\rLTSV_SUB_SCAN\020>\022\021\n\rHDF5_SUB_S"
"CAN\020\?\022\022\n\016EXCEL_SUB_SCAN\020@\022\020\n\014SHP_SUB_SCA"
"N\020A\022\024\n\020METADATA_HANDLER\020B\022\027\n\023METADATA_CO"
- "NTROLLER\020C\022\021\n\rHTTP_SUB_SCAN\020F*g\n\nSaslSta"
- "tus\022\020\n\014SASL_UNKNOWN\020\000\022\016\n\nSASL_START\020\001\022\024\n"
- "\020SASL_IN_PROGRESS\020\002\022\020\n\014SASL_SUCCESS\020\003\022\017\n"
- "\013SASL_FAILED\020\004B.\n\033org.apache.drill.exec."
- "protoB\rUserBitSharedH\001"
+ "NTROLLER\020C\022\021\n\rSPSS_SUB_SCAN\020E\022\021\n\rHTTP_SU"
+ "B_SCAN\020F*g\n\nSaslStatus\022\020\n\014SASL_UNKNOWN\020\000"
+ "\022\016\n\nSASL_START\020\001\022\024\n\020SASL_IN_PROGRESS\020\002\022\020"
+ "\n\014SASL_SUCCESS\020\003\022\017\n\013SASL_FAILED\020\004B.\n\033org"
+ ".apache.drill.exec.protoB\rUserBitSharedH"
+ "\001"
;
static const ::PROTOBUF_NAMESPACE_ID::internal::DescriptorTable*const descriptor_table_UserBitShared_2eproto_deps[3] = {
&::descriptor_table_Coordination_2eproto,
@@ -1029,7 +1030,7 @@ static ::PROTOBUF_NAMESPACE_ID::internal::SCCInfoBase*const descriptor_table_Use
static ::PROTOBUF_NAMESPACE_ID::internal::once_flag descriptor_table_UserBitShared_2eproto_once;
static bool descriptor_table_UserBitShared_2eproto_initialized = false;
const ::PROTOBUF_NAMESPACE_ID::internal::DescriptorTable descriptor_table_UserBitShared_2eproto = {
- &descriptor_table_UserBitShared_2eproto_initialized, descriptor_table_protodef_UserBitShared_2eproto, "UserBitShared.proto", 5782,
+ &descriptor_table_UserBitShared_2eproto_initialized, descriptor_table_protodef_UserBitShared_2eproto, "UserBitShared.proto", 5801,
&descriptor_table_UserBitShared_2eproto_once, descriptor_table_UserBitShared_2eproto_sccs, descriptor_table_UserBitShared_2eproto_deps, 22, 3,
schemas, file_default_instances, TableStruct_UserBitShared_2eproto::offsets,
file_level_metadata_UserBitShared_2eproto, 22, file_level_enum_descriptors_UserBitShared_2eproto, file_level_service_descriptors_UserBitShared_2eproto,
@@ -1265,6 +1266,7 @@ bool CoreOperatorType_IsValid(int value) {
case 65:
case 66:
case 67:
+ case 69:
case 70:
return true;
default:
diff --git a/contrib/native/client/src/protobuf/UserBitShared.pb.h b/contrib/native/client/src/protobuf/UserBitShared.pb.h
index 0cc48d8..95cdb20 100644
--- a/contrib/native/client/src/protobuf/UserBitShared.pb.h
+++ b/contrib/native/client/src/protobuf/UserBitShared.pb.h
@@ -390,6 +390,7 @@ enum CoreOperatorType : int {
SHP_SUB_SCAN = 65,
METADATA_HANDLER = 66,
METADATA_CONTROLLER = 67,
+ SPSS_SUB_SCAN = 69,
HTTP_SUB_SCAN = 70
};
bool CoreOperatorType_IsValid(int value);
diff --git a/contrib/pom.xml b/contrib/pom.xml
index 15f3dd5..d81f717 100644
--- a/contrib/pom.xml
+++ b/contrib/pom.xml
@@ -48,6 +48,7 @@
<module>format-excel</module>
<module>format-esri</module>
<module>format-hdf5</module>
+ <module>format-spss</module>
<module>storage-hive</module>
<module>storage-mongo</module>
<module>storage-jdbc</module>
diff --git a/distribution/pom.xml b/distribution/pom.xml
index ddffb44..8b8369a 100644
--- a/distribution/pom.xml
+++ b/distribution/pom.xml
@@ -349,6 +349,11 @@
</dependency>
<dependency>
<groupId>org.apache.drill.contrib</groupId>
+ <artifactId>drill-format-spss</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.drill.contrib</groupId>
<artifactId>drill-format-ltsv</artifactId>
<version>${project.version}</version>
</dependency>
diff --git a/distribution/src/assemble/component.xml b/distribution/src/assemble/component.xml
index 05fd203..01e371e 100644
--- a/distribution/src/assemble/component.xml
+++ b/distribution/src/assemble/component.xml
@@ -47,6 +47,7 @@
<include>org.apache.drill.contrib:drill-format-hdf5:jar</include>
<include>org.apache.drill.contrib:drill-format-ltsv:jar</include>
<include>org.apache.drill.contrib:drill-format-excel:jar</include>
+ <include>org.apache.drill.contrib:drill-format-spss:jar</include>
<include>org.apache.drill.contrib:drill-jdbc-storage:jar</include>
<include>org.apache.drill.contrib:drill-kudu-storage:jar</include>
<include>org.apache.drill.contrib:drill-storage-kafka:jar</include>
diff --git a/exec/java-exec/src/test/java/org/apache/drill/test/ClusterTest.java b/exec/java-exec/src/test/java/org/apache/drill/test/ClusterTest.java
index 7ef6198..43f1396 100644
--- a/exec/java-exec/src/test/java/org/apache/drill/test/ClusterTest.java
+++ b/exec/java-exec/src/test/java/org/apache/drill/test/ClusterTest.java
@@ -17,12 +17,12 @@
*/
package org.apache.drill.test;
-import java.io.IOException;
-
import org.apache.drill.common.AutoCloseables;
import org.junit.AfterClass;
import org.junit.ClassRule;
+import java.io.IOException;
+
/**
* Base class for tests that use a single cluster fixture for a set of
* tests. Extend your test case directly from {@link DrillTest} if you
diff --git a/exec/java-exec/src/test/java/org/apache/drill/test/QueryTestUtil.java b/exec/java-exec/src/test/java/org/apache/drill/test/QueryTestUtil.java
index 0d3e436..86ec9ca 100644
--- a/exec/java-exec/src/test/java/org/apache/drill/test/QueryTestUtil.java
+++ b/exec/java-exec/src/test/java/org/apache/drill/test/QueryTestUtil.java
@@ -17,7 +17,10 @@
*/
package org.apache.drill.test;
+import java.io.FileInputStream;
import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
import java.net.BindException;
import java.net.ServerSocket;
import java.util.List;
@@ -27,6 +30,7 @@ import java.util.regex.Pattern;
import org.apache.drill.common.config.DrillConfig;
import org.apache.drill.exec.ExecConstants;
+import org.apache.drill.exec.ExecTest;
import org.apache.drill.exec.client.DrillClient;
import org.apache.drill.exec.client.LoggingResultsListener;
import org.apache.drill.exec.client.QuerySubmitter.Format;
@@ -43,10 +47,21 @@ import org.apache.drill.exec.server.RemoteServiceSet;
import org.apache.drill.exec.server.options.OptionManager;
import org.apache.drill.exec.server.options.OptionValue;
import org.apache.drill.exec.server.options.SystemOptionManager;
+import org.apache.drill.exec.store.dfs.ZipCodec;
import org.apache.drill.exec.util.VectorUtil;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.CommonConfigurationKeys;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.io.compress.CompressionCodec;
+import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import static org.apache.drill.test.ClusterTest.dirTestWatcher;
+import static org.junit.Assert.assertNotNull;
+
/**
* Utilities useful for tests that issue SQL queries.
*/
@@ -312,4 +327,29 @@ public class QueryTestUtil {
throw new BindException(String.format("Free port could not be found in the range [%s-%s].\n" +
"Please release any of used ports in this range.", portNumber, portNumber + numberOfAttempts));
}
+
+   /**
+    * Compresses a file under the test root directory ({@code dirTestWatcher}) with the named codec.
+    * @param fileName name of the input file, relative to the test root directory
+    * @param codecName name of the codec to look up, e.g. {@code "zip"}
+    * @param outFileName name of the compressed file to create, relative to the test root directory
+    * @throws IOException if the input cannot be read or the compressed output cannot be written
+    */
+   public static void generateCompressedFile(String fileName, String codecName, String outFileName) throws IOException {
+     FileSystem fs = ExecTest.getLocalFileSystem();
+     Configuration conf = fs.getConf();
+     conf.set(CommonConfigurationKeys.IO_COMPRESSION_CODECS_KEY, ZipCodec.class.getCanonicalName());
+     CompressionCodec codec = new CompressionCodecFactory(conf).getCodecByName(codecName);
+     assertNotNull(codecName + " is not found", codec);
+
+     String rootDir = dirTestWatcher.getRootDir().getAbsolutePath();
+     Path outFile = new Path(rootDir, outFileName);
+     Path inFile = new Path(rootDir, fileName);
+
+     // Use the decoded path: URI.toString() percent-encodes characters such as spaces, so it is not a usable file name.
+     try (InputStream inputStream = new FileInputStream(inFile.toUri().getPath());
+          OutputStream outputStream = codec.createOutputStream(fs.create(outFile))) {
+       IOUtils.copyBytes(inputStream, outputStream, conf, false);
+     }
+   }
}
diff --git a/pom.xml b/pom.xml
index 9d99fc5..5329d58 100644
--- a/pom.xml
+++ b/pom.xml
@@ -170,7 +170,7 @@
<repository>
<id>conjars</id>
<name>Conjars</name>
- <url>http://conjars.org/repo</url>
+ <url>https://conjars.org/repo</url>
<layout>default</layout>
<releases>
<enabled>true</enabled>
@@ -359,6 +359,7 @@
<exclude>**/*.pcap</exclude>
<exclude>**/*.log1</exclude>
<exclude>**/*.log2</exclude>
+ <exclude>**/*.sav</exclude>
<exclude>**/*.h5</exclude>
<exclude>**/*.sqllog</exclude>
<exclude>**/*.sqllog2</exclude>
@@ -665,6 +666,7 @@
<exclude>**/*.woff2</exclude>
<exclude>**/*.ks</exclude>
<exclude>**/*.pcap</exclude>
+ <exclude>**/*.sav</exclude>
<exclude>**/*.log1</exclude>
<exclude>**/*.log2</exclude>
<exclude>**/*.h5</exclude>
diff --git a/protocol/src/main/java/org/apache/drill/exec/proto/UserBitShared.java b/protocol/src/main/java/org/apache/drill/exec/proto/UserBitShared.java
index 44906c3..1510a46 100644
--- a/protocol/src/main/java/org/apache/drill/exec/proto/UserBitShared.java
+++ b/protocol/src/main/java/org/apache/drill/exec/proto/UserBitShared.java
@@ -686,6 +686,10 @@ public final class UserBitShared {
*/
METADATA_CONTROLLER(67),
/**
+ * <code>SPSS_SUB_SCAN = 69;</code>
+ */
+ SPSS_SUB_SCAN(69),
+ /**
* <code>HTTP_SUB_SCAN = 70;</code>
*/
HTTP_SUB_SCAN(70),
@@ -964,6 +968,10 @@ public final class UserBitShared {
*/
public static final int METADATA_CONTROLLER_VALUE = 67;
/**
+ * <code>SPSS_SUB_SCAN = 69;</code>
+ */
+ public static final int SPSS_SUB_SCAN_VALUE = 69;
+ /**
* <code>HTTP_SUB_SCAN = 70;</code>
*/
public static final int HTTP_SUB_SCAN_VALUE = 70;
@@ -1057,6 +1065,7 @@ public final class UserBitShared {
case 65: return SHP_SUB_SCAN;
case 66: return METADATA_HANDLER;
case 67: return METADATA_CONTROLLER;
+ case 69: return SPSS_SUB_SCAN;
case 70: return HTTP_SUB_SCAN;
default: return null;
}
@@ -29037,7 +29046,7 @@ public final class UserBitShared {
"ATEMENT\020\005*\207\001\n\rFragmentState\022\013\n\007SENDING\020\000" +
"\022\027\n\023AWAITING_ALLOCATION\020\001\022\013\n\007RUNNING\020\002\022\014" +
"\n\010FINISHED\020\003\022\r\n\tCANCELLED\020\004\022\n\n\006FAILED\020\005\022" +
- "\032\n\026CANCELLATION_REQUESTED\020\006*\367\n\n\020CoreOper" +
+ "\032\n\026CANCELLATION_REQUESTED\020\006*\212\013\n\020CoreOper" +
"atorType\022\021\n\rSINGLE_SENDER\020\000\022\024\n\020BROADCAST" +
"_SENDER\020\001\022\n\n\006FILTER\020\002\022\022\n\016HASH_AGGREGATE\020" +
"\003\022\r\n\tHASH_JOIN\020\004\022\016\n\nMERGE_JOIN\020\005\022\031\n\025HASH" +
@@ -29072,11 +29081,12 @@ public final class UserBitShared {
"MERGE\020=\022\021\n\rLTSV_SUB_SCAN\020>\022\021\n\rHDF5_SUB_S" +
"CAN\020?\022\022\n\016EXCEL_SUB_SCAN\020@\022\020\n\014SHP_SUB_SCA" +
"N\020A\022\024\n\020METADATA_HANDLER\020B\022\027\n\023METADATA_CO" +
- "NTROLLER\020C\022\021\n\rHTTP_SUB_SCAN\020F*g\n\nSaslSta" +
- "tus\022\020\n\014SASL_UNKNOWN\020\000\022\016\n\nSASL_START\020\001\022\024\n" +
- "\020SASL_IN_PROGRESS\020\002\022\020\n\014SASL_SUCCESS\020\003\022\017\n" +
- "\013SASL_FAILED\020\004B.\n\033org.apache.drill.exec." +
- "protoB\rUserBitSharedH\001"
+ "NTROLLER\020C\022\021\n\rSPSS_SUB_SCAN\020E\022\021\n\rHTTP_SU" +
+ "B_SCAN\020F*g\n\nSaslStatus\022\020\n\014SASL_UNKNOWN\020\000" +
+ "\022\016\n\nSASL_START\020\001\022\024\n\020SASL_IN_PROGRESS\020\002\022\020" +
+ "\n\014SASL_SUCCESS\020\003\022\017\n\013SASL_FAILED\020\004B.\n\033org" +
+ ".apache.drill.exec.protoB\rUserBitSharedH" +
+ "\001"
};
descriptor = com.google.protobuf.Descriptors.FileDescriptor
.internalBuildGeneratedFileFrom(descriptorData,
diff --git a/protocol/src/main/protobuf/UserBitShared.proto b/protocol/src/main/protobuf/UserBitShared.proto
index c51cc66..3b99255 100644
--- a/protocol/src/main/protobuf/UserBitShared.proto
+++ b/protocol/src/main/protobuf/UserBitShared.proto
@@ -379,6 +379,7 @@ enum CoreOperatorType {
SHP_SUB_SCAN = 65;
METADATA_HANDLER = 66;
METADATA_CONTROLLER = 67;
+ SPSS_SUB_SCAN = 69;
HTTP_SUB_SCAN = 70;
}