Posted to commits@metamodel.apache.org by ka...@apache.org on 2019/03/25 02:29:57 UTC

[metamodel] 03/09: METAMODEL-1210: Added support for applying date format.

This is an automated email from the ASF dual-hosted git repository.

kaspersor pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/metamodel.git

commit fa98e2958709a85b53280e5cb592c820d0a779e8
Author: Kasper Sørensen <i....@gmail.com>
AuthorDate: Wed Mar 13 03:10:55 2019 -0700

    METAMODEL-1210: Added support for applying date format.
---
 .../org/apache/metamodel/arff/ArffDataContext.java |  6 +-
 .../org/apache/metamodel/arff/ArffDataSet.java     | 72 +++++++++++++---------
 2 files changed, 46 insertions(+), 32 deletions(-)
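
In ARFF, a date attribute can carry an explicit format string, and this patch makes the reader honor that format instead of always assuming "yyyy-MM-dd". Below is a minimal sketch of the SimpleDateFormat handling the patch relies on; the remarks string and the sample value are made up for illustration, and the exact remarks produced by ArffDataContext are not shown in full in this diff.

    import java.text.ParseException;
    import java.text.SimpleDateFormat;
    import java.util.Date;

    public class DateFormatSketch {
        public static void main(String[] args) throws ParseException {
            // hypothetical column remarks for a date attribute, shaped as convertValue() expects
            String columnRemarks = "date yyyy-MM-dd HH:mm:ss";

            final SimpleDateFormat dateFormat;
            if (columnRemarks.toLowerCase().startsWith("date ")) {
                // the date format follows the "date " prefix, mirroring the patch
                dateFormat = new SimpleDateFormat(columnRemarks.substring(5));
            } else {
                // otherwise fall back to the previously hard-coded format
                dateFormat = new SimpleDateFormat("yyyy-MM-dd");
            }

            // made-up sample value matching the format above
            Date parsed = dateFormat.parse("2019-03-13 03:10:55");
            System.out.println(parsed);
        }
    }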

diff --git a/arff/src/main/java/org/apache/metamodel/arff/ArffDataContext.java b/arff/src/main/java/org/apache/metamodel/arff/ArffDataContext.java
index 6d4f5ed..0b4ca5d 100644
--- a/arff/src/main/java/org/apache/metamodel/arff/ArffDataContext.java
+++ b/arff/src/main/java/org/apache/metamodel/arff/ArffDataContext.java
@@ -55,7 +55,7 @@ public class ArffDataContext extends QueryPostprocessDataContext {
     private static final String SECTION_ANNOTATION_DATA = "@data";
     private static final Charset CHARSET = FileHelper.UTF_8_CHARSET;
     private static final Pattern ATTRIBUTE_DEF_W_DATATYPE_PARAM =
-            Pattern.compile("\\'?(.+)\\'?\\s+([a-zA-Z]+)\\s+\\'?(.+)\\'?");
+            Pattern.compile("\\'?(.+)\\'?\\s+([a-zA-Z]+)\\s+\\'(.+)\\'");
 
     private final Splitter whitespaceSplitter = Splitter.on(CharMatcher.whitespace()).trimResults().omitEmptyStrings();
 
@@ -188,7 +188,7 @@ public class ArffDataContext extends QueryPostprocessDataContext {
                 if (isIgnoreLine(line)) {
                     continue;
                 }
-                if (line.equals(SECTION_ANNOTATION_DATA)) {
+                if (line.toLowerCase().equals(SECTION_ANNOTATION_DATA)) {
                     // start of the data
                     break;
                 }
@@ -196,7 +196,7 @@ public class ArffDataContext extends QueryPostprocessDataContext {
         } catch (IOException e) {
             throw new UncheckedIOException(e);
         }
-        final ArffDataSet dataSet = new ArffDataSet(columns, reader);
+        final ArffDataSet dataSet = new ArffDataSet(resource, columns, reader);
         if (maxRows > -1) {
             return new MaxRowsDataSet(dataSet, maxRows);
         } else {
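
The tightened pattern above makes the single quotes around the datatype parameter (such as a date format) mandatory, so group 3 captures the format string without its quotes; the @data marker is also now matched case-insensitively. A hedged illustration of what the new pattern captures follows; the attribute definition is hypothetical, and whether ArffDataContext feeds it the whole line or only the text after "@attribute" is not visible in this hunk.

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class AttributePatternSketch {
        // the pattern as changed by this commit
        private static final Pattern ATTRIBUTE_DEF_W_DATATYPE_PARAM =
                Pattern.compile("\\'?(.+)\\'?\\s+([a-zA-Z]+)\\s+\\'(.+)\\'");

        public static void main(String[] args) {
            // hypothetical attribute definition with a quoted date format parameter
            String def = "timestamp date 'yyyy-MM-dd HH:mm:ss'";

            Matcher matcher = ATTRIBUTE_DEF_W_DATATYPE_PARAM.matcher(def);
            if (matcher.matches()) {
                System.out.println(matcher.group(1)); // timestamp
                System.out.println(matcher.group(2)); // date
                System.out.println(matcher.group(3)); // yyyy-MM-dd HH:mm:ss
            }
        }
    }
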
diff --git a/arff/src/main/java/org/apache/metamodel/arff/ArffDataSet.java b/arff/src/main/java/org/apache/metamodel/arff/ArffDataSet.java
index abf1e5d..2d1df07 100644
--- a/arff/src/main/java/org/apache/metamodel/arff/ArffDataSet.java
+++ b/arff/src/main/java/org/apache/metamodel/arff/ArffDataSet.java
@@ -15,6 +15,7 @@ import org.apache.metamodel.query.SelectItem;
 import org.apache.metamodel.schema.Column;
 import org.apache.metamodel.schema.ColumnType;
 import org.apache.metamodel.util.NumberComparator;
+import org.apache.metamodel.util.Resource;
 
 import com.opencsv.CSVParser;
 import com.opencsv.ICSVParser;
@@ -22,16 +23,16 @@ import com.opencsv.ICSVParser;
 public class ArffDataSet extends AbstractDataSet {
 
     private final ICSVParser csvParser = new CSVParser(',', '\'');
+    private final Resource resource;
     private final BufferedReader reader;
-    private final int[] valueIndices;
-    private final ColumnType[] valueTypes;
+    private final List<Column> columns;
 
     private String line;
 
-    public ArffDataSet(List<Column> columns, BufferedReader reader) {
+    public ArffDataSet(Resource resource, List<Column> columns, BufferedReader reader) {
         super(columns.stream().map(c -> new SelectItem(c)).collect(Collectors.toList()));
-        this.valueIndices = columns.stream().mapToInt(Column::getColumnNumber).toArray();
-        this.valueTypes = columns.stream().map(Column::getType).toArray(ColumnType[]::new);
+        this.resource = resource;
+        this.columns = columns;
         this.reader = reader;
     }
 
@@ -57,38 +58,51 @@ public class ArffDataSet extends AbstractDataSet {
         try {
             stringValues = csvParser.parseLine(line);
         } catch (IOException e) {
-            throw new UncheckedIOException(e);
+            throw new UncheckedIOException(resource.getName() + ": Failed to CSV-parse data line: " + line, e);
         }
 
-        final Object[] values = new Object[valueIndices.length];
-        for (int i = 0; i < valueIndices.length; i++) {
-            final int index = valueIndices[i];
+        final Object[] values = new Object[columns.size()];
+        for (int i = 0; i < values.length; i++) {
+            final Column column = columns.get(i);
+            final int index = column.getColumnNumber();
             final String stringValue = stringValues[index];
-            final ColumnType type = valueTypes[i];
-            if (type.isNumber()) {
-                if (stringValue.isEmpty() || "?".equals(stringValue)) {
-                    values[i] = null;
+            values[i] = convertValue(stringValue, column);
+        }
+        return new DefaultRow(getHeader(), values);
+    }
+
+    private Object convertValue(String stringValue, Column column) {
+        final ColumnType type = column.getType();
+        if (type.isNumber()) {
+            if (stringValue.isEmpty() || "?".equals(stringValue)) {
+                return null;
+            } else {
+                final Number n = NumberComparator.toNumber(stringValue);
+                if (type == ColumnType.INTEGER) {
+                    return n.intValue();
                 } else {
-                    final Number n = NumberComparator.toNumber(stringValue);
-                    if (type == ColumnType.INTEGER) {
-                        values[i] = n.intValue();
-                    } else {
-                        values[i] = n;
-                    }
-                }
-            } else if (type.isTimeBased()) {
-                // TODO: extract format from column remarks
-                try {
-                    values[i] = new SimpleDateFormat("yyyy-MM-dd").parse(stringValue);
-                } catch (ParseException e) {
-                    throw new IllegalStateException(e);
+                    return n;
                 }
+            }
+        } else if (type.isTimeBased()) {
+            final String columnRemarks = column.getRemarks();
+            final SimpleDateFormat dateFormat;
+            if (columnRemarks.toLowerCase().startsWith("date ")) {
+                // date format follows "date "
+                dateFormat = new SimpleDateFormat(columnRemarks.substring(5));
             } else {
-                values[i] = stringValue;
+                // assume standard date format
+                dateFormat = new SimpleDateFormat("yyyy-MM-dd");
+            }
+            try {
+                return dateFormat.parse(stringValue);
+            } catch (ParseException e) {
+                throw new IllegalStateException(resource.getName() + ": Failed to parse '" + stringValue
+                        + "' using format '" + dateFormat.toPattern() + "'", e);
             }
+        } else {
+            return stringValue;
         }
-
-        return new DefaultRow(getHeader(), values);
     }
 
     @Override