You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@any23.apache.org by le...@apache.org on 2018/02/09 16:43:43 UTC

any23 git commit: ANY23-264 Upgrade to use public commons-csv instead of custom SNAPSHOT

Repository: any23
Updated Branches:
  refs/heads/master 5a3ba9d1f -> 7a7db2006


ANY23-264 Upgrade to use public commons-csv instead of custom SNAPSHOT


Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/7a7db200
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/7a7db200
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/7a7db200

Branch: refs/heads/master
Commit: 7a7db20069da63bb1312dd12820c2f2ca301856f
Parents: 5a3ba9d
Author: Hans <fi...@gmail.com>
Authored: Thu Feb 8 23:27:30 2018 -0600
Committer: Hans <fi...@gmail.com>
Committed: Thu Feb 8 23:27:30 2018 -0600

----------------------------------------------------------------------
 .../any23/extractor/csv/CSVExtractor.java       | 25 ++++++---
 .../any23/extractor/csv/CSVReaderBuilder.java   | 59 +++++++++-----------
 pom.xml                                         |  2 +-
 3 files changed, 43 insertions(+), 43 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/any23/blob/7a7db200/core/src/main/java/org/apache/any23/extractor/csv/CSVExtractor.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/csv/CSVExtractor.java b/core/src/main/java/org/apache/any23/extractor/csv/CSVExtractor.java
index e72162b..298d930 100644
--- a/core/src/main/java/org/apache/any23/extractor/csv/CSVExtractor.java
+++ b/core/src/main/java/org/apache/any23/extractor/csv/CSVExtractor.java
@@ -28,6 +28,7 @@ import org.apache.any23.extractor.ExtractorDescription;
 import org.apache.any23.rdf.RDFUtils;
 import org.apache.any23.vocab.CSV;
 import org.apache.commons.csv.CSVParser;
+import org.apache.commons.csv.CSVRecord;
 import org.eclipse.rdf4j.model.IRI;
 import org.eclipse.rdf4j.model.Value;
 import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
@@ -38,6 +39,7 @@ import org.eclipse.rdf4j.model.vocabulary.XMLSchema;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.StringTokenizer;
+import java.util.Iterator;
 
 /**
  * This extractor produces <i>RDF</i> from a <i>CSV file</i> .
@@ -77,17 +79,18 @@ public class CSVExtractor implements Extractor.ContentExtractor {
 
         // build the parser
         csvParser = CSVReaderBuilder.build(in);
+        Iterator<CSVRecord> rows = csvParser.iterator();
 
         // get the header and generate the IRIs for column names
-        String[] header = csvParser.getLine();
+        CSVRecord header = rows.hasNext() ? rows.next() : null;
         headerIRIs = processHeader(header, documentIRI);
 
         // write triples to describe properties
         writeHeaderPropertiesMetadata(header, out);
 
-        String[] nextLine;
         int index = 0;
-        while ((nextLine = csvParser.getLine()) != null) {
+        while (rows.hasNext()) {
+            CSVRecord nextLine = rows.next();
             IRI rowSubject = RDFUtils.iri(
                     documentIRI.toString(),
                     "row/" + index
@@ -151,17 +154,18 @@ public class CSVExtractor implements Extractor.ContentExtractor {
      * @param header
      * @param out
      */
-    private void writeHeaderPropertiesMetadata(String[] header, ExtractionResult out) {
+    private void writeHeaderPropertiesMetadata(CSVRecord header, ExtractionResult out) {
         int index = 0;
         for (IRI singleHeader : headerIRIs) {
             if (index > headerIRIs.length) {
                 break;
             }
-            if (!RDFUtils.isAbsoluteIRI(header[index])) {
+            String headerString = header.get(index);
+            if (!RDFUtils.isAbsoluteIRI(headerString)) {
                 out.writeTriple(
                         singleHeader,
                         RDFS.LABEL,
-                        SimpleValueFactory.getInstance().createLiteral(header[index])
+                        SimpleValueFactory.getInstance().createLiteral(headerString)
                 );
             }
             out.writeTriple(
@@ -181,8 +185,11 @@ public class CSVExtractor implements Extractor.ContentExtractor {
      * @param header
      * @return an array of {@link IRI}s identifying the column names.
      */
-    private IRI[] processHeader(String[] header, IRI documentIRI) {
-        IRI[] result = new IRI[header.length];
+    private IRI[] processHeader(CSVRecord header, IRI documentIRI) {
+        if (header == null)
+            return new IRI[0];
+
+        IRI[] result = new IRI[header.size()];
         int index = 0;
         for (String h : header) {
             String candidate = h.trim();
@@ -222,7 +229,7 @@ public class CSVExtractor implements Extractor.ContentExtractor {
      */
     private void produceRowStatements(
             IRI rowSubject,
-            String[] values,
+            CSVRecord values,
             ExtractionResult out
     ) {
         int index = 0;

http://git-wip-us.apache.org/repos/asf/any23/blob/7a7db200/csvutils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java
----------------------------------------------------------------------
diff --git a/csvutils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java b/csvutils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java
index 75bb583..87d764d 100644
--- a/csvutils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java
+++ b/csvutils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java
@@ -19,11 +19,13 @@ package org.apache.any23.extractor.csv;
 
 import org.apache.any23.configuration.DefaultConfiguration;
 import org.apache.commons.csv.CSVParser;
-import org.apache.commons.csv.CSVStrategy;
+import org.apache.commons.csv.CSVFormat;
+import org.apache.commons.csv.CSVRecord;
 
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
+import java.util.Iterator;
 
 /**
  * This class is responsible to build a reader first guessing the configuration
@@ -38,21 +40,19 @@ public class CSVReaderBuilder {
 
     private static final String DEFAULT_COMMENT_DELIMITER = "#";
 
-    public static final char NULL_CHAR = ' ';
-
     private static final char[] popularDelimiters = {'\t', '|', ',', ';'};
 
     private static DefaultConfiguration defaultConfiguration =
             DefaultConfiguration.singleton();
 
-    private static final CSVStrategy[] strategies;
+    private static final CSVFormat[] strategies;
 
     static {
-        strategies = new CSVStrategy[ popularDelimiters.length + 1 ];
-        strategies[0] = CSVStrategy.DEFAULT_STRATEGY;
+        strategies = new CSVFormat[popularDelimiters.length + 1];
+        strategies[0] = CSVFormat.DEFAULT;
         int index = 1;
-        for(char dlmt : popularDelimiters) {
-            strategies[index++] = getCsvStrategy(dlmt, NULL_CHAR);
+        for (char dlmt : popularDelimiters) {
+            strategies[index++] = CSVFormat.DEFAULT.withDelimiter(dlmt);
         }
     }
 
@@ -65,9 +65,10 @@ public class CSVReaderBuilder {
      * @throws java.io.IOException
      */
     public static CSVParser build(InputStream is) throws IOException {
-        CSVStrategy bestStrategy = getBestStrategy(is);
-        if(bestStrategy == null) bestStrategy = getCSVStrategyFromConfiguration();
-        return new CSVParser( new InputStreamReader(is), bestStrategy );
+        CSVFormat bestStrategy = getBestStrategy(is);
+        if (bestStrategy == null)
+            bestStrategy = getCSVStrategyFromConfiguration();
+        return new CSVParser(new InputStreamReader(is), bestStrategy);
     }
 
     /**
@@ -82,20 +83,16 @@ public class CSVReaderBuilder {
         return getBestStrategy(is) != null;
     }
 
-    private static CSVStrategy getBestStrategy(InputStream is) throws IOException {
-        for( CSVStrategy strategy : strategies ) {
-            if( testStrategy(is, strategy) ) {
+    private static CSVFormat getBestStrategy(InputStream is) throws IOException {
+        for (CSVFormat strategy : strategies) {
+            if (testStrategy(is, strategy)) {
                 return strategy;
             }
         }
         return null;
     }
 
-    private static CSVStrategy getCsvStrategy(char delimiter, char comment) {
-        return new CSVStrategy(delimiter, '\'', comment);
-    }
-
-    private static CSVStrategy getCSVStrategyFromConfiguration() {
+    private static CSVFormat getCSVStrategyFromConfiguration() {
         char fieldDelimiter = getCharValueFromConfiguration(
                 "any23.extraction.csv.field",
                 DEFAULT_FIELD_DELIMITER
@@ -104,7 +101,7 @@ public class CSVReaderBuilder {
                 "any23.extraction.csv.comment",
                 DEFAULT_COMMENT_DELIMITER
         );
-        return new CSVStrategy(fieldDelimiter, '\'', commentDelimiter);
+        return CSVFormat.DEFAULT.withDelimiter(fieldDelimiter).withCommentMarker(commentDelimiter);
     }
 
     private static char getCharValueFromConfiguration(String property, String defaultValue) {
@@ -112,7 +109,7 @@ public class CSVReaderBuilder {
                 property,
                 defaultValue
         );
-        if (delimiter.length() != 1 || delimiter.equals("")) {
+        if (delimiter.length() != 1) {
             throw new RuntimeException(property + " value must be a single character");
         }
         return delimiter.charAt(0);
@@ -128,29 +125,25 @@ public class CSVReaderBuilder {
      * @throws IOException
      * @param is
      */
-    private static boolean testStrategy(InputStream is, CSVStrategy strategy) throws IOException {
+    private static boolean testStrategy(InputStream is, CSVFormat strategy) throws IOException {
         final int MIN_COLUMNS = 2;
 
         is.mark(Integer.MAX_VALUE);
         try {
-            final CSVParser parser = new CSVParser(new InputStreamReader(is), strategy);
+            final Iterator<CSVRecord> rows = new CSVParser(new InputStreamReader(is), strategy).iterator();
             int linesToCheck = 5;
             int headerColumnCount = -1;
-            while (linesToCheck > 0) {
-                String[] row;
-                row = parser.getLine();
-                if (row == null) {
-                    break;
-                }
-                if (row.length < MIN_COLUMNS) {
+            while (linesToCheck > 0 && rows.hasNext()) {
+                int rowLength = rows.next().size();
+                if (rowLength < MIN_COLUMNS) {
                     return false;
                 }
                 if (headerColumnCount == -1) { // first row
-                    headerColumnCount = row.length;
+                    headerColumnCount = rowLength;
                 } else { // make sure rows have the same number of columns or one more than the header
-                    if (row.length < headerColumnCount) {
+                    if (rowLength < headerColumnCount) {
                         return false;
-                    } else if (row.length - 1 > headerColumnCount) {
+                    } else if (rowLength - 1 > headerColumnCount) {
                         return false;
                     }
                 }

http://git-wip-us.apache.org/repos/asf/any23/blob/7a7db200/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 4455cd1..14f5ee2 100644
--- a/pom.xml
+++ b/pom.xml
@@ -516,7 +516,7 @@
       <dependency>
         <groupId>org.apache.commons</groupId>
         <artifactId>commons-csv</artifactId>
-        <version>1.0-SNAPSHOT-rev1148315</version>
+        <version>1.5</version>
       </dependency>
       <dependency>
         <groupId>commons-io</groupId>