You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@any23.apache.org by le...@apache.org on 2018/02/09 16:43:43 UTC
any23 git commit: ANY23-264 Upgrade to use public commons-csv instead of custom SNAPSHOT
Repository: any23
Updated Branches:
refs/heads/master 5a3ba9d1f -> 7a7db2006
ANY23-264 Upgrade to use public commons-csv instead of custom SNAPSHOT
Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/7a7db200
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/7a7db200
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/7a7db200
Branch: refs/heads/master
Commit: 7a7db20069da63bb1312dd12820c2f2ca301856f
Parents: 5a3ba9d
Author: Hans <fi...@gmail.com>
Authored: Thu Feb 8 23:27:30 2018 -0600
Committer: Hans <fi...@gmail.com>
Committed: Thu Feb 8 23:27:30 2018 -0600
----------------------------------------------------------------------
.../any23/extractor/csv/CSVExtractor.java | 25 ++++++---
.../any23/extractor/csv/CSVReaderBuilder.java | 59 +++++++++-----------
pom.xml | 2 +-
3 files changed, 43 insertions(+), 43 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/any23/blob/7a7db200/core/src/main/java/org/apache/any23/extractor/csv/CSVExtractor.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/csv/CSVExtractor.java b/core/src/main/java/org/apache/any23/extractor/csv/CSVExtractor.java
index e72162b..298d930 100644
--- a/core/src/main/java/org/apache/any23/extractor/csv/CSVExtractor.java
+++ b/core/src/main/java/org/apache/any23/extractor/csv/CSVExtractor.java
@@ -28,6 +28,7 @@ import org.apache.any23.extractor.ExtractorDescription;
import org.apache.any23.rdf.RDFUtils;
import org.apache.any23.vocab.CSV;
import org.apache.commons.csv.CSVParser;
+import org.apache.commons.csv.CSVRecord;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Value;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
@@ -38,6 +39,7 @@ import org.eclipse.rdf4j.model.vocabulary.XMLSchema;
import java.io.IOException;
import java.io.InputStream;
import java.util.StringTokenizer;
+import java.util.Iterator;
/**
* This extractor produces <i>RDF</i> from a <i>CSV file</i> .
@@ -77,17 +79,18 @@ public class CSVExtractor implements Extractor.ContentExtractor {
// build the parser
csvParser = CSVReaderBuilder.build(in);
+ Iterator<CSVRecord> rows = csvParser.iterator();
// get the header and generate the IRIs for column names
- String[] header = csvParser.getLine();
+ CSVRecord header = rows.hasNext() ? rows.next() : null;
headerIRIs = processHeader(header, documentIRI);
// write triples to describe properties
writeHeaderPropertiesMetadata(header, out);
- String[] nextLine;
int index = 0;
- while ((nextLine = csvParser.getLine()) != null) {
+ while (rows.hasNext()) {
+ CSVRecord nextLine = rows.next();
IRI rowSubject = RDFUtils.iri(
documentIRI.toString(),
"row/" + index
@@ -151,17 +154,18 @@ public class CSVExtractor implements Extractor.ContentExtractor {
* @param header
* @param out
*/
- private void writeHeaderPropertiesMetadata(String[] header, ExtractionResult out) {
+ private void writeHeaderPropertiesMetadata(CSVRecord header, ExtractionResult out) {
int index = 0;
for (IRI singleHeader : headerIRIs) {
if (index > headerIRIs.length) {
break;
}
- if (!RDFUtils.isAbsoluteIRI(header[index])) {
+ String headerString = header.get(index);
+ if (!RDFUtils.isAbsoluteIRI(headerString)) {
out.writeTriple(
singleHeader,
RDFS.LABEL,
- SimpleValueFactory.getInstance().createLiteral(header[index])
+ SimpleValueFactory.getInstance().createLiteral(headerString)
);
}
out.writeTriple(
@@ -181,8 +185,11 @@ public class CSVExtractor implements Extractor.ContentExtractor {
* @param header
* @return an array of {@link IRI}s identifying the column names.
*/
- private IRI[] processHeader(String[] header, IRI documentIRI) {
- IRI[] result = new IRI[header.length];
+ private IRI[] processHeader(CSVRecord header, IRI documentIRI) {
+ if (header == null)
+ return new IRI[0];
+
+ IRI[] result = new IRI[header.size()];
int index = 0;
for (String h : header) {
String candidate = h.trim();
@@ -222,7 +229,7 @@ public class CSVExtractor implements Extractor.ContentExtractor {
*/
private void produceRowStatements(
IRI rowSubject,
- String[] values,
+ CSVRecord values,
ExtractionResult out
) {
int index = 0;
http://git-wip-us.apache.org/repos/asf/any23/blob/7a7db200/csvutils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java
----------------------------------------------------------------------
diff --git a/csvutils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java b/csvutils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java
index 75bb583..87d764d 100644
--- a/csvutils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java
+++ b/csvutils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java
@@ -19,11 +19,13 @@ package org.apache.any23.extractor.csv;
import org.apache.any23.configuration.DefaultConfiguration;
import org.apache.commons.csv.CSVParser;
-import org.apache.commons.csv.CSVStrategy;
+import org.apache.commons.csv.CSVFormat;
+import org.apache.commons.csv.CSVRecord;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
+import java.util.Iterator;
/**
* This class is responsible to build a reader first guessing the configuration
@@ -38,21 +40,19 @@ public class CSVReaderBuilder {
private static final String DEFAULT_COMMENT_DELIMITER = "#";
- public static final char NULL_CHAR = ' ';
-
private static final char[] popularDelimiters = {'\t', '|', ',', ';'};
private static DefaultConfiguration defaultConfiguration =
DefaultConfiguration.singleton();
- private static final CSVStrategy[] strategies;
+ private static final CSVFormat[] strategies;
static {
- strategies = new CSVStrategy[ popularDelimiters.length + 1 ];
- strategies[0] = CSVStrategy.DEFAULT_STRATEGY;
+ strategies = new CSVFormat[popularDelimiters.length + 1];
+ strategies[0] = CSVFormat.DEFAULT;
int index = 1;
- for(char dlmt : popularDelimiters) {
- strategies[index++] = getCsvStrategy(dlmt, NULL_CHAR);
+ for (char dlmt : popularDelimiters) {
+ strategies[index++] = CSVFormat.DEFAULT.withDelimiter(dlmt);
}
}
@@ -65,9 +65,10 @@ public class CSVReaderBuilder {
* @throws java.io.IOException
*/
public static CSVParser build(InputStream is) throws IOException {
- CSVStrategy bestStrategy = getBestStrategy(is);
- if(bestStrategy == null) bestStrategy = getCSVStrategyFromConfiguration();
- return new CSVParser( new InputStreamReader(is), bestStrategy );
+ CSVFormat bestStrategy = getBestStrategy(is);
+ if (bestStrategy == null)
+ bestStrategy = getCSVStrategyFromConfiguration();
+ return new CSVParser(new InputStreamReader(is), bestStrategy);
}
/**
@@ -82,20 +83,16 @@ public class CSVReaderBuilder {
return getBestStrategy(is) != null;
}
- private static CSVStrategy getBestStrategy(InputStream is) throws IOException {
- for( CSVStrategy strategy : strategies ) {
- if( testStrategy(is, strategy) ) {
+ private static CSVFormat getBestStrategy(InputStream is) throws IOException {
+ for (CSVFormat strategy : strategies) {
+ if (testStrategy(is, strategy)) {
return strategy;
}
}
return null;
}
- private static CSVStrategy getCsvStrategy(char delimiter, char comment) {
- return new CSVStrategy(delimiter, '\'', comment);
- }
-
- private static CSVStrategy getCSVStrategyFromConfiguration() {
+ private static CSVFormat getCSVStrategyFromConfiguration() {
char fieldDelimiter = getCharValueFromConfiguration(
"any23.extraction.csv.field",
DEFAULT_FIELD_DELIMITER
@@ -104,7 +101,7 @@ public class CSVReaderBuilder {
"any23.extraction.csv.comment",
DEFAULT_COMMENT_DELIMITER
);
- return new CSVStrategy(fieldDelimiter, '\'', commentDelimiter);
+ return CSVFormat.DEFAULT.withDelimiter(fieldDelimiter).withCommentMarker(commentDelimiter);
}
private static char getCharValueFromConfiguration(String property, String defaultValue) {
@@ -112,7 +109,7 @@ public class CSVReaderBuilder {
property,
defaultValue
);
- if (delimiter.length() != 1 || delimiter.equals("")) {
+ if (delimiter.length() != 1) {
throw new RuntimeException(property + " value must be a single character");
}
return delimiter.charAt(0);
@@ -128,29 +125,25 @@ public class CSVReaderBuilder {
* @throws IOException
* @param is
*/
- private static boolean testStrategy(InputStream is, CSVStrategy strategy) throws IOException {
+ private static boolean testStrategy(InputStream is, CSVFormat strategy) throws IOException {
final int MIN_COLUMNS = 2;
is.mark(Integer.MAX_VALUE);
try {
- final CSVParser parser = new CSVParser(new InputStreamReader(is), strategy);
+ final Iterator<CSVRecord> rows = new CSVParser(new InputStreamReader(is), strategy).iterator();
int linesToCheck = 5;
int headerColumnCount = -1;
- while (linesToCheck > 0) {
- String[] row;
- row = parser.getLine();
- if (row == null) {
- break;
- }
- if (row.length < MIN_COLUMNS) {
+ while (linesToCheck > 0 && rows.hasNext()) {
+ int rowLength = rows.next().size();
+ if (rowLength < MIN_COLUMNS) {
return false;
}
if (headerColumnCount == -1) { // first row
- headerColumnCount = row.length;
+ headerColumnCount = rowLength;
} else { // make sure rows have the same number of columns or one more than the header
- if (row.length < headerColumnCount) {
+ if (rowLength < headerColumnCount) {
return false;
- } else if (row.length - 1 > headerColumnCount) {
+ } else if (rowLength - 1 > headerColumnCount) {
return false;
}
}
http://git-wip-us.apache.org/repos/asf/any23/blob/7a7db200/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 4455cd1..14f5ee2 100644
--- a/pom.xml
+++ b/pom.xml
@@ -516,7 +516,7 @@
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-csv</artifactId>
- <version>1.0-SNAPSHOT-rev1148315</version>
+ <version>1.5</version>
</dependency>
<dependency>
<groupId>commons-io</groupId>