You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@any23.apache.org by an...@apache.org on 2017/07/13 05:51:21 UTC

[1/6] any23 git commit: Fix ANY23-308

Repository: any23
Updated Branches:
  refs/heads/master 5bc7e46a8 -> b0baa9407


Fix ANY23-308

- validate yaml file
- rename csvutils -> utils
- bring all utility class into util module
- update README

Signed-off-by: Jacek Grzebyta <gr...@gmail.com>


Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/ae036a7a
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/ae036a7a
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/ae036a7a

Branch: refs/heads/master
Commit: ae036a7af2a8c5a5572b6e17832f69bd8f4b4ba4
Parents: bd69aef
Author: Jacek Grzebyta <gr...@gmail.com>
Authored: Tue Jul 11 11:57:16 2017 +0100
Committer: Jacek Grzebyta <gr...@gmail.com>
Committed: Tue Jul 11 11:57:16 2017 +0100

----------------------------------------------------------------------
 README.md                                       |   2 +-
 cli/pom.xml                                     |   2 +-
 .../org/apache/any23/cli/YAMLRoverTest.java     |  76 +++++++++
 core/pom.xml                                    |   7 +-
 .../any23/extractor/yaml/YAMLExtractor.java     |   7 +-
 .../any23/extractor/yaml/YAMLExtractorTest.java |  14 +-
 .../extractor/yaml/YAMLTikaParserTest.java      |  48 ++++++
 csvutils/pom.xml                                | 106 ------------
 .../any23/extractor/csv/CSVReaderBuilder.java   | 166 -------------------
 csvutils/src/test/resources/log4j.properties    |  34 ----
 mime/pom.xml                                    |   2 +-
 .../apache/any23/mime/TikaMIMETypeDetector.java |  17 +-
 pom.xml                                         |   7 +-
 utils/pom.xml                                   | 123 ++++++++++++++
 .../any23/extractor/csv/CSVReaderBuilder.java   | 166 +++++++++++++++++++
 .../any23/extractor/yaml/YAMLValidator.java     | 105 ++++++++++++
 .../any23/yaml/utils/YAMLValidatorTest.java     |  66 ++++++++
 utils/src/test/resources/log4j.properties       |  35 ++++
 18 files changed, 659 insertions(+), 324 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/README.md
----------------------------------------------------------------------
diff --git a/README.md b/README.md
index 9db7126..6c52061 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@ Any23 documentation can be found on the [website](http://any23.apache.org)
 
  * [api](https://github.com/lewismc/any23/tree/master/api): Any23 library external API.
  * [core](https://github.com/lewismc/any23/tree/master/core): The library core codebase.
- * [csvutils](https://github.com/lewismc/any23/tree/master/csvutils): A CSV specific package
+ * [utils](https://github.com/lewismc/any23/tree/master/utils): A utilities package
  * [encoding](https://github.com/lewismc/any23/tree/master/encoding): Encoding detection library.
  * [mime](https://github.com/lewismc/any23/tree/master/mime): MIME Type detection library.
  * [nquads](https://github.com/lewismc/any23/tree/master/nquads): NQuads parsing and serialization library.

http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/cli/pom.xml
----------------------------------------------------------------------
diff --git a/cli/pom.xml b/cli/pom.xml
index 5acedfb..47b9c06 100644
--- a/cli/pom.xml
+++ b/cli/pom.xml
@@ -50,7 +50,7 @@
     </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
-      <artifactId>apache-any23-csvutils</artifactId>
+      <artifactId>apache-any23-utils</artifactId>
       <version>${project.version}</version>
     </dependency>
     <dependency>

http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/cli/src/test/java/org/apache/any23/cli/YAMLRoverTest.java
----------------------------------------------------------------------
diff --git a/cli/src/test/java/org/apache/any23/cli/YAMLRoverTest.java b/cli/src/test/java/org/apache/any23/cli/YAMLRoverTest.java
new file mode 100644
index 0000000..17e8916
--- /dev/null
+++ b/cli/src/test/java/org/apache/any23/cli/YAMLRoverTest.java
@@ -0,0 +1,76 @@
+/*
+ * Copyright 2017 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.any23.cli;
+
+import com.google.common.io.Files;
+import java.io.File;
+import java.io.IOException;
+import org.apache.pdfbox.util.Charsets;
+import org.junit.Assert;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Unit test for issue ANY23-308: runs the {@link Rover} tool on a YAML test
+ * resource and checks the exit code and the content of the Turtle output file.
+ * @author Jacek Grzebyta (grzebyta.dev [at] gmail.com)
+ */
+public class YAMLRoverTest extends ToolTestBase {
+
+    private static final String file1 = "/org/apache/any23/extractor/yaml/simple-load.yml";
+
+    private static final String baseUri = "urn:test";
+
+    private final Logger log = LoggerFactory.getLogger(getClass());
+
+    public YAMLRoverTest() {
+        super(Rover.class);
+    }
+
+    @Test
+    public void simpleTest()
+            throws Exception {
+        File outputFile = File.createTempFile("rover-test", ".ttl", tempDirectory);
+        File logfile = File.createTempFile("test-log", ".txt", tempDirectory);
+
+        int exitCode = runTool(String.format("-l %s -o %s -f turtle -e yaml,csv -d %s %s",
+                logfile.getAbsolutePath(),
+                outputFile.getAbsolutePath(),
+                baseUri,
+                copyResourceToTempFile(file1).getAbsolutePath()));
+
+        Assert.assertTrue(logfile.exists());
+        log.debug("Log file location: {}", logfile.getAbsolutePath());
+        log.info("Log file content: \n{}\n", Files.toString(logfile, Charsets.UTF_8));
+
+        Assert.assertEquals("Unexpected exit code.", 0, exitCode);
+        assertFileContainsString(outputFile, baseUri);
+    }
+
+    /**
+     * Asserts that the content of a file, read as UTF-8, contains the expected string.
+     * @param f file whose content is read and searched
+     * @param s Expected string in the file
+     * @throws IOException declared by this method; presumably raised when the file cannot be read
+     */
+    public void assertFileContainsString(File f, String s) throws IOException {
+        String fileContent = Files.toString(f, Charsets.UTF_8);
+        log.trace("File content: \n{}\n", fileContent);
+        Assert.assertTrue(fileContent.contains(s));
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/core/pom.xml
----------------------------------------------------------------------
diff --git a/core/pom.xml b/core/pom.xml
index f03c672..c410799 100644
--- a/core/pom.xml
+++ b/core/pom.xml
@@ -38,7 +38,7 @@
     </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
-      <artifactId>apache-any23-csvutils</artifactId>
+      <artifactId>apache-any23-utils</artifactId>
       <version>${project.version}</version>
     </dependency>
     <dependency>
@@ -78,11 +78,6 @@
       <groupId>com.beust</groupId>
       <artifactId>jcommander</artifactId>
     </dependency>
-    <dependency>
-      <groupId>org.yaml</groupId>
-      <artifactId>snakeyaml</artifactId>
-      <version>1.17</version>
-    </dependency>
 
     <!-- BEGIN: Tika -->
     <dependency>

http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/core/src/main/java/org/apache/any23/extractor/yaml/YAMLExtractor.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/yaml/YAMLExtractor.java b/core/src/main/java/org/apache/any23/extractor/yaml/YAMLExtractor.java
index 64548f1..5c73082 100644
--- a/core/src/main/java/org/apache/any23/extractor/yaml/YAMLExtractor.java
+++ b/core/src/main/java/org/apache/any23/extractor/yaml/YAMLExtractor.java
@@ -17,8 +17,6 @@ package org.apache.any23.extractor.yaml;
 
 import java.io.IOException;
 import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.Arrays;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
@@ -31,7 +29,6 @@ import org.apache.any23.extractor.ExtractorDescription;
 import org.apache.any23.rdf.RDFUtils;
 import org.apache.any23.util.StringUtils;
 import org.apache.any23.vocab.YAML;
-import org.apache.commons.lang.WordUtils;
 import org.eclipse.rdf4j.model.Resource;
 import org.eclipse.rdf4j.model.IRI;
 import org.eclipse.rdf4j.model.Value;
@@ -54,7 +51,7 @@ public class YAMLExtractor implements Extractor.ContentExtractor {
 
     private int nodeId = 0;
 
-    private IRI documentRoot;
+    private Resource documentRoot;
 
     @Override
     public void setStopAtFirstError(boolean f) {
@@ -65,7 +62,7 @@ public class YAMLExtractor implements Extractor.ContentExtractor {
             ExtractionResult out)
             throws IOException, ExtractionException {
         IRI documentURI = context.getDocumentIRI();
-        documentRoot = RDFUtils.uri(documentURI.toString() + "root");
+        documentRoot = makeUri("root", documentURI, false);
 
         log.debug("process: {}", documentURI.toString());
         out.writeNamespace(vocab.PREFIX, vocab.NS);

http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/core/src/test/java/org/apache/any23/extractor/yaml/YAMLExtractorTest.java
----------------------------------------------------------------------
diff --git a/core/src/test/java/org/apache/any23/extractor/yaml/YAMLExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/yaml/YAMLExtractorTest.java
index 0cf8d14..b265c5f 100644
--- a/core/src/test/java/org/apache/any23/extractor/yaml/YAMLExtractorTest.java
+++ b/core/src/test/java/org/apache/any23/extractor/yaml/YAMLExtractorTest.java
@@ -27,7 +27,6 @@ import org.eclipse.rdf4j.model.Statement;
 import org.eclipse.rdf4j.model.vocabulary.RDF;
 import org.eclipse.rdf4j.model.vocabulary.RDFS;
 import org.eclipse.rdf4j.repository.RepositoryResult;
-import org.semarglproject.vocab.XSD;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -93,4 +92,17 @@ public class YAMLExtractorTest extends AbstractExtractorTestCase {
         RepositoryResult<Statement> docs = getStatements(null, null, RDF.NIL);
         Assert.assertTrue(Iterations.asList(docs).size() == 2);
     }
+
+    /**
+     * Comma separated values are parsed as well: the CSV test resource is run through
+     * {@code assertExtract} and {@code assertModelNotEmpty()} is then applied to the result.
+     * @throws Exception propagated from the extraction, dump or assertion helpers
+     */
+    @Test
+    public void csvTest()
+            throws Exception {
+        assertExtract("/org/apache/any23/extractor/csv/test-comma.csv");
+        log.debug(dumpModelToTurtle());
+        assertModelNotEmpty();
+    }
 }

http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/core/src/test/java/org/apache/any23/extractor/yaml/YAMLTikaParserTest.java
----------------------------------------------------------------------
diff --git a/core/src/test/java/org/apache/any23/extractor/yaml/YAMLTikaParserTest.java b/core/src/test/java/org/apache/any23/extractor/yaml/YAMLTikaParserTest.java
new file mode 100644
index 0000000..4727c84
--- /dev/null
+++ b/core/src/test/java/org/apache/any23/extractor/yaml/YAMLTikaParserTest.java
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2017 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.any23.extractor.yaml;
+
+import java.io.InputStream;
+import org.apache.any23.mime.MIMEType;
+import org.apache.any23.mime.TikaMIMETypeDetector;
+import org.apache.any23.mime.purifier.WhiteSpacesPurifier;
+import org.junit.Assert;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/** Checks that {@link TikaMIMETypeDetector} reports the YAML test resource as {@code text/x-yaml}.
+ * @author jacek
+ */
+public class YAMLTikaParserTest {
+
+    private static final String file1 = "/org/apache/any23/extractor/yaml/simple-load.yml";
+
+    private final Logger log = LoggerFactory.getLogger(getClass());
+
+    @Test
+    public void tikaDetect()
+            throws Exception {
+        InputStream is = YAMLTikaParserTest.class.getResourceAsStream(file1);
+        TikaMIMETypeDetector detector = new TikaMIMETypeDetector(new WhiteSpacesPurifier());
+        MIMEType type = detector.guessMIMEType(null, is, null);
+
+        log.info("Type: {}", type.toString());
+
+        Assert.assertEquals("text/x-yaml", type.toString());
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/csvutils/pom.xml
----------------------------------------------------------------------
diff --git a/csvutils/pom.xml b/csvutils/pom.xml
deleted file mode 100644
index 8f5b18d..0000000
--- a/csvutils/pom.xml
+++ /dev/null
@@ -1,106 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
-  Licensed to the Apache Software Foundation (ASF) under one or more
-  contributor license agreements.  See the NOTICE file distributed with
-  this work for additional information regarding copyright ownership.
-  The ASF licenses this file to You under the Apache License, Version 2.0
-  (the "License"); you may not use this file except in compliance with
-  the License.  You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
--->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-
-  <parent>
-    <artifactId>apache-any23</artifactId>
-    <groupId>org.apache.any23</groupId>
-    <version>2.1-SNAPSHOT</version>
-    <relativePath>..</relativePath>
-  </parent>
-
-  <artifactId>apache-any23-csvutils</artifactId>
-
-  <name>Apache Any23 :: CSV Utilities</name>
-  <description>CSV specific library.</description>
-
-  <dependencies>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>apache-any23-api</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.commons</groupId>
-      <artifactId>commons-csv</artifactId>
-    </dependency>
-    <!-- Logging -->
-    <dependency>
-      <groupId>org.slf4j</groupId>
-      <artifactId>slf4j-log4j12</artifactId>
-      <version>${slf4j.logger.version}</version>
-      <scope>test</scope>
-    </dependency>
-  </dependencies>
-
-  <build>
-    <resources>
-      <resource>
-        <directory>${basedir}/../</directory>
-        <targetPath>META-INF</targetPath>
-        <includes>
-          <include>LICENSE.txt</include>
-          <include>NOTICE.txt</include>
-        </includes>
-      </resource>
-    </resources>
-    <pluginManagement>
-      <plugins>
-        <plugin>
-          <groupId>org.apache.maven.plugins</groupId>
-          <artifactId>maven-assembly-plugin</artifactId>
-          <version>${maven-assembly-plugin.version}</version>
-          <executions>
-            <execution>
-              <id>assembly</id>
-              <phase>package</phase>
-              <goals>
-                <goal>single</goal>
-              </goals>
-            </execution>
-          </executions>
-          <configuration>
-            <attach>true</attach>
-            <skipAssembly>true</skipAssembly>
-            <tarLongFileMode>gnu</tarLongFileMode>
-          </configuration>
-        </plugin>
-      </plugins>
-    </pluginManagement>
-  </build>
-
-  <profiles>
-    <profile>
-      <id>release</id>
-      <build>
-        <resources>
-          <resource>
-            <directory>${basedir}/../</directory>
-            <targetPath>${project.build.directory}/apidocs/META-INF</targetPath>
-            <includes>
-              <include>LICENSE.txt</include>
-              <include>NOTICE.txt</include>
-            </includes>
-          </resource>
-        </resources>
-      </build>
-    </profile>
-  </profiles>
-
-</project>

http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/csvutils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java
----------------------------------------------------------------------
diff --git a/csvutils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java b/csvutils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java
deleted file mode 100644
index 75bb583..0000000
--- a/csvutils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java
+++ /dev/null
@@ -1,166 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.any23.extractor.csv;
-
-import org.apache.any23.configuration.DefaultConfiguration;
-import org.apache.commons.csv.CSVParser;
-import org.apache.commons.csv.CSVStrategy;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-
-/**
- * This class is responsible to build a reader first guessing the configuration
- * from the file it self and then, if not successful, from the {@link org.apache.any23.configuration.DefaultConfiguration}.
- *
- * @author Davide Palmisano ( dpalmisano@gmail.com )
- * @author Michele Mostarda ( michele.mostarda@gmail.com )
- */
-public class CSVReaderBuilder {
-
-    private static final String DEFAULT_FIELD_DELIMITER = ",";
-
-    private static final String DEFAULT_COMMENT_DELIMITER = "#";
-
-    public static final char NULL_CHAR = ' ';
-
-    private static final char[] popularDelimiters = {'\t', '|', ',', ';'};
-
-    private static DefaultConfiguration defaultConfiguration =
-            DefaultConfiguration.singleton();
-
-    private static final CSVStrategy[] strategies;
-
-    static {
-        strategies = new CSVStrategy[ popularDelimiters.length + 1 ];
-        strategies[0] = CSVStrategy.DEFAULT_STRATEGY;
-        int index = 1;
-        for(char dlmt : popularDelimiters) {
-            strategies[index++] = getCsvStrategy(dlmt, NULL_CHAR);
-        }
-    }
-
-    /**
-     * Builds a not <code>null</code> {@link org.apache.commons.csv.CSVParser} guessing
-     * from the provided <i>CSV</i> file.
-     *
-     * @param is {@link InputStream} of the <i>CSV</i> file where guess the configuration.
-     * @return a {@link CSVParser}
-     * @throws java.io.IOException
-     */
-    public static CSVParser build(InputStream is) throws IOException {
-        CSVStrategy bestStrategy = getBestStrategy(is);
-        if(bestStrategy == null) bestStrategy = getCSVStrategyFromConfiguration();
-        return new CSVParser( new InputStreamReader(is), bestStrategy );
-    }
-
-    /**
-     * Checks whether the given input stream is a CSV or not.
-     *
-     * @param is input stream to be verified.
-     * @return <code>true</code> if the given <code>is</code> input stream contains a <i>CSV</i> content.
-     *         <code>false</code> otherwise.
-     * @throws IOException
-     */
-    public static boolean isCSV(InputStream is) throws IOException {
-        return getBestStrategy(is) != null;
-    }
-
-    private static CSVStrategy getBestStrategy(InputStream is) throws IOException {
-        for( CSVStrategy strategy : strategies ) {
-            if( testStrategy(is, strategy) ) {
-                return strategy;
-            }
-        }
-        return null;
-    }
-
-    private static CSVStrategy getCsvStrategy(char delimiter, char comment) {
-        return new CSVStrategy(delimiter, '\'', comment);
-    }
-
-    private static CSVStrategy getCSVStrategyFromConfiguration() {
-        char fieldDelimiter = getCharValueFromConfiguration(
-                "any23.extraction.csv.field",
-                DEFAULT_FIELD_DELIMITER
-        );
-        char commentDelimiter = getCharValueFromConfiguration(
-                "any23.extraction.csv.comment",
-                DEFAULT_COMMENT_DELIMITER
-        );
-        return new CSVStrategy(fieldDelimiter, '\'', commentDelimiter);
-    }
-
-    private static char getCharValueFromConfiguration(String property, String defaultValue) {
-        String delimiter = defaultConfiguration.getProperty(
-                property,
-                defaultValue
-        );
-        if (delimiter.length() != 1 || delimiter.equals("")) {
-            throw new RuntimeException(property + " value must be a single character");
-        }
-        return delimiter.charAt(0);
-    }
-
-    /**
-     * make sure the reader has correct delimiter and quotation set.
-     * Check first lines and make sure they have the same amount of columns and at least 2
-     *
-     * @param is input stream to be checked
-     * @param strategy strategy to be verified.
-     * @return
-     * @throws IOException
-     * @param is
-     */
-    private static boolean testStrategy(InputStream is, CSVStrategy strategy) throws IOException {
-        final int MIN_COLUMNS = 2;
-
-        is.mark(Integer.MAX_VALUE);
-        try {
-            final CSVParser parser = new CSVParser(new InputStreamReader(is), strategy);
-            int linesToCheck = 5;
-            int headerColumnCount = -1;
-            while (linesToCheck > 0) {
-                String[] row;
-                row = parser.getLine();
-                if (row == null) {
-                    break;
-                }
-                if (row.length < MIN_COLUMNS) {
-                    return false;
-                }
-                if (headerColumnCount == -1) { // first row
-                    headerColumnCount = row.length;
-                } else { // make sure rows have the same number of columns or one more than the header
-                    if (row.length < headerColumnCount) {
-                        return false;
-                    } else if (row.length - 1 > headerColumnCount) {
-                        return false;
-                    }
-                }
-                linesToCheck--;
-            }
-            return true;
-        } finally {
-            is.reset();
-        }
-    }
-
-
-}

http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/csvutils/src/test/resources/log4j.properties
----------------------------------------------------------------------
diff --git a/csvutils/src/test/resources/log4j.properties b/csvutils/src/test/resources/log4j.properties
deleted file mode 100644
index a7ad0af..0000000
--- a/csvutils/src/test/resources/log4j.properties
+++ /dev/null
@@ -1,34 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-log4j.rootCategory=INFO, R, O  
-      
-# Stdout  
-log4j.appender.O=org.apache.log4j.ConsoleAppender  
-      
-# File  
-#log4j.appender.R=org.apache.log4j.RollingFileAppender  
-#log4j.appender.R.File=log4j.log  
-      
-# Control the maximum log file size  
-#log4j.appender.R.MaxFileSize=100KB  
-      
-# Archive log files (one backup file here)  
-log4j.appender.R.MaxBackupIndex=1  
-      
-log4j.appender.R.layout=org.apache.log4j.PatternLayout  
-log4j.appender.O.layout=org.apache.log4j.PatternLayout  
-      
-log4j.appender.R.layout.ConversionPattern=[%d{ISO8601}]%5p%6.6r[%t]%x - %C.%M(%F:%L) - %m%n  
-log4j.appender.O.layout.ConversionPattern=[%d{ISO8601}]%5p%6.6r[%t]%x - %C.%M(%F:%L) - %m%n  

http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/mime/pom.xml
----------------------------------------------------------------------
diff --git a/mime/pom.xml b/mime/pom.xml
index 9db7d3b..2014758 100644
--- a/mime/pom.xml
+++ b/mime/pom.xml
@@ -38,7 +38,7 @@
     </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
-      <artifactId>apache-any23-csvutils</artifactId>
+      <artifactId>apache-any23-utils</artifactId>
       <version>${project.version}</version>
     </dependency>
     <dependency>

http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/mime/src/main/java/org/apache/any23/mime/TikaMIMETypeDetector.java
----------------------------------------------------------------------
diff --git a/mime/src/main/java/org/apache/any23/mime/TikaMIMETypeDetector.java b/mime/src/main/java/org/apache/any23/mime/TikaMIMETypeDetector.java
index e0584a1..77955cb 100644
--- a/mime/src/main/java/org/apache/any23/mime/TikaMIMETypeDetector.java
+++ b/mime/src/main/java/org/apache/any23/mime/TikaMIMETypeDetector.java
@@ -36,6 +36,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.util.regex.Pattern;
+import org.apache.any23.extractor.yaml.YAMLValidator;
 
 /**
  * Implementation of {@link MIMETypeDetector} based on
@@ -134,6 +135,17 @@ public class TikaMIMETypeDetector implements MIMETypeDetector {
     }
 
     /**
+     * Checks if the stream contains a valid <i>YAML</i> content.
+     *
+     * @param is input stream to be checked; it is handed to {@link YAMLValidator#isYAML} as is.
+     * @return the value returned by {@link YAMLValidator#isYAML} for the given stream.
+     * @throws IOException declared by this method; presumably raised when the stream cannot be read.
+     */
+    public static boolean checkYAMLFormat(InputStream is) throws IOException {
+        return YAMLValidator.isYAML(is);
+    }
+
+    /**
      * Tries to apply one of the given patterns on a sample of the input stream.
      *
      * @param patterns the patterns to apply.
@@ -263,8 +275,9 @@ public class TikaMIMETypeDetector implements MIMETypeDetector {
                     type = RDFFormat.TURTLE.getDefaultMIMEType();
                 } else if( checkCSVFormat(input) ) {
                     type = CSV_MIMETYPE;
-                }
-                else {
+                } else if (checkYAMLFormat(input)) { // YAML detection must be at the end
+                    type = "text/x-yaml";
+                } else {
                     type = MimeTypes.OCTET_STREAM; 
                 }
             }

http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 23ab57f..ac2a9bd 100644
--- a/pom.xml
+++ b/pom.xml
@@ -199,7 +199,7 @@
   <modules>
     <module>api</module>
     <module>test-resources</module>
-    <module>csvutils</module>
+    <module>utils</module>
     <module>mime</module>
     <module>encoding</module>
     <module>core</module>
@@ -527,6 +527,11 @@
         <artifactId>metainf-services</artifactId>
         <version>1.5</version>
       </dependency>
+      <dependency>
+        <groupId>org.yaml</groupId>
+        <artifactId>snakeyaml</artifactId>
+        <version>1.17</version>
+      </dependency>
       <!-- END: plugins -->
 
       <!-- BEGIN: Test Dependencies -->

http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/utils/pom.xml
----------------------------------------------------------------------
diff --git a/utils/pom.xml b/utils/pom.xml
new file mode 100644
index 0000000..a6f34ec
--- /dev/null
+++ b/utils/pom.xml
@@ -0,0 +1,123 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <artifactId>apache-any23</artifactId>
+    <groupId>org.apache.any23</groupId>
+    <version>2.1-SNAPSHOT</version>
+    <relativePath>..</relativePath>
+  </parent>
+
+  <artifactId>apache-any23-utils</artifactId>
+
+  <name>Apache Any23 :: Utilities</name>
+  <description>Utilities library</description>
+
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>apache-any23-api</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.commons</groupId>
+      <artifactId>commons-csv</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.yaml</groupId>
+      <artifactId>snakeyaml</artifactId>
+    </dependency>
+    <!-- Logging -->
+    <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-log4j12</artifactId>
+      <version>${slf4j.logger.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <!-- Testing -->
+    <dependency>
+       <groupId>junit</groupId>
+       <artifactId>junit</artifactId>
+       <scope>test</scope>
+    </dependency>
+    <dependency>
+       <groupId>${project.groupId}</groupId>
+       <artifactId>apache-any23-test-resources</artifactId>
+       <version>${project.version}</version>
+       <type>test-jar</type>
+       <scope>test</scope>
+  </dependency>
+  </dependencies>
+
+  <build>
+    <resources>
+      <resource>
+        <directory>${basedir}/../</directory>
+        <targetPath>META-INF</targetPath>
+        <includes>
+          <include>LICENSE.txt</include>
+          <include>NOTICE.txt</include>
+        </includes>
+      </resource>
+    </resources>
+    <pluginManagement>
+      <plugins>
+        <plugin>
+          <groupId>org.apache.maven.plugins</groupId>
+          <artifactId>maven-assembly-plugin</artifactId>
+          <version>${maven-assembly-plugin.version}</version>
+          <executions>
+            <execution>
+              <id>assembly</id>
+              <phase>package</phase>
+              <goals>
+                <goal>single</goal>
+              </goals>
+            </execution>
+          </executions>
+          <configuration>
+            <attach>true</attach>
+            <skipAssembly>true</skipAssembly>
+            <tarLongFileMode>gnu</tarLongFileMode>
+          </configuration>
+        </plugin>
+      </plugins>
+    </pluginManagement>
+  </build>
+
+  <profiles>
+    <profile>
+      <id>release</id>
+      <build>
+        <resources>
+          <resource>
+            <directory>${basedir}/../</directory>
+            <targetPath>${project.build.directory}/apidocs/META-INF</targetPath>
+            <includes>
+              <include>LICENSE.txt</include>
+              <include>NOTICE.txt</include>
+            </includes>
+          </resource>
+        </resources>
+      </build>
+    </profile>
+  </profiles>
+
+</project>

http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/utils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java
----------------------------------------------------------------------
diff --git a/utils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java b/utils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java
new file mode 100644
index 0000000..75bb583
--- /dev/null
+++ b/utils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java
@@ -0,0 +1,166 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.extractor.csv;
+
+import org.apache.any23.configuration.DefaultConfiguration;
+import org.apache.commons.csv.CSVParser;
+import org.apache.commons.csv.CSVStrategy;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+
+/**
+ * Builds a {@link org.apache.commons.csv.CSVParser} for a CSV stream, first guessing the configuration
+ * from the content itself and then, if not successful, reading it from the {@link org.apache.any23.configuration.DefaultConfiguration}.
+ *
+ * @author Davide Palmisano ( dpalmisano@gmail.com )
+ * @author Michele Mostarda ( michele.mostarda@gmail.com )
+ */
+public class CSVReaderBuilder {
+
+    private static final String DEFAULT_FIELD_DELIMITER = ",";
+
+    private static final String DEFAULT_COMMENT_DELIMITER = "#";
+
+    public static final char NULL_CHAR = ' '; // NOTE(review): despite the name this is a space; it is passed as the comment character of the guessed strategies
+
+    private static final char[] popularDelimiters = {'\t', '|', ',', ';'};
+
+    private static DefaultConfiguration defaultConfiguration =
+            DefaultConfiguration.singleton();
+
+    private static final CSVStrategy[] strategies;
+
+    static {
+        strategies = new CSVStrategy[ popularDelimiters.length + 1 ]; // one slot for the library default plus one per popular delimiter
+        strategies[0] = CSVStrategy.DEFAULT_STRATEGY; // the library default is always tried first
+        int index = 1;
+        for(char dlmt : popularDelimiters) {
+            strategies[index++] = getCsvStrategy(dlmt, NULL_CHAR);
+        }
+    }
+
+    /**
+     * Builds a not <code>null</code> {@link org.apache.commons.csv.CSVParser}, guessing the strategy
+     * from the provided <i>CSV</i> content and falling back to the configured strategy when no guess succeeds.
+     *
+     * @param is {@link InputStream} of the <i>CSV</i> file from which to guess the configuration.
+     * @return a {@link CSVParser}
+     * @throws java.io.IOException if reading or rewinding the stream fails while guessing.
+     */
+    public static CSVParser build(InputStream is) throws IOException {
+        CSVStrategy bestStrategy = getBestStrategy(is);
+        if(bestStrategy == null) bestStrategy = getCSVStrategyFromConfiguration();
+        return new CSVParser( new InputStreamReader(is), bestStrategy ); // NOTE(review): decodes with the platform default charset
+    }
+
+    /**
+     * Checks whether the given input stream is a CSV or not.
+     *
+     * @param is input stream to be verified.
+     * @return <code>true</code> if the given <code>is</code> input stream contains a <i>CSV</i> content.
+     *         <code>false</code> otherwise.
+     * @throws IOException if reading or rewinding the stream fails.
+     */
+    public static boolean isCSV(InputStream is) throws IOException {
+        return getBestStrategy(is) != null;
+    }
+
+    private static CSVStrategy getBestStrategy(InputStream is) throws IOException { // first strategy accepted by testStrategy, or null
+        for( CSVStrategy strategy : strategies ) {
+            if( testStrategy(is, strategy) ) {
+                return strategy;
+            }
+        }
+        return null;
+    }
+
+    private static CSVStrategy getCsvStrategy(char delimiter, char comment) { // the quote character is always a single quote
+        return new CSVStrategy(delimiter, '\'', comment);
+    }
+
+    private static CSVStrategy getCSVStrategyFromConfiguration() { // fallback: delimiters come from configuration, with "," and "#" as defaults
+        char fieldDelimiter = getCharValueFromConfiguration(
+                "any23.extraction.csv.field",
+                DEFAULT_FIELD_DELIMITER
+        );
+        char commentDelimiter = getCharValueFromConfiguration(
+                "any23.extraction.csv.comment",
+                DEFAULT_COMMENT_DELIMITER
+        );
+        return new CSVStrategy(fieldDelimiter, '\'', commentDelimiter);
+    }
+
+    private static char getCharValueFromConfiguration(String property, String defaultValue) { // reads a one-character property value
+        String delimiter = defaultConfiguration.getProperty(
+                property,
+                defaultValue
+        );
+        if (delimiter.length() != 1 || delimiter.equals("")) { // the equals("") test is redundant: an empty string already fails length() != 1
+            throw new RuntimeException(property + " value must be a single character");
+        }
+        return delimiter.charAt(0);
+    }
+
+    /**
+     * Makes sure the reader has the correct delimiter and quotation set.
+     * Checks the first lines (at most 5) and makes sure they have a consistent amount of columns and at least 2.
+     *
+     * @param is input stream to be checked; it is marked on entry and reset before returning.
+     * @param strategy strategy to be verified.
+     * @return <code>true</code> if every checked row has at least 2 columns and no row after the first
+     *         has fewer columns than the first or more than one column extra; <code>false</code> otherwise.
+     * @throws IOException if reading or resetting the stream fails.
+     */
+    private static boolean testStrategy(InputStream is, CSVStrategy strategy) throws IOException {
+        final int MIN_COLUMNS = 2;
+
+        is.mark(Integer.MAX_VALUE); // remember the start so that every strategy is tried on the same bytes
+        try {
+            final CSVParser parser = new CSVParser(new InputStreamReader(is), strategy); // NOTE(review): platform default charset
+            int linesToCheck = 5;
+            int headerColumnCount = -1;
+            while (linesToCheck > 0) {
+                String[] row;
+                row = parser.getLine();
+                if (row == null) { // no more rows
+                    break;
+                }
+                if (row.length < MIN_COLUMNS) {
+                    return false;
+                }
+                if (headerColumnCount == -1) { // first row
+                    headerColumnCount = row.length;
+                } else { // make sure rows have the same number of columns or one more than the header
+                    if (row.length < headerColumnCount) {
+                        return false;
+                    } else if (row.length - 1 > headerColumnCount) {
+                        return false;
+                    }
+                }
+                linesToCheck--;
+            }
+            return true; // also reached when the parser yields no rows at all
+        } finally {
+            is.reset(); // rewind for the next strategy or for the final parser
+        }
+    }
+
+
+}

http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/utils/src/main/java/org/apache/any23/extractor/yaml/YAMLValidator.java
----------------------------------------------------------------------
diff --git a/utils/src/main/java/org/apache/any23/extractor/yaml/YAMLValidator.java b/utils/src/main/java/org/apache/any23/extractor/yaml/YAMLValidator.java
new file mode 100644
index 0000000..5a5f63d
--- /dev/null
+++ b/utils/src/main/java/org/apache/any23/extractor/yaml/YAMLValidator.java
@@ -0,0 +1,105 @@
+/*
+ * Copyright 2017 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.any23.extractor.yaml;
+
+import com.google.common.collect.Iterables;
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Scanner;
+import java.util.regex.Pattern;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.yaml.snakeyaml.Yaml;
+
+/**
+ * Utility class providing static methods for YAML validation.
+ *
+ * @author Jacek Grzebyta (grzebyta.dev [at] gmail.com)
+ */
+public class YAMLValidator {
+
+    private static final Logger log = LoggerFactory.getLogger(YAMLValidator.class);
+
+    private static final Pattern YAML_PATTERN = Pattern.compile("^%YAML.*", Pattern.CASE_INSENSITIVE);
+
+    /**
+     * Maximum number of code points inspected while looking for the "%YAML" head.
+     * The pattern is anchored to the start of the input, so a small bound gives
+     * the same answer without letting the scanner buffer the whole stream.
+     */
+    private static final int HEAD_HORIZON = 256;
+
+    /**
+     * Detects if <code>is</code> contains valid YAML content.
+     * <p>
+     * In the first instance it checks if the content starts with a "%YAML" head.
+     * If not, it falls back to the brute force method of parsing the input
+     * stream with the YAML parser.
+     * </p>
+     * <p>
+     * NB. Only "false" results are trusted. Even if the result is "true" you cannot
+     * be sure that the InputStream contains intentional YAML content because
+     * comma-separated values are parsable by the YAML parser as well.
+     * </p>
+     * <p>
+     * The stream is marked on entry and reset before returning. If it does not
+     * support mark/reset it is wrapped in a {@link BufferedInputStream} for the
+     * check; in that case the bytes consumed from the original stream cannot be
+     * pushed back, so callers that need to re-read the content should pass a
+     * mark-supporting stream.
+     * </p>
+     *
+     * @param is {@link InputStream} to inspect; may be <code>null</code>.
+     * @return <code>true</code> if the content starts with the "%YAML" head or the
+     *         YAML parser loads at least one document; <code>false</code> if
+     *         <code>is</code> is <code>null</code>, empty or not parsable.
+     * @throws IOException if reading or resetting the stream fails.
+     */
+    public static boolean isYAML(InputStream is) throws IOException {
+        if (is == null) {
+            return false;
+        }
+
+        if (!is.markSupported()) {
+            is = new BufferedInputStream(is);
+        }
+
+        // mark the reading frame position. MUST BE FIRST
+        is.mark(Integer.MAX_VALUE);
+        try {
+            // Probe one byte instead of trusting available(), which may report 0
+            // for a stream that still has data to deliver.
+            if (is.read() < 0) {
+                return false;
+            }
+            is.reset();
+
+            if (startsWithYamlHead(is)) {
+                return true;
+            }
+            is.reset();
+
+            return isParsable(is);
+        } finally {
+            is.reset(); // MUST BE AT THE END
+        }
+    }
+
+    /** Checks whether the content begins with the "%YAML" head (case-insensitive). */
+    private static boolean startsWithYamlHead(InputStream is) {
+        // The Scanner is deliberately not closed: closing it would close the caller's stream.
+        // An explicit charset keeps the check independent of the platform default.
+        Scanner sc = new Scanner(is, "UTF-8");
+        String out = sc.findWithinHorizon(YAML_PATTERN, HEAD_HORIZON);
+
+        if (out != null && !out.isEmpty()) {
+            log.debug("Head: {}", out);
+            return true;
+        }
+        log.debug("Still not found. output is: {}", out);
+        return false;
+    }
+
+    /** Brute force check: the content counts as YAML if the parser loads at least one document. */
+    private static boolean isParsable(InputStream is) {
+        try {
+            // NOTE(review): the default Yaml constructor can instantiate arbitrary types named
+            // by tags in the document; if the input may be untrusted, consider a safe constructor.
+            Yaml yml = new Yaml();
+            Iterable<Object> parsedOut = yml.loadAll(is);
+
+            // loadAll is lazy: counting the documents is what actually drives the parser.
+            return Iterables.size(parsedOut) > 0;
+        } catch (Exception ex) {
+            // Best effort by design: any parser failure simply means "not YAML".
+            log.debug("Content is not parsable as YAML", ex);
+            return false;
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/utils/src/test/java/org/apache/any23/yaml/utils/YAMLValidatorTest.java
----------------------------------------------------------------------
diff --git a/utils/src/test/java/org/apache/any23/yaml/utils/YAMLValidatorTest.java b/utils/src/test/java/org/apache/any23/yaml/utils/YAMLValidatorTest.java
new file mode 100644
index 0000000..fddf2fb
--- /dev/null
+++ b/utils/src/test/java/org/apache/any23/yaml/utils/YAMLValidatorTest.java
@@ -0,0 +1,66 @@
+/*
+ * Copyright 2017 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.any23.yaml.utils;
+
+import org.apache.any23.extractor.yaml.YAMLValidator;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collection;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * @author Jacek Grzebyta ( grzebyta.dev [at] gmail.com)
+ */
+@RunWith(Parameterized.class)
+public class YAMLValidatorTest {
+
+    private String path;
+
+    private Boolean expected;
+
+    private Logger log = LoggerFactory.getLogger(getClass());
+
+    public YAMLValidatorTest(String path, Boolean expected) {
+        this.path = path;
+        this.expected = expected;
+    }
+
+    @Parameterized.Parameters
+    public static Collection<Object[]> getFiles() {
+        return Arrays.asList(new Object[][]{
+            {"/org/apache/any23/extractor/yaml/simple-load.yml", Boolean.TRUE},
+            {"/org/apache/any23/extractor/yaml/simple-load_no_head.yml", Boolean.TRUE},
+            {"/org/apache/any23/extractor/yaml/different-integers.yml", Boolean.TRUE},
+            {"/org/apache/any23/extractor/yaml/different-float.yml", Boolean.TRUE},
+            {"/org/apache/any23/extractor/csv/test-comma.csv", Boolean.TRUE}});
+    }
+
+    @Test
+    public void runTest()
+            throws Exception {
+        log.info("Try path: {}", path);
+        InputStream is = YAMLValidatorTest.class.getResourceAsStream(path);
+        boolean result = YAMLValidator.isYAML(is);
+        log.debug("Test resutl: {}", result);
+        Assert.assertSame(expected, result);
+
+    }
+}

http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/utils/src/test/resources/log4j.properties
----------------------------------------------------------------------
diff --git a/utils/src/test/resources/log4j.properties b/utils/src/test/resources/log4j.properties
new file mode 100644
index 0000000..3860396
--- /dev/null
+++ b/utils/src/test/resources/log4j.properties
@@ -0,0 +1,35 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# log4j.rootCategory=INFO, R, O  
+log4j.rootCategory=INFO, O  
+      
+# Stdout  
+log4j.appender.O=org.apache.log4j.ConsoleAppender  
+      
+# File  
+#log4j.appender.R=org.apache.log4j.RollingFileAppender  
+#log4j.appender.R.File=log4j.log  
+      
+# Control the maximum log file size  
+#log4j.appender.R.MaxFileSize=100KB  
+      
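+# Archive log files (one backup file here). NOTE(review): appender R is not declared in this file (its definition above is commented out), so the R.* settings below look inert.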
+log4j.appender.R.MaxBackupIndex=1  
+      
+log4j.appender.R.layout=org.apache.log4j.PatternLayout  
+log4j.appender.O.layout=org.apache.log4j.PatternLayout  
+      
+log4j.appender.R.layout.ConversionPattern=[%d{ISO8601}]%5p%6.6r[%t]%x - %C.%M(%F:%L) - %m%n  
+log4j.appender.O.layout.ConversionPattern=[%d{ISO8601}]%5p%6.6r[%t]%x - %C.%M(%F:%L) - %m%n  


[4/6] any23 git commit: Ref ANY23-308

Posted by an...@apache.org.
Ref ANY23-308

- restore csvutils
- detect yaml based on the file name
- remove utils module

Signed-off-by: Jacek Grzebyta <gr...@gmail.com>

Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/9839e212
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/9839e212
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/9839e212

Branch: refs/heads/master
Commit: 9839e212ec265237021219c0fff120929906e7bf
Parents: 0b82fdc
Author: Jacek Grzebyta <gr...@gmail.com>
Authored: Wed Jul 12 18:52:43 2017 +0100
Committer: Jacek Grzebyta <gr...@gmail.com>
Committed: Wed Jul 12 18:52:43 2017 +0100

----------------------------------------------------------------------
 cli/src/test/resources/log4j.properties         |   2 +
 .../any23/extractor/yaml/YAMLExtractorTest.java |  13 --
 .../extractor/yaml/YAMLTikaParserTest.java      |  30 +++-
 csvutils/pom.xml                                | 106 ++++++++++++
 .../any23/extractor/csv/CSVReaderBuilder.java   | 166 +++++++++++++++++++
 csvutils/src/test/resources/log4j.properties    |  34 ++++
 mime/pom.xml                                    |   2 +-
 .../apache/any23/mime/TikaMIMETypeDetector.java |  14 --
 pom.xml                                         |   2 +-
 utils/pom.xml                                   | 123 --------------
 .../any23/extractor/csv/CSVReaderBuilder.java   | 166 -------------------
 .../any23/extractor/yaml/YAMLValidator.java     | 105 ------------
 .../any23/yaml/utils/YAMLValidatorTest.java     |  66 --------
 utils/src/test/resources/log4j.properties       |  35 ----
 14 files changed, 336 insertions(+), 528 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/any23/blob/9839e212/cli/src/test/resources/log4j.properties
----------------------------------------------------------------------
diff --git a/cli/src/test/resources/log4j.properties b/cli/src/test/resources/log4j.properties
index 1918a4b..f286f6a 100644
--- a/cli/src/test/resources/log4j.properties
+++ b/cli/src/test/resources/log4j.properties
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+log4j.logger.org.apache.any23.cli.YAMLRoverTest=trace
+
 log4j.rootLogger=INFO, A1
 
 # A1 is set to be a ConsoleAppender.

http://git-wip-us.apache.org/repos/asf/any23/blob/9839e212/core/src/test/java/org/apache/any23/extractor/yaml/YAMLExtractorTest.java
----------------------------------------------------------------------
diff --git a/core/src/test/java/org/apache/any23/extractor/yaml/YAMLExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/yaml/YAMLExtractorTest.java
index b265c5f..f2c85ba 100644
--- a/core/src/test/java/org/apache/any23/extractor/yaml/YAMLExtractorTest.java
+++ b/core/src/test/java/org/apache/any23/extractor/yaml/YAMLExtractorTest.java
@@ -92,17 +92,4 @@ public class YAMLExtractorTest extends AbstractExtractorTestCase {
         RepositoryResult<Statement> docs = getStatements(null, null, RDF.NIL);
         Assert.assertTrue(Iterations.asList(docs).size() == 2);
     }
-
-    /**
-     * Comma separated values are parsed as well.
-     *
-     * @throws Exception
-     */
-    @Test
-    public void csvTest()
-            throws Exception {
-        assertExtract("/org/apache/any23/extractor/csv/test-comma.csv");
-        log.debug(dumpModelToTurtle());
-        assertModelNotEmpty();
-    }
 }

http://git-wip-us.apache.org/repos/asf/any23/blob/9839e212/core/src/test/java/org/apache/any23/extractor/yaml/YAMLTikaParserTest.java
----------------------------------------------------------------------
diff --git a/core/src/test/java/org/apache/any23/extractor/yaml/YAMLTikaParserTest.java b/core/src/test/java/org/apache/any23/extractor/yaml/YAMLTikaParserTest.java
index 4727c84..680b9fa 100644
--- a/core/src/test/java/org/apache/any23/extractor/yaml/YAMLTikaParserTest.java
+++ b/core/src/test/java/org/apache/any23/extractor/yaml/YAMLTikaParserTest.java
@@ -15,11 +15,13 @@
  */
 package org.apache.any23.extractor.yaml;
 
+import com.mchange.util.AssertException;
 import java.io.InputStream;
 import org.apache.any23.mime.MIMEType;
 import org.apache.any23.mime.TikaMIMETypeDetector;
 import org.apache.any23.mime.purifier.WhiteSpacesPurifier;
 import org.junit.Assert;
+import org.junit.Before;
 import org.junit.Test;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -32,16 +34,36 @@ public class YAMLTikaParserTest {
     private static final String file1 = "/org/apache/any23/extractor/yaml/simple-load.yml";
 
     private final Logger log = LoggerFactory.getLogger(getClass());
+    
+    private TikaMIMETypeDetector detector;
+    
+    @Before
+    public void prepareDetector() throws Exception {
+        detector = new TikaMIMETypeDetector(new WhiteSpacesPurifier());
+    }
 
-    @Test
-    public void tikaDetect()
+    /**
+     * The YAML type is detected by file name only, so stream-based detection returns the octet type and the assertion below is expected to fail.
+     * @throws Exception 
+     */
+    @Test(expected = AssertionError.class)
+    public void tikaStreamDetect()
             throws Exception {
         InputStream is = YAMLTikaParserTest.class.getResourceAsStream(file1);
-        TikaMIMETypeDetector detector = new TikaMIMETypeDetector(new WhiteSpacesPurifier());
         MIMEType type = detector.guessMIMEType(null, is, null);
 
         log.info("Type: {}", type.toString());
-
+        Assert.assertEquals("text/x-yaml", type.toString());
+    }
+    
+    @Test
+    public void tikaNameDetect() throws Exception {
+        String fileName = java.net.URI.create(file1).getPath();
+        
+        log.debug("normatised file name: {}", fileName);
+        MIMEType type = detector.guessMIMEType(fileName, null, null);
+        
+        log.info("Type: {}", type.toString());
         Assert.assertEquals("text/x-yaml", type.toString());
     }
 

http://git-wip-us.apache.org/repos/asf/any23/blob/9839e212/csvutils/pom.xml
----------------------------------------------------------------------
diff --git a/csvutils/pom.xml b/csvutils/pom.xml
new file mode 100644
index 0000000..8f5b18d
--- /dev/null
+++ b/csvutils/pom.xml
@@ -0,0 +1,106 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <artifactId>apache-any23</artifactId>
+    <groupId>org.apache.any23</groupId>
+    <version>2.1-SNAPSHOT</version>
+    <relativePath>..</relativePath>
+  </parent>
+
+  <artifactId>apache-any23-csvutils</artifactId>
+
+  <name>Apache Any23 :: CSV Utilities</name>
+  <description>CSV specific library.</description>
+
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>apache-any23-api</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.commons</groupId>
+      <artifactId>commons-csv</artifactId>
+    </dependency>
+    <!-- Logging -->
+    <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-log4j12</artifactId>
+      <version>${slf4j.logger.version}</version>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+
+  <build>
+    <resources>
+      <resource>
+        <directory>${basedir}/../</directory>
+        <targetPath>META-INF</targetPath>
+        <includes>
+          <include>LICENSE.txt</include>
+          <include>NOTICE.txt</include>
+        </includes>
+      </resource>
+    </resources>
+    <pluginManagement>
+      <plugins>
+        <plugin>
+          <groupId>org.apache.maven.plugins</groupId>
+          <artifactId>maven-assembly-plugin</artifactId>
+          <version>${maven-assembly-plugin.version}</version>
+          <executions>
+            <execution>
+              <id>assembly</id>
+              <phase>package</phase>
+              <goals>
+                <goal>single</goal>
+              </goals>
+            </execution>
+          </executions>
+          <configuration>
+            <attach>true</attach>
+            <skipAssembly>true</skipAssembly>
+            <tarLongFileMode>gnu</tarLongFileMode>
+          </configuration>
+        </plugin>
+      </plugins>
+    </pluginManagement>
+  </build>
+
+  <profiles>
+    <profile>
+      <id>release</id>
+      <build>
+        <resources>
+          <resource>
+            <directory>${basedir}/../</directory>
+            <targetPath>${project.build.directory}/apidocs/META-INF</targetPath>
+            <includes>
+              <include>LICENSE.txt</include>
+              <include>NOTICE.txt</include>
+            </includes>
+          </resource>
+        </resources>
+      </build>
+    </profile>
+  </profiles>
+
+</project>

http://git-wip-us.apache.org/repos/asf/any23/blob/9839e212/csvutils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java
----------------------------------------------------------------------
diff --git a/csvutils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java b/csvutils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java
new file mode 100644
index 0000000..75bb583
--- /dev/null
+++ b/csvutils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java
@@ -0,0 +1,166 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.extractor.csv;
+
+import org.apache.any23.configuration.DefaultConfiguration;
+import org.apache.commons.csv.CSVParser;
+import org.apache.commons.csv.CSVStrategy;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+
+/**
+ * Builds a {@link org.apache.commons.csv.CSVParser} for a CSV stream, first guessing the configuration
+ * from the content itself and then, if not successful, reading it from the {@link org.apache.any23.configuration.DefaultConfiguration}.
+ *
+ * @author Davide Palmisano ( dpalmisano@gmail.com )
+ * @author Michele Mostarda ( michele.mostarda@gmail.com )
+ */
+public class CSVReaderBuilder {
+
+    private static final String DEFAULT_FIELD_DELIMITER = ",";
+
+    private static final String DEFAULT_COMMENT_DELIMITER = "#";
+
+    public static final char NULL_CHAR = ' '; // NOTE(review): despite the name this is a space; it is passed as the comment character of the guessed strategies
+
+    private static final char[] popularDelimiters = {'\t', '|', ',', ';'};
+
+    private static DefaultConfiguration defaultConfiguration =
+            DefaultConfiguration.singleton();
+
+    private static final CSVStrategy[] strategies;
+
+    static {
+        strategies = new CSVStrategy[ popularDelimiters.length + 1 ]; // one slot for the library default plus one per popular delimiter
+        strategies[0] = CSVStrategy.DEFAULT_STRATEGY; // the library default is always tried first
+        int index = 1;
+        for(char dlmt : popularDelimiters) {
+            strategies[index++] = getCsvStrategy(dlmt, NULL_CHAR);
+        }
+    }
+
+    /**
+     * Builds a not <code>null</code> {@link org.apache.commons.csv.CSVParser}, guessing the strategy
+     * from the provided <i>CSV</i> content and falling back to the configured strategy when no guess succeeds.
+     *
+     * @param is {@link InputStream} of the <i>CSV</i> file from which to guess the configuration.
+     * @return a {@link CSVParser}
+     * @throws java.io.IOException if reading or rewinding the stream fails while guessing.
+     */
+    public static CSVParser build(InputStream is) throws IOException {
+        CSVStrategy bestStrategy = getBestStrategy(is);
+        if(bestStrategy == null) bestStrategy = getCSVStrategyFromConfiguration();
+        return new CSVParser( new InputStreamReader(is), bestStrategy ); // NOTE(review): decodes with the platform default charset
+    }
+
+    /**
+     * Checks whether the given input stream is a CSV or not.
+     *
+     * @param is input stream to be verified.
+     * @return <code>true</code> if the given <code>is</code> input stream contains a <i>CSV</i> content.
+     *         <code>false</code> otherwise.
+     * @throws IOException if reading or rewinding the stream fails.
+     */
+    public static boolean isCSV(InputStream is) throws IOException {
+        return getBestStrategy(is) != null;
+    }
+
+    private static CSVStrategy getBestStrategy(InputStream is) throws IOException { // first strategy accepted by testStrategy, or null
+        for( CSVStrategy strategy : strategies ) {
+            if( testStrategy(is, strategy) ) {
+                return strategy;
+            }
+        }
+        return null;
+    }
+
+    private static CSVStrategy getCsvStrategy(char delimiter, char comment) { // the quote character is always a single quote
+        return new CSVStrategy(delimiter, '\'', comment);
+    }
+
+    private static CSVStrategy getCSVStrategyFromConfiguration() { // fallback: delimiters come from configuration, with "," and "#" as defaults
+        char fieldDelimiter = getCharValueFromConfiguration(
+                "any23.extraction.csv.field",
+                DEFAULT_FIELD_DELIMITER
+        );
+        char commentDelimiter = getCharValueFromConfiguration(
+                "any23.extraction.csv.comment",
+                DEFAULT_COMMENT_DELIMITER
+        );
+        return new CSVStrategy(fieldDelimiter, '\'', commentDelimiter);
+    }
+
+    private static char getCharValueFromConfiguration(String property, String defaultValue) { // reads a one-character property value
+        String delimiter = defaultConfiguration.getProperty(
+                property,
+                defaultValue
+        );
+        if (delimiter.length() != 1 || delimiter.equals("")) { // the equals("") test is redundant: an empty string already fails length() != 1
+            throw new RuntimeException(property + " value must be a single character");
+        }
+        return delimiter.charAt(0);
+    }
+
+    /**
+     * Makes sure the reader has the correct delimiter and quotation set.
+     * Checks the first lines (at most 5) and makes sure they have a consistent amount of columns and at least 2.
+     *
+     * @param is input stream to be checked; it is marked on entry and reset before returning.
+     * @param strategy strategy to be verified.
+     * @return <code>true</code> if every checked row has at least 2 columns and no row after the first
+     *         has fewer columns than the first or more than one column extra; <code>false</code> otherwise.
+     * @throws IOException if reading or resetting the stream fails.
+     */
+    private static boolean testStrategy(InputStream is, CSVStrategy strategy) throws IOException {
+        final int MIN_COLUMNS = 2;
+
+        is.mark(Integer.MAX_VALUE); // remember the start so that every strategy is tried on the same bytes
+        try {
+            final CSVParser parser = new CSVParser(new InputStreamReader(is), strategy); // NOTE(review): platform default charset
+            int linesToCheck = 5;
+            int headerColumnCount = -1;
+            while (linesToCheck > 0) {
+                String[] row;
+                row = parser.getLine();
+                if (row == null) { // no more rows
+                    break;
+                }
+                if (row.length < MIN_COLUMNS) {
+                    return false;
+                }
+                if (headerColumnCount == -1) { // first row
+                    headerColumnCount = row.length;
+                } else { // make sure rows have the same number of columns or one more than the header
+                    if (row.length < headerColumnCount) {
+                        return false;
+                    } else if (row.length - 1 > headerColumnCount) {
+                        return false;
+                    }
+                }
+                linesToCheck--;
+            }
+            return true; // also reached when the parser yields no rows at all
+        } finally {
+            is.reset(); // rewind for the next strategy or for the final parser
+        }
+    }
+
+
+}

http://git-wip-us.apache.org/repos/asf/any23/blob/9839e212/csvutils/src/test/resources/log4j.properties
----------------------------------------------------------------------
diff --git a/csvutils/src/test/resources/log4j.properties b/csvutils/src/test/resources/log4j.properties
new file mode 100644
index 0000000..a7ad0af
--- /dev/null
+++ b/csvutils/src/test/resources/log4j.properties
@@ -0,0 +1,34 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+log4j.rootCategory=INFO, R, O  
+      
+# Stdout  
+log4j.appender.O=org.apache.log4j.ConsoleAppender  
+      
+# File  
+#log4j.appender.R=org.apache.log4j.RollingFileAppender  
+#log4j.appender.R.File=log4j.log  
+      
+# Control the maximum log file size  
+#log4j.appender.R.MaxFileSize=100KB  
+      
+# Archive log files (one backup file here)  
+log4j.appender.R.MaxBackupIndex=1  
+      
+log4j.appender.R.layout=org.apache.log4j.PatternLayout  
+log4j.appender.O.layout=org.apache.log4j.PatternLayout  
+      
+log4j.appender.R.layout.ConversionPattern=[%d{ISO8601}]%5p%6.6r[%t]%x - %C.%M(%F:%L) - %m%n  
+log4j.appender.O.layout.ConversionPattern=[%d{ISO8601}]%5p%6.6r[%t]%x - %C.%M(%F:%L) - %m%n  

http://git-wip-us.apache.org/repos/asf/any23/blob/9839e212/mime/pom.xml
----------------------------------------------------------------------
diff --git a/mime/pom.xml b/mime/pom.xml
index 2014758..9db7d3b 100644
--- a/mime/pom.xml
+++ b/mime/pom.xml
@@ -38,7 +38,7 @@
     </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
-      <artifactId>apache-any23-utils</artifactId>
+      <artifactId>apache-any23-csvutils</artifactId>
       <version>${project.version}</version>
     </dependency>
     <dependency>

http://git-wip-us.apache.org/repos/asf/any23/blob/9839e212/mime/src/main/java/org/apache/any23/mime/TikaMIMETypeDetector.java
----------------------------------------------------------------------
diff --git a/mime/src/main/java/org/apache/any23/mime/TikaMIMETypeDetector.java b/mime/src/main/java/org/apache/any23/mime/TikaMIMETypeDetector.java
index 77955cb..0c1f80b 100644
--- a/mime/src/main/java/org/apache/any23/mime/TikaMIMETypeDetector.java
+++ b/mime/src/main/java/org/apache/any23/mime/TikaMIMETypeDetector.java
@@ -36,7 +36,6 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.util.regex.Pattern;
-import org.apache.any23.extractor.yaml.YAMLValidator;
 
 /**
  * Implementation of {@link MIMETypeDetector} based on
@@ -135,17 +134,6 @@ public class TikaMIMETypeDetector implements MIMETypeDetector {
     }
 
     /**
-     * Checks if the stream contains a valid <i>YAML</i> content.
-     *
-     * @param is
-     * @return
-     * @throws IOException
-     */
-    public static boolean checkYAMLFormat(InputStream is) throws IOException {
-        return YAMLValidator.isYAML(is);
-    }
-
-    /**
      * Tries to apply one of the given patterns on a sample of the input stream.
      *
      * @param patterns the patterns to apply.
@@ -275,8 +263,6 @@ public class TikaMIMETypeDetector implements MIMETypeDetector {
                     type = RDFFormat.TURTLE.getDefaultMIMEType();
                 } else if( checkCSVFormat(input) ) {
                     type = CSV_MIMETYPE;
-                } else if (checkYAMLFormat(input)) { // YAML detection must be at the end
-                    type = "text/x-yaml";
                 } else {
                     type = MimeTypes.OCTET_STREAM; 
                 }

http://git-wip-us.apache.org/repos/asf/any23/blob/9839e212/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 0bff3cf..dce4c53 100644
--- a/pom.xml
+++ b/pom.xml
@@ -199,7 +199,7 @@
   <modules>
     <module>api</module>
     <module>test-resources</module>
-    <module>utils</module>
+    <module>csvutils</module>
     <module>mime</module>
     <module>encoding</module>
     <module>core</module>

http://git-wip-us.apache.org/repos/asf/any23/blob/9839e212/utils/pom.xml
----------------------------------------------------------------------
diff --git a/utils/pom.xml b/utils/pom.xml
deleted file mode 100644
index a6f34ec..0000000
--- a/utils/pom.xml
+++ /dev/null
@@ -1,123 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
-  Licensed to the Apache Software Foundation (ASF) under one or more
-  contributor license agreements.  See the NOTICE file distributed with
-  this work for additional information regarding copyright ownership.
-  The ASF licenses this file to You under the Apache License, Version 2.0
-  (the "License"); you may not use this file except in compliance with
-  the License.  You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
--->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-
-  <parent>
-    <artifactId>apache-any23</artifactId>
-    <groupId>org.apache.any23</groupId>
-    <version>2.1-SNAPSHOT</version>
-    <relativePath>..</relativePath>
-  </parent>
-
-  <artifactId>apache-any23-utils</artifactId>
-
-  <name>Apache Any23 :: Utilities</name>
-  <description>Utilities library</description>
-
-  <dependencies>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>apache-any23-api</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.commons</groupId>
-      <artifactId>commons-csv</artifactId>
-    </dependency>
-    <dependency>
-      <groupId>org.yaml</groupId>
-      <artifactId>snakeyaml</artifactId>
-    </dependency>
-    <!-- Logging -->
-    <dependency>
-      <groupId>org.slf4j</groupId>
-      <artifactId>slf4j-log4j12</artifactId>
-      <version>${slf4j.logger.version}</version>
-      <scope>test</scope>
-    </dependency>
-    <!-- Testing -->
-    <dependency>
-       <groupId>junit</groupId>
-       <artifactId>junit</artifactId>
-       <scope>test</scope>
-    </dependency>
-    <dependency>
-       <groupId>${project.groupId}</groupId>
-       <artifactId>apache-any23-test-resources</artifactId>
-       <version>${project.version}</version>
-       <type>test-jar</type>
-       <scope>test</scope>
-  </dependency>
-  </dependencies>
-
-  <build>
-    <resources>
-      <resource>
-        <directory>${basedir}/../</directory>
-        <targetPath>META-INF</targetPath>
-        <includes>
-          <include>LICENSE.txt</include>
-          <include>NOTICE.txt</include>
-        </includes>
-      </resource>
-    </resources>
-    <pluginManagement>
-      <plugins>
-        <plugin>
-          <groupId>org.apache.maven.plugins</groupId>
-          <artifactId>maven-assembly-plugin</artifactId>
-          <version>${maven-assembly-plugin.version}</version>
-          <executions>
-            <execution>
-              <id>assembly</id>
-              <phase>package</phase>
-              <goals>
-                <goal>single</goal>
-              </goals>
-            </execution>
-          </executions>
-          <configuration>
-            <attach>true</attach>
-            <skipAssembly>true</skipAssembly>
-            <tarLongFileMode>gnu</tarLongFileMode>
-          </configuration>
-        </plugin>
-      </plugins>
-    </pluginManagement>
-  </build>
-
-  <profiles>
-    <profile>
-      <id>release</id>
-      <build>
-        <resources>
-          <resource>
-            <directory>${basedir}/../</directory>
-            <targetPath>${project.build.directory}/apidocs/META-INF</targetPath>
-            <includes>
-              <include>LICENSE.txt</include>
-              <include>NOTICE.txt</include>
-            </includes>
-          </resource>
-        </resources>
-      </build>
-    </profile>
-  </profiles>
-
-</project>

http://git-wip-us.apache.org/repos/asf/any23/blob/9839e212/utils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java
----------------------------------------------------------------------
diff --git a/utils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java b/utils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java
deleted file mode 100644
index 75bb583..0000000
--- a/utils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java
+++ /dev/null
@@ -1,166 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.any23.extractor.csv;
-
-import org.apache.any23.configuration.DefaultConfiguration;
-import org.apache.commons.csv.CSVParser;
-import org.apache.commons.csv.CSVStrategy;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-
-/**
- * This class is responsible to build a reader first guessing the configuration
- * from the file it self and then, if not successful, from the {@link org.apache.any23.configuration.DefaultConfiguration}.
- *
- * @author Davide Palmisano ( dpalmisano@gmail.com )
- * @author Michele Mostarda ( michele.mostarda@gmail.com )
- */
-public class CSVReaderBuilder {
-
-    private static final String DEFAULT_FIELD_DELIMITER = ",";
-
-    private static final String DEFAULT_COMMENT_DELIMITER = "#";
-
-    public static final char NULL_CHAR = ' ';
-
-    private static final char[] popularDelimiters = {'\t', '|', ',', ';'};
-
-    private static DefaultConfiguration defaultConfiguration =
-            DefaultConfiguration.singleton();
-
-    private static final CSVStrategy[] strategies;
-
-    static {
-        strategies = new CSVStrategy[ popularDelimiters.length + 1 ];
-        strategies[0] = CSVStrategy.DEFAULT_STRATEGY;
-        int index = 1;
-        for(char dlmt : popularDelimiters) {
-            strategies[index++] = getCsvStrategy(dlmt, NULL_CHAR);
-        }
-    }
-
-    /**
-     * Builds a not <code>null</code> {@link org.apache.commons.csv.CSVParser} guessing
-     * from the provided <i>CSV</i> file.
-     *
-     * @param is {@link InputStream} of the <i>CSV</i> file where guess the configuration.
-     * @return a {@link CSVParser}
-     * @throws java.io.IOException
-     */
-    public static CSVParser build(InputStream is) throws IOException {
-        CSVStrategy bestStrategy = getBestStrategy(is);
-        if(bestStrategy == null) bestStrategy = getCSVStrategyFromConfiguration();
-        return new CSVParser( new InputStreamReader(is), bestStrategy );
-    }
-
-    /**
-     * Checks whether the given input stream is a CSV or not.
-     *
-     * @param is input stream to be verified.
-     * @return <code>true</code> if the given <code>is</code> input stream contains a <i>CSV</i> content.
-     *         <code>false</code> otherwise.
-     * @throws IOException
-     */
-    public static boolean isCSV(InputStream is) throws IOException {
-        return getBestStrategy(is) != null;
-    }
-
-    private static CSVStrategy getBestStrategy(InputStream is) throws IOException {
-        for( CSVStrategy strategy : strategies ) {
-            if( testStrategy(is, strategy) ) {
-                return strategy;
-            }
-        }
-        return null;
-    }
-
-    private static CSVStrategy getCsvStrategy(char delimiter, char comment) {
-        return new CSVStrategy(delimiter, '\'', comment);
-    }
-
-    private static CSVStrategy getCSVStrategyFromConfiguration() {
-        char fieldDelimiter = getCharValueFromConfiguration(
-                "any23.extraction.csv.field",
-                DEFAULT_FIELD_DELIMITER
-        );
-        char commentDelimiter = getCharValueFromConfiguration(
-                "any23.extraction.csv.comment",
-                DEFAULT_COMMENT_DELIMITER
-        );
-        return new CSVStrategy(fieldDelimiter, '\'', commentDelimiter);
-    }
-
-    private static char getCharValueFromConfiguration(String property, String defaultValue) {
-        String delimiter = defaultConfiguration.getProperty(
-                property,
-                defaultValue
-        );
-        if (delimiter.length() != 1 || delimiter.equals("")) {
-            throw new RuntimeException(property + " value must be a single character");
-        }
-        return delimiter.charAt(0);
-    }
-
-    /**
-     * make sure the reader has correct delimiter and quotation set.
-     * Check first lines and make sure they have the same amount of columns and at least 2
-     *
-     * @param is input stream to be checked
-     * @param strategy strategy to be verified.
-     * @return
-     * @throws IOException
-     * @param is
-     */
-    private static boolean testStrategy(InputStream is, CSVStrategy strategy) throws IOException {
-        final int MIN_COLUMNS = 2;
-
-        is.mark(Integer.MAX_VALUE);
-        try {
-            final CSVParser parser = new CSVParser(new InputStreamReader(is), strategy);
-            int linesToCheck = 5;
-            int headerColumnCount = -1;
-            while (linesToCheck > 0) {
-                String[] row;
-                row = parser.getLine();
-                if (row == null) {
-                    break;
-                }
-                if (row.length < MIN_COLUMNS) {
-                    return false;
-                }
-                if (headerColumnCount == -1) { // first row
-                    headerColumnCount = row.length;
-                } else { // make sure rows have the same number of columns or one more than the header
-                    if (row.length < headerColumnCount) {
-                        return false;
-                    } else if (row.length - 1 > headerColumnCount) {
-                        return false;
-                    }
-                }
-                linesToCheck--;
-            }
-            return true;
-        } finally {
-            is.reset();
-        }
-    }
-
-
-}

http://git-wip-us.apache.org/repos/asf/any23/blob/9839e212/utils/src/main/java/org/apache/any23/extractor/yaml/YAMLValidator.java
----------------------------------------------------------------------
diff --git a/utils/src/main/java/org/apache/any23/extractor/yaml/YAMLValidator.java b/utils/src/main/java/org/apache/any23/extractor/yaml/YAMLValidator.java
deleted file mode 100644
index 5a5f63d..0000000
--- a/utils/src/main/java/org/apache/any23/extractor/yaml/YAMLValidator.java
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright 2017 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.any23.extractor.yaml;
-
-import com.google.common.collect.Iterables;
-import java.io.BufferedInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Scanner;
-import java.util.regex.Pattern;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.yaml.snakeyaml.Yaml;
-
-/**
- * Utility class provides static methods for YAML validation.
- *
- * @author Jacek Grzebyta (grzebyta.dev [at] gmail.com)
- */
-public class YAMLValidator {
-
-    private static final Logger log = LoggerFactory.getLogger(YAMLValidator.class);
-
-    private static final Pattern YAML_PATTERN = Pattern.compile("^%YAML.*", Pattern.CASE_INSENSITIVE);
-
-    /**
-     * Detects if is contains valid YAML content.
-     * <p>
-     * In the first instance it checks if there is "%YAML" head. If not check
-     * using the brute force method by parsing input stream with yaml parser.
-     * </p>
-     * <p>
-     * NB. Only "false" results are trusted. Even if result is "true" you cannot
-     * be sure that InputStream contains YAML intentional context because
-     * comma-separated-values are pars-able by YAML parser as well.
-     * </p>
-     *
-     * @param is {@link InputStream}
-     * @return
-     * @throws IOException
-     */
-    public static boolean isYAML(InputStream is) throws IOException {
-        if (is == null) {
-            return false;
-        }
-
-        if (!is.markSupported()) {
-            is = new BufferedInputStream(is);
-        }
-
-        boolean result = false;
-
-        // mark the reading frame position. MUST BE FIRST
-        is.mark(Integer.MAX_VALUE);
-
-        while (true) {
-            // if is is empty than return false
-            if (is.available() <= 0) {
-                break;
-            }
-
-            Scanner sc = new Scanner(is);
-            String out = sc.findWithinHorizon(YAML_PATTERN, 0);
-
-            if (out != null && !out.isEmpty()) {
-                log.debug("Head: {}", out);
-                result = true;
-                break;
-            }
-            log.debug("Still not found. output is: {}", out);
-            is.reset();
-
-            try {
-                Yaml yml = new Yaml();
-                Iterable<Object> parsedOut = yml.loadAll(is);
-
-                if (Iterables.size(parsedOut) > 0) {
-                    result = true;
-                    break;
-                }
-            } catch (Exception ex) {
-                //do nothing
-            }
-
-            // final break 
-            break;
-        }
-
-        is.reset(); // MUST BE AT THE END
-        return result;
-    }
-}

http://git-wip-us.apache.org/repos/asf/any23/blob/9839e212/utils/src/test/java/org/apache/any23/yaml/utils/YAMLValidatorTest.java
----------------------------------------------------------------------
diff --git a/utils/src/test/java/org/apache/any23/yaml/utils/YAMLValidatorTest.java b/utils/src/test/java/org/apache/any23/yaml/utils/YAMLValidatorTest.java
deleted file mode 100644
index fddf2fb..0000000
--- a/utils/src/test/java/org/apache/any23/yaml/utils/YAMLValidatorTest.java
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright 2017 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.any23.yaml.utils;
-
-import org.apache.any23.extractor.yaml.YAMLValidator;
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.Collection;
-import org.junit.Assert;
-import org.junit.Test;
-import org.junit.runner.RunWith;
-import org.junit.runners.Parameterized;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * @author Jacek Grzebyta ( grzebyta.dev [at] gmail.com)
- */
-@RunWith(Parameterized.class)
-public class YAMLValidatorTest {
-
-    private String path;
-
-    private Boolean expected;
-
-    private Logger log = LoggerFactory.getLogger(getClass());
-
-    public YAMLValidatorTest(String path, Boolean expected) {
-        this.path = path;
-        this.expected = expected;
-    }
-
-    @Parameterized.Parameters
-    public static Collection<Object[]> getFiles() {
-        return Arrays.asList(new Object[][]{
-            {"/org/apache/any23/extractor/yaml/simple-load.yml", Boolean.TRUE},
-            {"/org/apache/any23/extractor/yaml/simple-load_no_head.yml", Boolean.TRUE},
-            {"/org/apache/any23/extractor/yaml/different-integers.yml", Boolean.TRUE},
-            {"/org/apache/any23/extractor/yaml/different-float.yml", Boolean.TRUE},
-            {"/org/apache/any23/extractor/csv/test-comma.csv", Boolean.TRUE}});
-    }
-
-    @Test
-    public void runTest()
-            throws Exception {
-        log.info("Try path: {}", path);
-        InputStream is = YAMLValidatorTest.class.getResourceAsStream(path);
-        boolean result = YAMLValidator.isYAML(is);
-        log.debug("Test resutl: {}", result);
-        Assert.assertSame(expected, result);
-
-    }
-}

http://git-wip-us.apache.org/repos/asf/any23/blob/9839e212/utils/src/test/resources/log4j.properties
----------------------------------------------------------------------
diff --git a/utils/src/test/resources/log4j.properties b/utils/src/test/resources/log4j.properties
deleted file mode 100644
index 3860396..0000000
--- a/utils/src/test/resources/log4j.properties
+++ /dev/null
@@ -1,35 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# log4j.rootCategory=INFO, R, O  
-log4j.rootCategory=INFO, O  
-      
-# Stdout  
-log4j.appender.O=org.apache.log4j.ConsoleAppender  
-      
-# File  
-#log4j.appender.R=org.apache.log4j.RollingFileAppender  
-#log4j.appender.R.File=log4j.log  
-      
-# Control the maximum log file size  
-#log4j.appender.R.MaxFileSize=100KB  
-      
-# Archive log files (one backup file here)  
-log4j.appender.R.MaxBackupIndex=1  
-      
-log4j.appender.R.layout=org.apache.log4j.PatternLayout  
-log4j.appender.O.layout=org.apache.log4j.PatternLayout  
-      
-log4j.appender.R.layout.ConversionPattern=[%d{ISO8601}]%5p%6.6r[%t]%x - %C.%M(%F:%L) - %m%n  
-log4j.appender.O.layout.ConversionPattern=[%d{ISO8601}]%5p%6.6r[%t]%x - %C.%M(%F:%L) - %m%n  


[3/6] any23 git commit: Detection MIME based on the file URI rather than on the base namespace.

Posted by an...@apache.org.
Detection MIME based on the file URI rather than on the base namespace.

- add the file path to meta
- add documentation to the unit test

Signed-off-by: Jacek Grzebyta <gr...@gmail.com>


Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/0b82fdc2
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/0b82fdc2
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/0b82fdc2

Branch: refs/heads/master
Commit: 0b82fdc205512acfd2a04d04ec547b6e36581e3b
Parents: c4c75a0
Author: Jacek Grzebyta <gr...@gmail.com>
Authored: Wed Jul 12 17:06:50 2017 +0100
Committer: Jacek Grzebyta <gr...@gmail.com>
Committed: Wed Jul 12 17:33:35 2017 +0100

----------------------------------------------------------------------
 cli/src/test/java/org/apache/any23/cli/YAMLRoverTest.java       | 5 ++++-
 .../org/apache/any23/extractor/SingleDocumentExtraction.java    | 3 ++-
 2 files changed, 6 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/any23/blob/0b82fdc2/cli/src/test/java/org/apache/any23/cli/YAMLRoverTest.java
----------------------------------------------------------------------
diff --git a/cli/src/test/java/org/apache/any23/cli/YAMLRoverTest.java b/cli/src/test/java/org/apache/any23/cli/YAMLRoverTest.java
index 17e8916..4600452 100644
--- a/cli/src/test/java/org/apache/any23/cli/YAMLRoverTest.java
+++ b/cli/src/test/java/org/apache/any23/cli/YAMLRoverTest.java
@@ -62,7 +62,10 @@ public class YAMLRoverTest extends ToolTestBase {
     }
 
     /**
-     *
+     * Asserts if file contains wanted string.
+     * 
+     * If logging level is <tt>trace</tt> than additionally displays file content.
+     * 
      * @param f
      * @param s Expected string in the file
      * @return

http://git-wip-us.apache.org/repos/asf/any23/blob/0b82fdc2/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java b/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java
index d88edf7..3498108 100644
--- a/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java
+++ b/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java
@@ -422,8 +422,9 @@ public class SingleDocumentExtraction {
             return;
         }
         ensureHasLocalCopy();
+        // detect MIME based on the real file IRI rather than based on given base namespace
         detectedMIMEType = detector.guessMIMEType(
-                java.net.URI.create(documentIRI.stringValue()).getPath(),
+                java.net.URI.create(in.getDocumentIRI()).getPath(),
                 localDocumentSource.openInputStream(),
                 MIMEType.parse(localDocumentSource.getContentType())
         );


[5/6] any23 git commit: Fix compile and test errors to merge in

Posted by an...@apache.org.
Fix compile and test errors to merge in

Signed-off-by: Peter Ansell <p_...@yahoo.com>


Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/f03e7970
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/f03e7970
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/f03e7970

Branch: refs/heads/master
Commit: f03e7970bf68bb94aeaf391c780805b9fde15121
Parents: 9839e21
Author: Peter Ansell <p_...@yahoo.com>
Authored: Thu Jul 13 15:50:09 2017 +1000
Committer: Peter Ansell <p_...@yahoo.com>
Committed: Thu Jul 13 15:50:09 2017 +1000

----------------------------------------------------------------------
 README.md                                               |  2 +-
 cli/pom.xml                                             |  2 +-
 core/pom.xml                                            |  6 +++++-
 .../apache/any23/extractor/yaml/YAMLTikaParserTest.java | 12 +++++++-----
 4 files changed, 14 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/any23/blob/f03e7970/README.md
----------------------------------------------------------------------
diff --git a/README.md b/README.md
index e895e4f..6a7f135 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@ Any23 documentation can be found on the [website](http://any23.apache.org)
 
  * [api](https://github.com/apache/any23/tree/master/api): Any23 library external API.
  * [core](https://github.com/apache/any23/tree/master/core): The library core codebase.
- * [utils](https://github.com/apache/any23/tree/master/utils): A CSV specific package
+ * [csvutils](https://github.com/apache/any23/tree/master/csvutils): A CSV specific package
  * [encoding](https://github.com/apache/any23/tree/master/encoding): Encoding detection library.
  * [mime](https://github.com/apache/any23/tree/master/mime): MIME Type detection library.
  * [nquads](https://github.com/apache/any23/tree/master/nquads): NQuads parsing and serialization library.

http://git-wip-us.apache.org/repos/asf/any23/blob/f03e7970/cli/pom.xml
----------------------------------------------------------------------
diff --git a/cli/pom.xml b/cli/pom.xml
index 47b9c06..5acedfb 100644
--- a/cli/pom.xml
+++ b/cli/pom.xml
@@ -50,7 +50,7 @@
     </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
-      <artifactId>apache-any23-utils</artifactId>
+      <artifactId>apache-any23-csvutils</artifactId>
       <version>${project.version}</version>
     </dependency>
     <dependency>

http://git-wip-us.apache.org/repos/asf/any23/blob/f03e7970/core/pom.xml
----------------------------------------------------------------------
diff --git a/core/pom.xml b/core/pom.xml
index a04571a..6216935 100644
--- a/core/pom.xml
+++ b/core/pom.xml
@@ -38,7 +38,7 @@
     </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
-      <artifactId>apache-any23-utils</artifactId>
+      <artifactId>apache-any23-csvutils</artifactId>
       <version>${project.version}</version>
     </dependency>
     <dependency>
@@ -78,6 +78,10 @@
       <groupId>com.beust</groupId>
       <artifactId>jcommander</artifactId>
     </dependency>
+    <dependency>
+      <groupId>org.yaml</groupId>
+      <artifactId>snakeyaml</artifactId>
+    </dependency>
 
     <!-- BEGIN: Tika -->
     <dependency>

http://git-wip-us.apache.org/repos/asf/any23/blob/f03e7970/core/src/test/java/org/apache/any23/extractor/yaml/YAMLTikaParserTest.java
----------------------------------------------------------------------
diff --git a/core/src/test/java/org/apache/any23/extractor/yaml/YAMLTikaParserTest.java b/core/src/test/java/org/apache/any23/extractor/yaml/YAMLTikaParserTest.java
index 680b9fa..ebfe513 100644
--- a/core/src/test/java/org/apache/any23/extractor/yaml/YAMLTikaParserTest.java
+++ b/core/src/test/java/org/apache/any23/extractor/yaml/YAMLTikaParserTest.java
@@ -15,7 +15,7 @@
  */
 package org.apache.any23.extractor.yaml;
 
-import com.mchange.util.AssertException;
+import java.io.BufferedInputStream;
 import java.io.InputStream;
 import org.apache.any23.mime.MIMEType;
 import org.apache.any23.mime.TikaMIMETypeDetector;
@@ -31,7 +31,7 @@ import org.slf4j.LoggerFactory;
  */
 public class YAMLTikaParserTest {
 
-    private static final String file1 = "/org/apache/any23/extractor/yaml/simple-load.yml";
+    private final String file1 = "/org/apache/any23/extractor/yaml/simple-load.yml";
 
     private final Logger log = LoggerFactory.getLogger(getClass());
     
@@ -46,14 +46,16 @@ public class YAMLTikaParserTest {
      * Yaml type is detected by file name only so detector returns octet type.
      * @throws Exception 
      */
-    @Test(expected = AssertionError.class)
+    @Test
     public void tikaStreamDetect()
             throws Exception {
-        InputStream is = YAMLTikaParserTest.class.getResourceAsStream(file1);
+        InputStream is = new BufferedInputStream(this.getClass().getResourceAsStream(file1));
+        Assert.assertNotNull("Could not find test file: " + file1, is);
         MIMEType type = detector.guessMIMEType(null, is, null);
 
         log.info("Type: {}", type.toString());
-        Assert.assertEquals("text/x-yaml", type.toString());
+        // Not currently doing stream detection for YAML, so it returns the default, octet-stream
+        Assert.assertEquals("application/octet-stream", type.toString());
     }
     
     @Test


[2/6] any23 git commit: Merge branch 'master' into ANY23-308

Posted by an...@apache.org.
Merge branch 'master' into ANY23-308

- resolve conflicts:
	README.md

Signed-off-by: Jacek Grzebyta <gr...@gmail.com>

Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/c4c75a0c
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/c4c75a0c
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/c4c75a0c

Branch: refs/heads/master
Commit: c4c75a0c3bb09b190985ec6796f4cca34ed3f6fb
Parents: ae036a7 5bc7e46
Author: Jacek Grzebyta <gr...@gmail.com>
Authored: Tue Jul 11 12:11:56 2017 +0100
Committer: Jacek Grzebyta <gr...@gmail.com>
Committed: Tue Jul 11 12:11:56 2017 +0100

----------------------------------------------------------------------
 README.md                                       |  24 +--
 core/pom.xml                                    |  11 ++
 .../rdf/FunctionalSyntaxExtractor.java          |  53 +++++++
 .../rdf/FunctionalSyntaxExtractorFactory.java   |  59 +++++++
 .../rdf/ManchesterSyntaxExtractor.java          |  53 +++++++
 .../rdf/ManchesterSyntaxExtractorFactory.java   |  59 +++++++
 .../any23/extractor/rdf/RDFParserFactory.java   |  41 +++++
 .../org.apache.any23.extractor.ExtractorFactory |   2 +
 .../extractor/rdf/example-functionalsyntax.ofn  |   5 +
 .../extractor/rdf/example-manchestersyntax.omn  |   5 +
 .../rdf/FunctionalSyntaxExtractorTest.java      |  80 ++++++++++
 .../rdf/ManchesterSyntaxExtractorTest.java      |  80 ++++++++++
 .../java/org/apache/any23/plugin/PluginIT.java  |   3 +-
 pom.xml                                         | 157 ++++++++++++++-----
 service/pom.xml                                 |   5 +-
 .../owl-functional/example-functionalsyntax.ofn |   5 +
 .../owl-manchester/example-manchestersyntax.omn |   5 +
 17 files changed, 590 insertions(+), 57 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/any23/blob/c4c75a0c/README.md
----------------------------------------------------------------------
diff --cc README.md
index 6c52061,6a7f135..e895e4f
--- a/README.md
+++ b/README.md
@@@ -13,18 -13,18 +13,18 @@@ Any23 documentation can be found on th
  
  # Distribution Content
  
-  * [api](https://github.com/lewismc/any23/tree/master/api): Any23 library external API.
-  * [core](https://github.com/lewismc/any23/tree/master/core): The library core codebase.
-  * [utils](https://github.com/lewismc/any23/tree/master/utils): An utilities package
-  * [encoding](https://github.com/lewismc/any23/tree/master/encoding): Encoding detection library.
-  * [mime](https://github.com/lewismc/any23/tree/master/mime): MIME Type detection library.
-  * [nquads](https://github.com/lewismc/any23/tree/master/nquads): NQuads parsing and serialization library.
-  * [plugins](https://github.com/lewismc/any23/tree/master/plugins): Library plugins codebase (read [plugins/README.txt](https://github.com/lewismc/any23/blob/master/plugins/README.md) for further details).
-  * [service](https://github.com/lewismc/any23/tree/master/service): The library HTTP service codebase.
-  * [src](https://github.com/lewismc/any23/tree/master/src): Packaging for Any23 artifacts.
-  * [test-resources](https://github.com/lewismc/any23/tree/master/test-resources): Material relating to Any23 JUnit test cases.
-  * [RELEASE-NOTES.txt](https://github.com/lewismc/any23/blob/master/RELEASE-NOTES.txt): File reporting main release notes for every version.
-  * [LICENSE.txt](https://github.com/lewismc/any23/blob/master/LICENSE.txt): Applicable project license.
+  * [api](https://github.com/apache/any23/tree/master/api): Any23 library external API.
+  * [core](https://github.com/apache/any23/tree/master/core): The library core codebase.
 - * [csvutils](https://github.com/apache/any23/tree/master/csvutils): A CSV specific package
++ * [utils](https://github.com/apache/any23/tree/master/utils): A CSV specific package
+  * [encoding](https://github.com/apache/any23/tree/master/encoding): Encoding detection library.
+  * [mime](https://github.com/apache/any23/tree/master/mime): MIME Type detection library.
+  * [nquads](https://github.com/apache/any23/tree/master/nquads): NQuads parsing and serialization library.
+  * [plugins](https://github.com/apache/any23/tree/master/plugins): Library plugins codebase (read [plugins/README.md](https://github.com/apache/any23/blob/master/plugins/README.md) for further details).
+  * [service](https://github.com/apache/any23/tree/master/service): The library HTTP service codebase.
+  * [src](https://github.com/apache/any23/tree/master/src): Packaging for Any23 artifacts.
+  * [test-resources](https://github.com/apache/any23/tree/master/test-resources): Material relating to Any23 JUnit test cases.
+  * [RELEASE-NOTES.txt](https://github.com/apache/any23/blob/master/RELEASE-NOTES.txt): File reporting main release notes for every version.
+  * [LICENSE.txt](https://github.com/apache/any23/blob/master/LICENSE.txt): Applicable project license.
   * README.md: This file.
  
  # Online Documentation

http://git-wip-us.apache.org/repos/asf/any23/blob/c4c75a0c/core/pom.xml
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/any23/blob/c4c75a0c/pom.xml
----------------------------------------------------------------------


[6/6] any23 git commit: Merge branch 'ANY23-308-pr'

Posted by an...@apache.org.
Merge branch 'ANY23-308-pr'

Signed-off-by: Peter Ansell <p_...@yahoo.com>


Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/b0baa940
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/b0baa940
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/b0baa940

Branch: refs/heads/master
Commit: b0baa94073052429054cee5c9e4a407612f6e351
Parents: 5bc7e46 f03e797
Author: Peter Ansell <p_...@yahoo.com>
Authored: Thu Jul 13 15:50:45 2017 +1000
Committer: Peter Ansell <p_...@yahoo.com>
Committed: Thu Jul 13 15:50:54 2017 +1000

----------------------------------------------------------------------
 .../org/apache/any23/cli/YAMLRoverTest.java     | 79 ++++++++++++++++++++
 cli/src/test/resources/log4j.properties         |  2 +
 core/pom.xml                                    |  1 -
 .../extractor/SingleDocumentExtraction.java     |  3 +-
 .../any23/extractor/yaml/YAMLExtractor.java     |  7 +-
 .../any23/extractor/yaml/YAMLExtractorTest.java |  1 -
 .../extractor/yaml/YAMLTikaParserTest.java      | 72 ++++++++++++++++++
 .../apache/any23/mime/TikaMIMETypeDetector.java |  3 +-
 pom.xml                                         |  5 ++
 9 files changed, 163 insertions(+), 10 deletions(-)
----------------------------------------------------------------------