You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@any23.apache.org by an...@apache.org on 2017/01/13 22:25:59 UTC
[20/25] any23 git commit: ANY23-80 : Split out CLI into its own module
ANY23-80 : Split out CLI into its own module
Signed-off-by: Peter Ansell <p_...@yahoo.com>
Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/242b130b
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/242b130b
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/242b130b
Branch: refs/heads/master
Commit: 242b130b4670507e240bf9fec1fb8f9aad647870
Parents: 82e5645
Author: Peter Ansell <p_...@yahoo.com>
Authored: Thu Jan 12 10:35:17 2017 +1100
Committer: Peter Ansell <p_...@yahoo.com>
Committed: Thu Jan 12 10:35:17 2017 +1100
----------------------------------------------------------------------
cli/pom.xml | 253 ++++++++++++++++++
.../any23/cli/ExtractorDocumentation.java | 186 +++++++++++++
.../org/apache/any23/cli/MicrodataParser.java | 99 +++++++
.../java/org/apache/any23/cli/MimeDetector.java | 101 +++++++
.../org/apache/any23/cli/PluginVerifier.java | 86 ++++++
.../main/java/org/apache/any23/cli/Rover.java | 265 +++++++++++++++++++
.../java/org/apache/any23/cli/ToolRunner.java | 263 ++++++++++++++++++
.../java/org/apache/any23/cli/VocabPrinter.java | 54 ++++
.../java/org/apache/any23/cli/package-info.java | 22 ++
.../any23/cli/ExtractorDocumentationTest.java | 57 ++++
.../apache/any23/cli/MicrodataParserTest.java | 46 ++++
.../org/apache/any23/cli/MimeDetectorTest.java | 51 ++++
.../apache/any23/cli/PluginVerifierTest.java | 38 +++
.../java/org/apache/any23/cli/RoverTest.java | 139 ++++++++++
.../org/apache/any23/cli/ToolRunnerTest.java | 65 +++++
.../java/org/apache/any23/cli/ToolTestBase.java | 91 +++++++
.../org/apache/any23/cli/VocabPrinterTest.java | 38 +++
.../any23/cli/ExtractorDocumentation.java | 186 -------------
.../org/apache/any23/cli/MicrodataParser.java | 99 -------
.../java/org/apache/any23/cli/MimeDetector.java | 101 -------
.../org/apache/any23/cli/PluginVerifier.java | 86 ------
.../main/java/org/apache/any23/cli/Rover.java | 265 -------------------
.../java/org/apache/any23/cli/ToolRunner.java | 263 ------------------
.../java/org/apache/any23/cli/VocabPrinter.java | 54 ----
.../java/org/apache/any23/cli/package-info.java | 22 --
.../any23/cli/ExtractorDocumentationTest.java | 57 ----
.../apache/any23/cli/MicrodataParserTest.java | 46 ----
.../org/apache/any23/cli/MimeDetectorTest.java | 51 ----
.../apache/any23/cli/PluginVerifierTest.java | 38 ---
.../java/org/apache/any23/cli/RoverTest.java | 139 ----------
.../org/apache/any23/cli/ToolRunnerTest.java | 65 -----
.../java/org/apache/any23/cli/ToolTestBase.java | 91 -------
.../org/apache/any23/cli/VocabPrinterTest.java | 38 ---
plugins/basic-crawler/pom.xml | 16 +-
plugins/html-scraper/pom.xml | 1 -
plugins/office-scraper/pom.xml | 1 -
pom.xml | 1 +
37 files changed, 1870 insertions(+), 1604 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/any23/blob/242b130b/cli/pom.xml
----------------------------------------------------------------------
diff --git a/cli/pom.xml b/cli/pom.xml
new file mode 100644
index 0000000..c01f3b7
--- /dev/null
+++ b/cli/pom.xml
@@ -0,0 +1,253 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.any23</groupId>
+ <artifactId>apache-any23</artifactId>
+ <version>2.0-SNAPSHOT</version>
+ <relativePath>../</relativePath>
+ </parent>
+
+ <artifactId>apache-any23-cli</artifactId>
+
+ <name>Apache Any23 :: CLI</name>
+ <description>Command line interface.</description>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>apache-any23-api</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>apache-any23-core</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>apache-any23-core</artifactId>
+ <version>${project.version}</version>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>apache-any23-csvutils</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>apache-any23-mime</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>apache-any23-encoding</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>apache-any23-test-resources</artifactId>
+ <version>${project.version}</version>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>commons-lang</groupId>
+ <artifactId>commons-lang</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>commons-httpclient</groupId>
+ <artifactId>commons-httpclient</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>commons-codec</groupId>
+ <artifactId>commons-codec</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>net.sourceforge.nekohtml</groupId>
+ <artifactId>nekohtml</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>com.beust</groupId>
+ <artifactId>jcommander</artifactId>
+ </dependency>
+
+ <!-- BEGIN: Tika -->
+ <dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-core</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parsers</artifactId>
+ </dependency>
+ <!-- END: Tika -->
+
+ <!-- BEGIN: Sesame -->
+ <dependency>
+ <groupId>org.eclipse.rdf4j</groupId>
+ <artifactId>rdf4j-model</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.eclipse.rdf4j</groupId>
+ <artifactId>rdf4j-rio-api</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.eclipse.rdf4j</groupId>
+ <artifactId>rdf4j-rio-jsonld</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.eclipse.rdf4j</groupId>
+ <artifactId>rdf4j-rio-turtle</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.eclipse.rdf4j</groupId>
+ <artifactId>rdf4j-rio-rdfxml</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.eclipse.rdf4j</groupId>
+ <artifactId>rdf4j-rio-ntriples</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.eclipse.rdf4j</groupId>
+ <artifactId>rdf4j-rio-trix</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.eclipse.rdf4j</groupId>
+ <artifactId>rdf4j-repository-sail</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.eclipse.rdf4j</groupId>
+ <artifactId>rdf4j-sail-memory</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.eclipse.rdf4j</groupId>
+ <artifactId>rdf4j-repository-api</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.semarglproject</groupId>
+ <artifactId>semargl-rdf4j</artifactId>
+ </dependency>
+ <!-- END: Sesame -->
+
+ <!-- BEGIN: Apache Commons CSV, this version is hosted in the
+ any23-repository-external repository -->
+ <dependency>
+ <groupId>org.apache.commons</groupId>
+ <artifactId>commons-csv</artifactId>
+ </dependency>
+ <!-- END: Apache Commons CSV -->
+
+ <!-- BEGIN: Test Dependencies -->
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.mockito</groupId>
+ <artifactId>mockito-core</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-log4j12</artifactId>
+ </dependency>
+ <!-- END: Test Dependencies -->
+ </dependencies>
+
+ <build>
+ <resources>
+ <resource>
+ <directory>${basedir}/src/main/resources</directory>
+ <filtering>true</filtering>
+ </resource>
+
+ <resource>
+ <directory>${basedir}/../</directory>
+ <targetPath>META-INF</targetPath>
+ <includes>
+ <include>LICENSE.txt</include>
+ <include>NOTICE.txt</include>
+ </includes>
+ </resource>
+ </resources>
+
+ <plugins>
+ <!-- generates the bin launchers -->
+ <plugin>
+ <groupId>org.codehaus.mojo</groupId>
+ <artifactId>appassembler-maven-plugin</artifactId>
+ <executions>
+ <execution>
+ <phase>package</phase>
+ <goals>
+ <goal>assemble</goal>
+ </goals>
+ </execution>
+ </executions>
+ <configuration>
+ <programs>
+ <program>
+ <mainClass>org.apache.any23.cli.ToolRunner</mainClass>
+ <name>any23</name>
+ </program>
+ </programs>
+ <configurationDirectory>conf</configurationDirectory>
+ <configurationSourceDirectory>${basedir}/src/test/resources</configurationSourceDirectory>
+ <copyConfigurationDirectory>true</copyConfigurationDirectory>
+ </configuration>
+ </plugin>
+
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-jar-plugin</artifactId>
+ <executions>
+ <execution>
+ <goals>
+ <goal>test-jar</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
+
+ <profiles>
+ <profile>
+ <id>release</id>
+ <build>
+ <resources>
+ <resource>
+ <directory>${basedir}/../</directory>
+ <targetPath>${project.build.directory}/apidocs/META-INF</targetPath>
+ <includes>
+ <include>LICENSE.txt</include>
+ <include>NOTICE.txt</include>
+ </includes>
+ </resource>
+ </resources>
+ </build>
+ </profile>
+ </profiles>
+
+</project>
http://git-wip-us.apache.org/repos/asf/any23/blob/242b130b/cli/src/main/java/org/apache/any23/cli/ExtractorDocumentation.java
----------------------------------------------------------------------
diff --git a/cli/src/main/java/org/apache/any23/cli/ExtractorDocumentation.java b/cli/src/main/java/org/apache/any23/cli/ExtractorDocumentation.java
new file mode 100644
index 0000000..9a0410b
--- /dev/null
+++ b/cli/src/main/java/org/apache/any23/cli/ExtractorDocumentation.java
@@ -0,0 +1,186 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.cli;
+
+import com.beust.jcommander.Parameter;
+import com.beust.jcommander.Parameters;
+import org.apache.any23.extractor.ExampleInputOutput;
+import org.apache.any23.extractor.ExtractionException;
+import org.apache.any23.extractor.Extractor;
+import org.apache.any23.extractor.ExtractorRegistryImpl;
+import org.apache.any23.extractor.Extractor.BlindExtractor;
+import org.apache.any23.extractor.Extractor.ContentExtractor;
+import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
+import org.apache.any23.extractor.ExtractorFactory;
+import org.apache.any23.extractor.ExtractorRegistry;
+
+import java.io.IOException;
+import java.util.LinkedList;
+import java.util.List;
+
+/**
+ * Command-line tool that prints documentation about the available
+ * metadata extractors: their names, example input and example output.
+ */
+@Parameters( commandNames = { "extractor" }, commandDescription= "Utility for obtaining documentation about metadata extractors.")
+public class ExtractorDocumentation implements Tool {
+
+    @Parameter( names = { "-l", "--list" }, description = "shows the names of all available extractors" )
+    private boolean showList;
+
+    @Parameter( names = { "-i", "--input" }, description = "shows example input for the given extractor" )
+    private boolean showInput;
+    // "--outut" is the original, misspelled option name; it stays as an alias so existing invocations keep working.
+    @Parameter( names = { "-o", "--output", "--outut" }, description = "shows example output for the given extractor" )
+    private boolean showOutput;
+
+    @Parameter( names = { "-a", "--all" }, description = "shows a report about all available extractors" )
+    private boolean showAll;
+
+    @Parameter( arity = 1, description = "Extractor name" )
+    private List<String> extractor = new LinkedList<String>();
+
+    /** Runs the first action requested on the command line, testing the flags in the order
+     *  list, input, output, all; when no flag is set, nothing is printed. */
+    public void run() throws Exception {
+        if (showList) {
+            printExtractorList(ExtractorRegistryImpl.getInstance());
+        } else if (showInput) {
+            if (extractor.isEmpty()) {
+                throw new IllegalArgumentException("Required argument for -i: extractor name");
+            }
+            printExampleInput(extractor.get(0), ExtractorRegistryImpl.getInstance());
+        } else if (showOutput) {
+            if (extractor.isEmpty()) {
+                throw new IllegalArgumentException("Required argument for -o: extractor name");
+            }
+            printExampleOutput(extractor.get(0), ExtractorRegistryImpl.getInstance());
+        } else if (showAll) {
+            printReport(ExtractorRegistryImpl.getInstance());
+        }
+    }
+
+    /**
+     * Prints an error message to the standard error stream.
+     *
+     * @param msg the error message to be printed
+     */
+    public void printError(String msg) {
+        System.err.println(msg);
+    }
+
+    /**
+     * Prints the name and label of every available extractor, one per line.
+     *
+     * @param registry the {@link org.apache.any23.extractor.ExtractorRegistry} containing all extractors
+     */
+    public void printExtractorList(ExtractorRegistry registry) {
+        for (ExtractorFactory<?> factory : registry.getExtractorGroup()) {
+            System.out.println( String.format("%25s [%15s]", factory.getExtractorName(), factory.getExtractorLabel()));
+        }
+    }
+
+    /**
+     * Prints an example of input for the provided extractor.
+     *
+     * @param extractorName the name of the extractor
+     * @param registry the {@link org.apache.any23.extractor.ExtractorRegistry} containing all extractors
+     * @throws IOException if the example cannot be read (propagated from {@link ExampleInputOutput})
+     * @throws IllegalArgumentException if the name is not registered or the extractor has no example input
+     */
+    public void printExampleInput(String extractorName, ExtractorRegistry registry) throws IOException {
+        ExtractorFactory<?> factory = getFactory(registry, extractorName);
+        ExampleInputOutput example = new ExampleInputOutput(factory);
+        String input = example.getExampleInput();
+        if (input == null) {
+            throw new IllegalArgumentException("Extractor " + extractorName + " provides no example input");
+        }
+        System.out.println(input);
+    }
+
+    /**
+     * Prints an output example for the given extractor.
+     *
+     * @param extractorName the extractor name
+     * @param registry the {@link org.apache.any23.extractor.ExtractorRegistry} containing all extractors
+     * @throws IOException if the example cannot be read (propagated from {@link ExampleInputOutput})
+     * @throws ExtractionException if there is an error during extraction
+     * @throws IllegalArgumentException if the name is not registered or the extractor has no example output
+     */
+    public void printExampleOutput(String extractorName, ExtractorRegistry registry) throws IOException, ExtractionException {
+        ExtractorFactory<?> factory = getFactory(registry, extractorName);
+        ExampleInputOutput example = new ExampleInputOutput(factory);
+        String output = example.getExampleOutput();
+        if (output == null) {
+            throw new IllegalArgumentException("Extractor " + extractorName + " provides no example output");
+        }
+        System.out.println(output);
+    }
+
+    /**
+     * Prints a complete report on all the available extractors.
+     * For each extractor the name, type and, when available, its example input and output are printed.
+     *
+     * @param registry the {@link org.apache.any23.extractor.ExtractorRegistry} containing all extractors
+     * @throws IOException if an example cannot be read (propagated from {@link ExampleInputOutput})
+     * @throws ExtractionException if there is an error during extraction
+     */
+    public void printReport(ExtractorRegistry registry) throws IOException, ExtractionException {
+        for (String extractorName : registry.getAllNames()) {
+            ExtractorFactory<?> factory = registry.getFactory(extractorName);
+            ExampleInputOutput example = new ExampleInputOutput(factory);
+            System.out.println("Extractor: " + extractorName);
+            System.out.println("\ttype: " + getType(factory));
+            System.out.println();
+            final String exampleInput = example.getExampleInput();
+            if (exampleInput == null) {
+                System.out.println("(No Example Available)");
+            } else {
+                System.out.println("-------- Example Input --------");
+                System.out.println(exampleInput);
+                System.out.println("-------- Example Output --------");
+                String output = example.getExampleOutput();
+                System.out.println(output == null || output.trim().length() == 0 ? "(No Output Generated)" : output);
+            }
+            System.out.println("================================");
+            System.out.println();
+        }
+    }
+
+    private ExtractorFactory<?> getFactory(ExtractorRegistry registry, String name) {
+        if (!registry.isRegisteredName(name)) { // reject unknown names with a readable message
+            throw new IllegalArgumentException("Unknown extractor name: " + name);
+        }
+        return registry.getFactory(name);
+    }
+
+    private String getType(ExtractorFactory<?> factory) {
+        Extractor<?> instance = factory.createExtractor(); // created only to inspect which extractor interface it implements
+        if (instance instanceof BlindExtractor) {
+            return BlindExtractor.class.getSimpleName();
+        }
+        if (instance instanceof TagSoupDOMExtractor) {
+            return TagSoupDOMExtractor.class.getSimpleName();
+        }
+        if (instance instanceof ContentExtractor) {
+            return ContentExtractor.class.getSimpleName();
+        }
+        return "?";
+    }
+
+}
http://git-wip-us.apache.org/repos/asf/any23/blob/242b130b/cli/src/main/java/org/apache/any23/cli/MicrodataParser.java
----------------------------------------------------------------------
diff --git a/cli/src/main/java/org/apache/any23/cli/MicrodataParser.java b/cli/src/main/java/org/apache/any23/cli/MicrodataParser.java
new file mode 100644
index 0000000..19c59bf
--- /dev/null
+++ b/cli/src/main/java/org/apache/any23/cli/MicrodataParser.java
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.cli;
+
+import com.beust.jcommander.IStringConverter;
+import com.beust.jcommander.Parameter;
+import com.beust.jcommander.ParameterException;
+import com.beust.jcommander.Parameters;
+import org.apache.any23.extractor.html.TagSoupParser;
+import org.apache.any23.http.DefaultHTTPClient;
+import org.apache.any23.source.DocumentSource;
+import org.apache.any23.source.FileDocumentSource;
+import org.apache.any23.source.HTTPDocumentSource;
+import org.apache.any23.util.StreamUtils;
+
+import java.io.File;
+import java.io.InputStream;
+import java.net.URISyntaxException;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Command line <i>Microdata</i> parser, accepting both files and URLs and
+ * returning a <i>JSON</i> representation of the extracted metadata as described at
+ * <a href="http://www.w3.org/TR/microdata/#json">Microdata JSON Specification</a>.
+ *
+ * @author Michele Mostarda (mostarda@fbk.eu)
+ */
+@Parameters( commandNames = { "microdata" }, commandDescription = "Commandline Tool for extracting Microdata from file/HTTP source.")
+public class MicrodataParser implements Tool {
+
+    private static final Pattern HTTP_DOCUMENT_PATTERN = Pattern.compile("^https?://.*");
+
+    private static final Pattern FILE_DOCUMENT_PATTERN = Pattern.compile("^file:(.*)$");
+
+    @Parameter(
+        arity = 1,
+        description = "Input document URL, {http://path/to/resource.html|file:/path/to/localFile.html}",
+        converter = MicrodataParserDocumentSourceConverter.class
+    )
+    private List<DocumentSource> document = new LinkedList<DocumentSource>();
+
+    /**
+     * Parses the first input document and writes its Microdata as JSON to the standard output.
+     * @throws Exception if no document was given, or if opening or parsing it fails
+     */
+    public void run() throws Exception {
+        if (document.isEmpty()) {
+            throw new IllegalArgumentException("No input document URL specified");
+        }
+        final DocumentSource documentSource = document.get(0);
+        final InputStream input = documentSource.openInputStream();
+        try {
+            final TagSoupParser tagSoupParser = new TagSoupParser(input, documentSource.getDocumentIRI());
+            org.apache.any23.extractor.microdata.MicrodataParser.getMicrodataAsJSON(tagSoupParser.getDOM(), System.out);
+        } finally {
+            StreamUtils.closeGracefully(input);
+        }
+    }
+
+    /** Turns an http(s) URL or a file: reference given on the command line into a {@link DocumentSource}. */
+    public static final class MicrodataParserDocumentSourceConverter implements IStringConverter<DocumentSource> {
+        @Override
+        public DocumentSource convert( String value ) {
+            if (HTTP_DOCUMENT_PATTERN.matcher(value).find()) {
+                try {
+                    return new HTTPDocumentSource(DefaultHTTPClient.createInitializedHTTPClient(), value);
+                } catch ( URISyntaxException e ) {
+                    final ParameterException invalid = new ParameterException("Invalid source IRI: '" + value + "'");
+                    invalid.initCause(e); // preserve the underlying parse error for diagnostics
+                    throw invalid;
+                }
+            }
+            final Matcher fileMatcher = FILE_DOCUMENT_PATTERN.matcher(value);
+            if (fileMatcher.find()) {
+                return new FileDocumentSource( new File( fileMatcher.group(1) ) );
+            }
+            throw new ParameterException("Invalid source protocol: '" + value + "'");
+        }
+    }
+
+}
http://git-wip-us.apache.org/repos/asf/any23/blob/242b130b/cli/src/main/java/org/apache/any23/cli/MimeDetector.java
----------------------------------------------------------------------
diff --git a/cli/src/main/java/org/apache/any23/cli/MimeDetector.java b/cli/src/main/java/org/apache/any23/cli/MimeDetector.java
new file mode 100644
index 0000000..c9072cb
--- /dev/null
+++ b/cli/src/main/java/org/apache/any23/cli/MimeDetector.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.cli;
+
+import com.beust.jcommander.IStringConverter;
+import com.beust.jcommander.Parameter;
+import com.beust.jcommander.Parameters;
+import org.apache.any23.http.DefaultHTTPClient;
+import org.apache.any23.http.DefaultHTTPClientConfiguration;
+import org.apache.any23.http.HTTPClient;
+import org.apache.any23.mime.MIMEType;
+import org.apache.any23.mime.MIMETypeDetector;
+import org.apache.any23.mime.TikaMIMETypeDetector;
+import org.apache.any23.source.DocumentSource;
+import org.apache.any23.source.FileDocumentSource;
+import org.apache.any23.source.HTTPDocumentSource;
+import org.apache.any23.source.StringDocumentSource;
+
+import java.io.File;
+import java.net.URISyntaxException;
+import java.util.LinkedList;
+import java.util.List;
+
+/**
+ * Commandline tool to detect <b>MIME Type</b>s from
+ * file, HTTP and direct input sources.
+ * The implementation of this tool is based on {@link org.apache.any23.mime.TikaMIMETypeDetector}.
+ *
+ * @author Michele Mostarda (mostarda@fbk.eu)
+ */
+@Parameters(commandNames = { "mimes" }, commandDescription = "MIME Type Detector Tool.")
+public class MimeDetector implements Tool {
+
+    public static final String FILE_DOCUMENT_PREFIX = "file://";
+
+    public static final String INLINE_DOCUMENT_PREFIX = "inline://";
+
+    public static final String URL_DOCUMENT_RE = "^https?://.*";
+
+    @Parameter(
+        arity = 1,
+        description = "Input document URL, {http://path/to/resource.html|file:///path/to/local.file|inline:// some inline content}",
+        converter = MimeDetectorDocumentSourceConverter.class
+    )
+    private List<DocumentSource> document = new LinkedList<DocumentSource>();
+
+    /** Detects the MIME type of the first input document and prints it to the standard output. */
+    public void run() throws Exception {
+        if (document.isEmpty()) {
+            throw new IllegalArgumentException("No input document URL specified");
+        }
+        final DocumentSource documentSource = document.get(0);
+        final MIMETypeDetector detector = new TikaMIMETypeDetector();
+        final java.io.InputStream input = documentSource.openInputStream(); // fully qualified: not imported in this file
+        try {
+            System.out.println(detector.guessMIMEType(
+                documentSource.getDocumentIRI(), input, MIMEType.parse(documentSource.getContentType())));
+        } finally {
+            input.close(); // always release the document stream once detection is done
+        }
+    }
+
+    /** Converts a file://, inline:// or http(s):// argument into the matching {@link DocumentSource}. */
+    public static final class MimeDetectorDocumentSourceConverter implements IStringConverter<DocumentSource> {
+        @Override
+        public DocumentSource convert( String document ) {
+            if (document.startsWith(FILE_DOCUMENT_PREFIX)) {
+                return new FileDocumentSource( new File( document.substring(FILE_DOCUMENT_PREFIX.length()) ) );
+            }
+            if (document.startsWith(INLINE_DOCUMENT_PREFIX)) {
+                return new StringDocumentSource( document.substring(INLINE_DOCUMENT_PREFIX.length()), "" );
+            }
+            if (document.matches(URL_DOCUMENT_RE)) {
+                final HTTPClient client = new DefaultHTTPClient();
+                client.init( DefaultHTTPClientConfiguration.singleton() );
+                try {
+                    return new HTTPDocumentSource(client, document);
+                } catch ( URISyntaxException e ) {
+                    throw new IllegalArgumentException("Invalid source IRI: '" + document + "'", e);
+                }
+            }
+            throw new IllegalArgumentException("Unsupported protocol for document " + document);
+        }
+    }
+
+}
http://git-wip-us.apache.org/repos/asf/any23/blob/242b130b/cli/src/main/java/org/apache/any23/cli/PluginVerifier.java
----------------------------------------------------------------------
diff --git a/cli/src/main/java/org/apache/any23/cli/PluginVerifier.java b/cli/src/main/java/org/apache/any23/cli/PluginVerifier.java
new file mode 100644
index 0000000..a747b49
--- /dev/null
+++ b/cli/src/main/java/org/apache/any23/cli/PluginVerifier.java
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.cli;
+
+import com.beust.jcommander.Parameter;
+import com.beust.jcommander.Parameters;
+import com.beust.jcommander.converters.FileConverter;
+import org.apache.any23.extractor.ExtractorFactory;
+import org.apache.any23.mime.MIMEType;
+import org.apache.any23.plugin.Any23PluginManager;
+import org.apache.any23.plugin.Author;
+import java.io.File;
+import java.io.PrintStream;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+
+/**
+ * Command-line utility that hands a plugin directory to the <b>Any23</b> plugin
+ * manager and prints basic information about each extractor factory it reports.
+ *
+ * @author Michele Mostarda (mostarda@fbk.eu)
+ */
+@Parameters(commandNames = { "verify" }, commandDescription = "Utility for plugin management verification.")
+public class PluginVerifier implements Tool {
+
+    private Any23PluginManager pluginManager = Any23PluginManager.getInstance();
+
+    @Parameter(
+        description = "plugins-dir",
+        converter = FileConverter.class
+    )
+    private List<File> pluginsDirs = new LinkedList<File>();
+
+    /**
+     * Validates the first directory argument, passes it to the plugin manager and prints a record per reported factory.
+     */
+    public void run() throws Exception {
+        if (pluginsDirs.isEmpty()) {
+            throw new IllegalArgumentException("No plugin directory specified.");
+        }
+        final File dir = pluginsDirs.get(0);
+        if (!dir.isDirectory()) {
+            throw new IllegalArgumentException("<plugins-dir> must be a valid dir.");
+        }
+        pluginManager.loadJARDir(dir);
+        for (Iterator<ExtractorFactory> factories = pluginManager.getExtractors(); factories.hasNext(); ) {
+            printPluginData(factories.next(), System.out);
+            System.out.println("------------------------------------------------------------------------");
+        }
+    }
+
+    /** Renders the given MIME types as one string, each entry followed by a single space. */
+    private String getMimeTypesStr(Collection<MIMEType> mimeTypes) {
+        final StringBuilder joined = new StringBuilder();
+        for (MIMEType type : mimeTypes) {
+            joined.append(type).append(" ");
+        }
+        return joined.toString();
+    }
+
+    /** Writes the author (from the {@link Author} annotation, if any), class and MIME types of one factory. */
+    private void printPluginData(ExtractorFactory extractorFactory, PrintStream ps) {
+        final Author author = extractorFactory.getClass().getAnnotation(Author.class);
+        ps.printf("Plugin author : %s\n", author == null ? "<unknown>" : author.name());
+        ps.printf("Plugin factory : %s\n", extractorFactory.getClass());
+        ps.printf("Plugin mime-types: %s\n", getMimeTypesStr( extractorFactory.getSupportedMIMETypes() ));
+    }
+
+}
http://git-wip-us.apache.org/repos/asf/any23/blob/242b130b/cli/src/main/java/org/apache/any23/cli/Rover.java
----------------------------------------------------------------------
diff --git a/cli/src/main/java/org/apache/any23/cli/Rover.java b/cli/src/main/java/org/apache/any23/cli/Rover.java
new file mode 100644
index 0000000..26a8663
--- /dev/null
+++ b/cli/src/main/java/org/apache/any23/cli/Rover.java
@@ -0,0 +1,265 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.cli;
+
+import com.beust.jcommander.IStringConverter;
+import com.beust.jcommander.Parameter;
+import com.beust.jcommander.ParameterException;
+import com.beust.jcommander.Parameters;
+import com.beust.jcommander.converters.FileConverter;
+import org.apache.any23.Any23;
+import org.apache.any23.configuration.Configuration;
+import org.apache.any23.configuration.DefaultConfiguration;
+import org.apache.any23.extractor.ExtractionParameters;
+import org.apache.any23.extractor.ExtractionParameters.ValidationMode;
+import org.apache.any23.filter.IgnoreAccidentalRDFa;
+import org.apache.any23.filter.IgnoreTitlesOfEmptyDocuments;
+import org.apache.any23.source.DocumentSource;
+import org.apache.any23.writer.BenchmarkTripleHandler;
+import org.apache.any23.writer.LoggingTripleHandler;
+import org.apache.any23.writer.ReportingTripleHandler;
+import org.apache.any23.writer.TripleHandler;
+import org.apache.any23.writer.TripleHandlerException;
+import org.apache.any23.writer.WriterFactoryRegistry;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.PrintStream;
+import java.io.PrintWriter;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.LinkedList;
+import java.util.List;
+
+import static java.lang.String.format;
+
+/**
+ * A default rover implementation. Fetches a URL, using a hint
+ * as to what format to expect, then tries to convert it to RDF.
+ *
+ * @author Michele Mostarda (mostarda@fbk.eu)
+ * @author Richard Cyganiak (richard@cyganiak.de)
+ * @author Gabriele Renzi
+ */
+@Parameters(commandNames = { "rover" }, commandDescription = "Any23 Command Line Tool.")
+public class Rover implements Tool {
+
+ private static final List<String> FORMATS = WriterFactoryRegistry.getInstance().getIdentifiers();
+
+ private static final int DEFAULT_FORMAT_INDEX = 0;
+
+ private static final Logger logger = LoggerFactory.getLogger(Rover.class);
+
+ @Parameter(
+ names = { "-o", "--output" },
+ description = "Specify Output file (defaults to standard output)",
+ converter = PrintStreamConverter.class
+ )
+ private PrintStream outputStream = System.out;
+
+ @Parameter(description = "input IRIs {<url>|<file>}+", converter = ArgumentToIRIConverter.class)
+ protected List<String> inputIRIs = new LinkedList<String>();
+
+ @Parameter(names = { "-e", "--extractors" }, description = "a comma-separated list of extractors, e.g. rdf-xml,rdf-turtle")
+ private List<String> extractors = new LinkedList<String>();
+
+ @Parameter(names = { "-f", "--format" }, description = "the output format")
+ private String format = FORMATS.get(DEFAULT_FORMAT_INDEX);
+
+ @Parameter(
+ names = { "-l", "--log" },
+ description = "Produce log within a file.",
+ converter = FileConverter.class
+ )
+ private File logFile = null;
+
+ @Parameter(names = { "-s", "--stats" }, description = "Print out extraction statistics.")
+ private boolean statistics;
+
+ @Parameter(names = { "-t", "--notrivial" }, description = "Filter trivial statements (e.g. CSS related ones).")
+ private boolean noTrivial;
+
+ @Parameter(names = { "-p", "--pedantic" }, description = "Validate and fixes HTML content detecting commons issues.")
+ private boolean pedantic;
+
+ @Parameter(names = { "-n", "--nesting" }, description = "Disable production of nesting triples.")
+ private boolean nestingDisabled;
+
+ @Parameter(names = { "-d", "--defaultns" }, description = "Override the default namespace used to produce statements.")
+ private String defaultns;
+
+ // non parameters
+
+ private TripleHandler tripleHandler;
+
+ private ReportingTripleHandler reportingTripleHandler;
+
+ private BenchmarkTripleHandler benchmarkTripleHandler;
+
+ private Any23 any23;
+
+ private ExtractionParameters extractionParameters;
+
+ protected void configure() {
+ try {
+ tripleHandler = WriterFactoryRegistry.getInstance().getWriterInstanceByIdentifier(format, outputStream);
+ } catch (Exception e) {
+ throw new NullPointerException(
+ format("Invalid output format '%s', admitted values: %s",
+ format,
+ FORMATS
+ )
+ );
+ }
+
+ if (logFile != null) {
+ try {
+ tripleHandler = new LoggingTripleHandler(tripleHandler, new PrintWriter(logFile));
+ } catch (FileNotFoundException fnfe) {
+ throw new IllegalArgumentException( format("Can not write to log file [%s]", logFile), fnfe );
+ }
+ }
+
+ if (statistics) {
+ benchmarkTripleHandler = new BenchmarkTripleHandler(tripleHandler);
+ tripleHandler = benchmarkTripleHandler;
+ }
+
+ if (noTrivial) {
+ tripleHandler = new IgnoreAccidentalRDFa(new IgnoreTitlesOfEmptyDocuments(tripleHandler),
+ true // suppress stylesheet triples.
+ );
+ }
+
+ reportingTripleHandler = new ReportingTripleHandler(tripleHandler);
+
+ final Configuration configuration = DefaultConfiguration.singleton();
+ extractionParameters =
+ pedantic
+ ?
+ new ExtractionParameters(configuration, ValidationMode.ValidateAndFix, nestingDisabled)
+ :
+ new ExtractionParameters(configuration, ValidationMode.None , nestingDisabled);
+ if (defaultns != null) {
+ extractionParameters.setProperty(ExtractionParameters.EXTRACTION_CONTEXT_IRI_PROPERTY,
+ defaultns);
+ }
+
+ any23 = (extractors.isEmpty()) ? new Any23()
+ : new Any23(extractors.toArray(new String[extractors.size()]));
+ any23.setHTTPUserAgent(Any23.DEFAULT_HTTP_CLIENT_USER_AGENT + "/" + Any23.VERSION);
+ }
+
+    /**
+     * Collects the textual reports of the benchmark and reporting handlers,
+     * one per line, skipping the handlers that have not been created.
+     *
+     * @return the concatenated reports, possibly empty.
+     */
+    protected String printReports() {
+        final StringBuilder reports = new StringBuilder();
+        if (benchmarkTripleHandler != null) {
+            reports.append( benchmarkTripleHandler.report() );
+            reports.append('\n');
+        }
+        if (reportingTripleHandler != null) {
+            reports.append( reportingTripleHandler.printReport() );
+            reports.append('\n');
+        }
+        return reports.toString();
+    }
+
+ protected void performExtraction(DocumentSource documentSource) throws Exception {
+ if (!any23.extract(extractionParameters, documentSource, tripleHandler).hasMatchingExtractors()) {
+ throw new IllegalStateException(format("No suitable extractors found for source %s", documentSource));
+ }
+ }
+
+    /**
+     * Closes the triple handler chain and then the output stream.
+     *
+     * The output stream is released in a {@code finally} block so that it is
+     * closed even when closing the triple handler fails. Standard output is
+     * never closed.
+     *
+     * @throws RuntimeException if the triple handler cannot be closed.
+     */
+    protected void close() {
+        try {
+            if (tripleHandler != null) {
+                try {
+                    tripleHandler.close();
+                } catch (TripleHandlerException the) {
+                    throw new RuntimeException("Error while closing TripleHandler", the);
+                }
+            }
+        } finally {
+            if (outputStream != null && outputStream != System.out) { // TODO: low - find better solution to avoid closing system out.
+                outputStream.close();
+            }
+        }
+    }
+
+    /**
+     * Tool entry point: configures the rover, extracts every input IRI in
+     * the order given and finally closes the handlers and the output.
+     *
+     * When statistics are enabled the benchmark report is printed on standard
+     * error; the extractor names, the triple count and the elapsed time
+     * exposed by the reporting handler are logged at info level.
+     *
+     * @throws IllegalArgumentException if no input IRI has been specified.
+     * @throws Exception any failure raised while configuring or extracting.
+     */
+    public void run() throws Exception {
+        if (inputIRIs.isEmpty()) {
+            throw new IllegalArgumentException("Expected at least 1 argument.");
+        }
+
+        configure();
+
+        // perform conversions
+
+        try {
+            final long startedAt = System.currentTimeMillis();
+            for (final String iri : inputIRIs) {
+                performExtraction( any23.createDocumentSource(iri) );
+            }
+            final long elapsed = System.currentTimeMillis() - startedAt;
+
+            if (benchmarkTripleHandler != null) {
+                System.err.println(benchmarkTripleHandler.report());
+            }
+
+            logger.info("Extractors used: " + reportingTripleHandler.getExtractorNames());
+            logger.info(reportingTripleHandler.getTotalTriples() + " triples, " + elapsed + "ms");
+        } finally {
+            // always release handlers and output, even if an extraction failed
+            close();
+        }
+    }
+
+    /**
+     * Converts a command line argument into an IRI string: arguments starting
+     * with {@code http:} or {@code https:} (case insensitive) are validated as
+     * URLs, anything else is treated as the path of an existing regular file.
+     */
+    public static final class ArgumentToIRIConverter implements IStringConverter<String> {
+
+        @Override
+        public String convert(String uri) {
+            final String argument = uri.trim();
+            final String lowerCased = argument.toLowerCase();
+            final boolean remote = lowerCased.startsWith("http:") || lowerCased.startsWith("https:");
+            if (!remote) {
+                return toFileIRI(argument);
+            }
+
+            try {
+                return new URL(argument).toString();
+            } catch (MalformedURLException murle) {
+                throw new ParameterException(format("Invalid IRI: '%s': %s", argument, murle.getMessage()));
+            }
+        }
+
+        /**
+         * Resolves a local path to its file IRI, rejecting missing files and directories.
+         */
+        private static String toFileIRI(String path) {
+            final File file = new File(path);
+            if (!file.exists()) {
+                throw new ParameterException(format("No such file: [%s]", file.getAbsolutePath()));
+            }
+            if (file.isDirectory()) {
+                throw new ParameterException(format("Found a directory: [%s]", file.getAbsolutePath()));
+            }
+            return file.toURI().toString();
+        }
+
+    }
+
+    /**
+     * Converts the value of the output option into a {@link PrintStream}
+     * writing to the file with that path.
+     */
+    public static final class PrintStreamConverter implements IStringConverter<PrintStream> {
+
+        @Override
+        public PrintStream convert( String value ) {
+            final File target = new File(value);
+            final PrintStream stream;
+            try {
+                stream = new PrintStream(target);
+            } catch (FileNotFoundException fnfe) {
+                throw new ParameterException(format("Cannot open file '%s': %s", target, fnfe.getMessage()));
+            }
+            return stream;
+        }
+
+    }
+
+}
http://git-wip-us.apache.org/repos/asf/any23/blob/242b130b/cli/src/main/java/org/apache/any23/cli/ToolRunner.java
----------------------------------------------------------------------
diff --git a/cli/src/main/java/org/apache/any23/cli/ToolRunner.java b/cli/src/main/java/org/apache/any23/cli/ToolRunner.java
new file mode 100644
index 0000000..90daeb3
--- /dev/null
+++ b/cli/src/main/java/org/apache/any23/cli/ToolRunner.java
@@ -0,0 +1,263 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.cli;
+
+import com.beust.jcommander.JCommander;
+import com.beust.jcommander.Parameter;
+import com.beust.jcommander.converters.FileConverter;
+import org.apache.any23.Any23;
+import org.apache.any23.plugin.Any23PluginManager;
+import org.apache.any23.util.LogUtils;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PrintStream;
+import java.util.Date;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Properties;
+
+import static java.lang.System.currentTimeMillis;
+import static java.lang.System.exit;
+
+/**
+ * This is the main class, responsible for providing a uniform command-line
+ * access point to all the other tools, such as {@link Rover}.
+ *
+ * @see ExtractorDocumentation
+ * @see Rover
+ */
+public final class ToolRunner {
+
+ public static final File DEFAULT_PLUGIN_DIR = new File(new File(System.getProperty("user.home")), ".any23/plugins");
+
+ private static final PrintStream infoStream = System.err;
+
+ @Parameter( names = { "-h", "--help" }, description = "Display help information." )
+ private boolean printHelp;
+
+ @Parameter( names = { "-v", "--version" }, description = "Display version information." )
+ private boolean showVersion;
+
+ @Parameter( names = { "-X", "--verbose" }, description = "Produce execution verbose output." )
+ private boolean verbose;
+
+ @Parameter(
+ names = { "--plugins-dir" },
+ description = "The Any23 plugins directory.",
+ converter = FileConverter.class
+ )
+ private File pluginsDir = DEFAULT_PLUGIN_DIR;
+
+    /**
+     * Command line entry point: runs the requested tool and terminates the
+     * JVM with the status returned by {@link #execute(String...)}.
+     */
+    public static void main( String[] args ) throws Exception {
+        final int status = new ToolRunner().execute( args );
+        exit( status );
+    }
+
+    /**
+     * Parses the command line, runs the selected tool and prints an
+     * execution summary (outcome, time, memory) on the info stream.
+     *
+     * @param args the raw command line arguments.
+     * @return {@code 0} on success, including when only help or version
+     *         information is requested; {@code 1} when the plugins directory
+     *         option is invalid, no command is given or the tool fails.
+     * @throws Exception failures propagated from tool discovery or from the
+     *         argument parsing.
+     */
+    public int execute(String...args) throws Exception {
+        final JCommander commander = new JCommander(this);
+        commander.setProgramName(System.getProperty("app.name"));
+
+        // TODO (low) : this dirty solution has been introduced because it is not possible to
+        //              parse arguments ( commander.parse() ) twice.
+        final File explicitPluginsDir;
+        try {
+            explicitPluginsDir = parsePluginDirOption(args);
+        } catch (Exception e) {
+            System.err.println(e.getMessage());
+            return 1;
+        }
+        if (explicitPluginsDir != null) {
+            pluginsDir = explicitPluginsDir;
+        }
+
+        // add all plugins first
+        for (Iterator<Tool> tools = getToolsInClasspath(); tools.hasNext(); ) {
+            commander.addCommand(tools.next());
+        }
+
+        commander.parse(args);
+
+        final Map<String, JCommander> commands = commander.getCommands();
+        final String parsedCommand = commander.getParsedCommand();
+
+        if (printHelp) {
+            commander.usage();
+            return 0;
+        }
+
+        if (showVersion) {
+            printVersionInfo();
+            return 0;
+        }
+
+        if (parsedCommand == null) {
+            infoStream.println("A command must be specified.");
+            commander.usage();
+            return 1;
+        }
+
+        if (verbose) {
+            LogUtils.setVerboseLogging();
+        } else {
+            LogUtils.setDefaultLogging();
+        }
+
+        final long start = currentTimeMillis();
+
+        // execute the parsed command
+        infoStream.println();
+        infoStream.println( "------------------------------------------------------------------------" );
+        infoStream.printf( "Apache Any23 :: %s%n", parsedCommand );
+        infoStream.println( "------------------------------------------------------------------------" );
+        infoStream.println();
+
+        // Every Throwable is captured here, so the summary below is always printed.
+        Throwable failure = null;
+        try {
+            final Object selectedTool = commands.get( parsedCommand ).getObjects().get( 0 );
+            Tool.class.cast( selectedTool ).run();
+        } catch (Throwable t) {
+            failure = t;
+        }
+        final boolean failed = failure != null;
+
+        infoStream.println();
+        infoStream.println( "------------------------------------------------------------------------" );
+        infoStream.printf( "Apache Any23 %s%n", failed ? "FAILURE" : "SUCCESS" );
+
+        if (failed) {
+            infoStream.println();
+
+            if (verbose) {
+                System.err.println( "Execution terminated with errors:" );
+                failure.printStackTrace(infoStream);
+            } else {
+                infoStream.printf( "Execution terminated with errors: %s%n", failure.getMessage() );
+            }
+
+            infoStream.println();
+        }
+
+        infoStream.printf( "Total time: %ss%n", ( ( currentTimeMillis() - start ) / 1000 ) );
+        infoStream.printf( "Finished at: %s%n", new Date() );
+
+        final Runtime runtime = Runtime.getRuntime();
+        final int megaUnit = 1024 * 1024;
+        final long usedMegas = ( runtime.totalMemory() - runtime.freeMemory() ) / megaUnit;
+        final long totalMegas = runtime.totalMemory() / megaUnit;
+        infoStream.printf( "Final Memory: %sM/%sM%n", usedMegas, totalMegas );
+
+        infoStream.println( "------------------------------------------------------------------------" );
+
+        return failed ? 1 : 0;
+    }
+
+    /**
+     * Returns the tools known to the plugin manager, after loading the JARs
+     * of the plugins directory when that directory exists.
+     *
+     * @throws IOException propagated from the plugin manager.
+     */
+    Iterator<Tool> getToolsInClasspath() throws IOException {
+        final Any23PluginManager manager = Any23PluginManager.getInstance();
+        final boolean usablePluginsDir = pluginsDir.exists() && pluginsDir.isDirectory();
+        if (usablePluginsDir) {
+            manager.loadJARDir(pluginsDir);
+        }
+        return manager.getTools();
+    }
+
+    /**
+     * Prints the Any23 version together with Java, locale and operating
+     * system details on the info stream.
+     *
+     * The version comes from {@link Any23#VERSION}. The former loading of
+     * the Maven {@code pom.properties} resource has been removed: its content
+     * was never read, so it only cost I/O and hid a swallowed exception.
+     */
+    private static void printVersionInfo() {
+        infoStream.printf( "Apache Any23 %s%n", Any23.VERSION );
+        infoStream.printf( "Java version: %s, vendor: %s%n",
+                           System.getProperty( "java.version" ),
+                           System.getProperty( "java.vendor" ) );
+        infoStream.printf( "Java home: %s%n", System.getProperty( "java.home" ) );
+        infoStream.printf( "Default locale: %s_%s, platform encoding: %s%n",
+                           System.getProperty( "user.language" ),
+                           System.getProperty( "user.country" ),
+                           System.getProperty( "sun.jnu.encoding" ) );
+        infoStream.printf( "OS name: \"%s\", version: \"%s\", arch: \"%s\", family: \"%s\"%n",
+                           System.getProperty( "os.name" ),
+                           System.getProperty( "os.version" ),
+                           System.getProperty( "os.arch" ),
+                           getOsFamily() );
+    }
+
+    /**
+     * Derives the operating system family from the {@code os.name} and
+     * {@code path.separator} system properties.
+     *
+     * The OS name is lower-cased with {@link java.util.Locale#ROOT}: with the
+     * default locale the result is locale dependent (under a Turkish locale
+     * an upper-case "I" becomes a dotless "i"), which would make the keyword
+     * checks below fail for names such as "WINDOWS".
+     *
+     * @return one of "windows", "os/2", "z/os", "os/400", "dos", "mac",
+     *         "unix", "tandem", "openvms" or "undefined".
+     */
+    private static final String getOsFamily() {
+        String osName = System.getProperty( "os.name" ).toLowerCase( java.util.Locale.ROOT );
+        String pathSep = System.getProperty( "path.separator" );
+
+        if (osName.contains("windows")) {
+            return "windows";
+        } else if (osName.contains("os/2")) {
+            return "os/2";
+        } else if (osName.contains("z/os") || osName.contains("os/390")) {
+            return "z/os";
+        } else if (osName.contains("os/400")) {
+            return "os/400";
+        } else if (pathSep.equals( ";" )) {
+            // remaining systems using ';' as path separator
+            return "dos";
+        } else if (osName.contains("mac")) {
+            if (osName.endsWith("x")) {
+                return "mac"; // MACOSX
+            }
+            return "unix";
+        } else if (osName.contains("nonstop_kernel")) {
+            return "tandem";
+        } else if (osName.contains("openvms")) {
+            return "openvms";
+        } else if (pathSep.equals(":")) {
+            return "unix";
+        }
+
+        return "undefined";
+    }
+
+    /**
+     * Looks for the {@code --plugins-dir} option in the raw arguments and
+     * returns the directory that follows it. When the option is repeated the
+     * last occurrence wins.
+     *
+     * @param args the raw command line arguments.
+     * @return the specified directory, or {@code null} if the option is absent.
+     * @throws IllegalArgumentException if the option has no value or the
+     *         value is not a directory.
+     */
+    private static File parsePluginDirOption(String[] args) {
+        // scan backwards: the first hit is the last occurrence of the option
+        int optionIndex = -1;
+        for (int i = args.length - 1; i >= 0; i--) {
+            if ("--plugins-dir".equals(args[i])) {
+                optionIndex = i;
+                break;
+            }
+        }
+        if (optionIndex < 0) {
+            return null;
+        }
+
+        final int valueIndex = optionIndex + 1;
+        if (valueIndex == args.length) {
+            throw new IllegalArgumentException("Missing argument for --plugins-dir option.");
+        }
+
+        final File candidate = new File( args[valueIndex] );
+        if (candidate.isDirectory()) {
+            return candidate;
+        }
+        throw new IllegalArgumentException("Expected a directory for --plugins-dir option value.");
+    }
+
+}
http://git-wip-us.apache.org/repos/asf/any23/blob/242b130b/cli/src/main/java/org/apache/any23/cli/VocabPrinter.java
----------------------------------------------------------------------
diff --git a/cli/src/main/java/org/apache/any23/cli/VocabPrinter.java b/cli/src/main/java/org/apache/any23/cli/VocabPrinter.java
new file mode 100644
index 0000000..7fde887
--- /dev/null
+++ b/cli/src/main/java/org/apache/any23/cli/VocabPrinter.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.cli;
+
+import org.apache.any23.vocab.RDFSchemaUtils;
+import org.eclipse.rdf4j.rio.RDFFormat;
+import org.eclipse.rdf4j.rio.RDFWriterRegistry;
+import org.eclipse.rdf4j.rio.Rio;
+
+import com.beust.jcommander.IStringConverter;
+import com.beust.jcommander.Parameter;
+import com.beust.jcommander.Parameters;
+
+/**
+ * Prints out the vocabulary <i>RDFSchema</i> as <i>NQuads</i>.
+ *
+ * @author Michele Mostarda (mostarda@fbk.eu)
+ */
+@Parameters(commandNames = { "vocab" }, commandDescription = "Prints out the RDF Schema of the vocabularies used by Any23.")
+public class VocabPrinter implements Tool {
+
+    @Parameter(names = { "-f", "--format" }, description = "Vocabulary output format", converter = RDFFormatConverter.class)
+    private RDFFormat format = RDFFormat.NQUADS;
+
+    /**
+     * Serializes the vocabularies on standard output using the selected
+     * format (NQuads unless overridden on the command line).
+     */
+    @Override
+    public void run() throws Exception {
+        RDFSchemaUtils.serializeVocabularies(format, System.out);
+    }
+
+    /**
+     * Converts the value of the format option into an {@link RDFFormat}. The
+     * value is looked up as a MIME type in the RDF writer registry.
+     */
+    public static final class RDFFormatConverter implements IStringConverter<RDFFormat> {
+
+        @Override
+        public RDFFormat convert(String value) {
+            final RDFWriterRegistry writers = RDFWriterRegistry.getInstance();
+            return writers.getFileFormatForMIMEType(value).orElseThrow(Rio.unsupportedFormat(value));
+        }
+
+    }
+
+}
http://git-wip-us.apache.org/repos/asf/any23/blob/242b130b/cli/src/main/java/org/apache/any23/cli/package-info.java
----------------------------------------------------------------------
diff --git a/cli/src/main/java/org/apache/any23/cli/package-info.java b/cli/src/main/java/org/apache/any23/cli/package-info.java
new file mode 100644
index 0000000..40ae928
--- /dev/null
+++ b/cli/src/main/java/org/apache/any23/cli/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * This package contains some command-line utilities which allow users
+ * to use the main <i>Any23</i> features via <i>commandline</i> shell.
+ */
+package org.apache.any23.cli;
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/any23/blob/242b130b/cli/src/test/java/org/apache/any23/cli/ExtractorDocumentationTest.java
----------------------------------------------------------------------
diff --git a/cli/src/test/java/org/apache/any23/cli/ExtractorDocumentationTest.java b/cli/src/test/java/org/apache/any23/cli/ExtractorDocumentationTest.java
new file mode 100644
index 0000000..98616ba
--- /dev/null
+++ b/cli/src/test/java/org/apache/any23/cli/ExtractorDocumentationTest.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.cli;
+
+import org.junit.Test;
+
+/**
+ * Test case for {@link ExtractorDocumentation} CLI.
+ *
+ * Every test hands one command line to <code>runToolCheckExit0</code>, which is
+ * inherited from {@link ToolTestBase} (not shown here; presumably it runs the
+ * tool and expects exit code 0 - TODO confirm). The command lines covered are
+ * <code>--list</code>, <code>--all</code>, and <code>-i</code> / <code>-o</code>
+ * followed by the <code>html-microdata</code> extractor name.
+ *
+ * @author Michele Mostarda (mostarda@fbk.eu)
+ */
+public class ExtractorDocumentationTest extends ToolTestBase {
+
+ private static final String TARGET_EXTRACTOR = "html-microdata"; // extractor name passed to -i and -o below
+
+ public ExtractorDocumentationTest() {
+ super(ExtractorDocumentation.class);
+ }
+
+ @Test
+ public void testList() throws Exception {
+ runToolCheckExit0("--list");
+ }
+
+ @Test
+ public void testAll() throws Exception {
+ runToolCheckExit0("--all");
+ }
+
+ //@Ignore("no available example")
+ @Test
+ public void testExampleInput() throws Exception {
+ runToolCheckExit0("-i", TARGET_EXTRACTOR);
+ }
+
+ //@Ignore("no available example")
+ @Test
+ public void testExampleOutput() throws Exception {
+ runToolCheckExit0("-o", TARGET_EXTRACTOR);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/any23/blob/242b130b/cli/src/test/java/org/apache/any23/cli/MicrodataParserTest.java
----------------------------------------------------------------------
diff --git a/cli/src/test/java/org/apache/any23/cli/MicrodataParserTest.java b/cli/src/test/java/org/apache/any23/cli/MicrodataParserTest.java
new file mode 100644
index 0000000..a80e729
--- /dev/null
+++ b/cli/src/test/java/org/apache/any23/cli/MicrodataParserTest.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.cli;
+
+import org.junit.Ignore;
+import org.junit.Test;
+
+/**
+ * Test case for {@link MicrodataParser} CLI.
+ *
+ * The file based test copies a classpath resource to a temporary file and
+ * passes it to the tool as a <code>file:</code> argument; the HTTP based test
+ * is ignored (see the ANY23-140 note on it). Both rely on helpers inherited
+ * from {@link ToolTestBase}, which is not shown here.
+ *
+ * @author Michele Mostarda (mostarda@fbk.eu)
+ */
+public class MicrodataParserTest extends ToolTestBase {
+
+ public MicrodataParserTest() {
+ super(MicrodataParser.class);
+ }
+
+ @Test
+ public void testRunOnFile() throws Exception {
+ runToolCheckExit0("file:"+copyResourceToTempFile("/microdata/microdata-nested.html").getAbsolutePath());
+ }
+
+ @Ignore("ANY23-140 - Revise Any23 tests to remove fetching of web content")
+ @Test
+ public void testRunOnHTTPResource() throws Exception {
+ runToolCheckExit0("http://www.imdb.com/title/tt1375666/");
+ }
+
+
+}
http://git-wip-us.apache.org/repos/asf/any23/blob/242b130b/cli/src/test/java/org/apache/any23/cli/MimeDetectorTest.java
----------------------------------------------------------------------
diff --git a/cli/src/test/java/org/apache/any23/cli/MimeDetectorTest.java b/cli/src/test/java/org/apache/any23/cli/MimeDetectorTest.java
new file mode 100644
index 0000000..3894d32
--- /dev/null
+++ b/cli/src/test/java/org/apache/any23/cli/MimeDetectorTest.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.cli;
+
+import org.junit.Test;
+
+/**
+ * Test case for {@link MimeDetector} CLI.
+ *
+ * Covers the three kinds of argument used below: an <code>http:</code> URL, a
+ * <code>file://</code> reference to a temporary copy of a classpath resource
+ * and an <code>inline://</code> document. Every test first calls
+ * <code>assumeOnlineAllowed()</code> (inherited, not shown here).
+ *
+ * @author Michele Mostarda (mostarda@fbk.eu)
+ */
+public class MimeDetectorTest extends ToolTestBase {
+
+ public MimeDetectorTest() {
+ super(MimeDetector.class);
+ }
+
+ @Test
+ public void testDetectURL() throws Exception {
+ assumeOnlineAllowed();
+ runToolCheckExit0("http://twitter.com#micmos");
+ }
+
+ @Test
+ public void testDetectFile() throws Exception {
+ assumeOnlineAllowed(); // NOTE(review): the input is a local file - confirm the online assumption is really needed
+ runToolCheckExit0("file://"+copyResourceToTempFile("/application/trix/test1.trx").getAbsolutePath());
+ }
+
+ @Test
+ public void testDetectInline() throws Exception {
+ assumeOnlineAllowed(); // NOTE(review): the input is an inline document - confirm the online assumption is really needed
+ runToolCheckExit0( new String[] {"inline://<http://s> <http://p> <http://o> ."} );
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/any23/blob/242b130b/cli/src/test/java/org/apache/any23/cli/PluginVerifierTest.java
----------------------------------------------------------------------
diff --git a/cli/src/test/java/org/apache/any23/cli/PluginVerifierTest.java b/cli/src/test/java/org/apache/any23/cli/PluginVerifierTest.java
new file mode 100644
index 0000000..bdee9ae
--- /dev/null
+++ b/cli/src/test/java/org/apache/any23/cli/PluginVerifierTest.java
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.cli;
+
+import org.junit.Test;
+
+/**
+ * Test case for {@link PluginVerifier} CLI.
+ *
+ * The only test passes <code>.</code> as the plugins directory argument, i.e.
+ * whatever the working directory of the test run is.
+ *
+ * @author Michele Mostarda (mostarda@fbk.eu)
+ */
+public class PluginVerifierTest extends ToolTestBase {
+
+ public PluginVerifierTest() {
+ super(PluginVerifier.class);
+ }
+
+ @Test
+ public void testRun() throws Exception {
+ runToolCheckExit0("."); // "." resolves against the working directory of the test run
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/any23/blob/242b130b/cli/src/test/java/org/apache/any23/cli/RoverTest.java
----------------------------------------------------------------------
diff --git a/cli/src/test/java/org/apache/any23/cli/RoverTest.java b/cli/src/test/java/org/apache/any23/cli/RoverTest.java
new file mode 100644
index 0000000..893220a
--- /dev/null
+++ b/cli/src/test/java/org/apache/any23/cli/RoverTest.java
@@ -0,0 +1,139 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.cli;
+
+import org.apache.any23.rdf.RDFUtils;
+import org.apache.any23.util.FileUtils;
+import org.apache.any23.util.StringUtils;
+import org.apache.any23.util.URLUtils;
+import org.junit.Assert;
+import org.junit.Assume;
+import org.junit.Ignore;
+import org.junit.Test;
+import org.eclipse.rdf4j.model.Statement;
+import org.eclipse.rdf4j.rio.RDFFormat;
+
+import java.io.File;
+import java.util.Arrays;
+
+/**
+ * Test case for {@link Rover}.
+ *
+ * @author Michele Mostarda (mostarda@fbk.eu)
+ */
+@Ignore("Twitter microdata not parsing correctly right now")
+public class RoverTest extends ToolTestBase {
+
+    private static final String[] TARGET_FILES = {
+        "/microdata/microdata-nested.html",
+        "/org/apache/any23/extractor/csv/test-semicolon.csv"
+    };
+
+    private static final String[] TARGET_URLS = {
+        "http://twitter.com/micmos",
+        "http://twitter.com/dpalmisano"
+    };
+
+    public RoverTest() {
+        super(Rover.class);
+    }
+
+    /** Runs the tool over temporary copies of the {@link #TARGET_FILES} resources. */
+    @Test
+    public void testRunMultiFiles() throws Exception {
+        final String[] copiedTargets = new String[TARGET_FILES.length];
+        for (int i = 0; i < TARGET_FILES.length; i++) {
+            copiedTargets[i] = copyResourceToTempFile(TARGET_FILES[i]).getAbsolutePath();
+        }
+        runWithMultiSourcesAndVerify(copiedTargets, 0);
+    }
+
+    /** Runs the tool with the <code>-d</code> option and counts the output lines naming that value. */
+    @Test
+    public void testRunWithDefaultNS() throws Exception {
+        final String DEFAULT_GRAPH = "http://test/default/ns";
+        final File outFile = File.createTempFile("rover-test", "out", tempDirectory);
+        // One argument per element: a joined string split on spaces would break paths with spaces.
+        final int exitCode = runTool(
+                "-o", outFile.getAbsolutePath(),
+                "-f", "nquads",
+                "-p", "-n",
+                copyResourceToTempFile("/cli/rover-test1.nq").getAbsolutePath(),
+                "-d", DEFAULT_GRAPH
+        );
+        Assert.assertEquals("Unexpected exit code.", 0, exitCode);
+        Assert.assertTrue(outFile.exists());
+        final String fileContent = FileUtils.readFileContent(outFile);
+        int graphCounter = 0;
+        for (String line : fileContent.split("\\n")) {
+            if (line.contains(DEFAULT_GRAPH)) {
+                graphCounter++;
+            }
+        }
+        // NOTE(review): the test expects the value never to appear in the output — confirm intent.
+        Assert.assertEquals(0, graphCounter);
+    }
+
+    /* BEGIN: online tests. */
+
+    /** Runs the tool over {@link #TARGET_URLS}, provided that each of them is reported online. */
+    @Test
+    public void testRunMultiURLs() throws Exception {
+        // Assuming first accessibility to remote resources.
+        assumeOnlineAllowed();
+        for (String targetURL : TARGET_URLS) {
+            Assume.assumeTrue( URLUtils.isOnline(targetURL) );
+        }
+        runWithMultiSourcesAndVerify(TARGET_URLS, 0);
+    }
+
+    /**
+     * Runs the tool on the given sources writing N-Quads output and a log file, then checks the
+     * exit code, the number of log lines and that more than nine statements were produced.
+     *
+     * @param targets      sources to process.
+     * @param expectedExit expected tool exit code.
+     * @throws Exception if creating the temporary files, running the tool or parsing the output fails.
+     */
+    private void runWithMultiSourcesAndVerify(String[] targets, int expectedExit) throws Exception {
+        final File outFile = File.createTempFile("rover-test", "out", tempDirectory);
+        final File logFile = File.createTempFile("rover-test", "log", tempDirectory);
+
+        // Build the argument array directly: joining and re-splitting on spaces breaks spaced paths.
+        final String[] options = {
+            "-o", outFile.getAbsolutePath(), "-f", "nquads", "-l", logFile.getAbsolutePath(), "-p", "-n"
+        };
+        final String[] args = Arrays.copyOf(options, options.length + targets.length);
+        System.arraycopy(targets, 0, args, options.length, targets.length);
+        Assert.assertEquals("Unexpected exit code.", expectedExit, runTool(args));
+
+        Assert.assertTrue(outFile.exists());
+        Assert.assertTrue(logFile.exists());
+        Assert.assertEquals(
+            "Unexpected number of log lines.",
+            targets.length + 1, // Header line.
+            StringUtils.countNL(FileUtils.readFileContent(logFile))
+        );
+
+        final Statement[] statements = RDFUtils.parseRDF(RDFFormat.NQUADS, FileUtils.readFileContent(outFile));
+        Assert.assertTrue(
+            "Unexpected number of statements: " + Arrays.toString(statements),
+            statements.length > 9
+        );
+    }
+}
http://git-wip-us.apache.org/repos/asf/any23/blob/242b130b/cli/src/test/java/org/apache/any23/cli/ToolRunnerTest.java
----------------------------------------------------------------------
diff --git a/cli/src/test/java/org/apache/any23/cli/ToolRunnerTest.java b/cli/src/test/java/org/apache/any23/cli/ToolRunnerTest.java
new file mode 100644
index 0000000..881a782
--- /dev/null
+++ b/cli/src/test/java/org/apache/any23/cli/ToolRunnerTest.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.cli;
+
+import junit.framework.Assert;
+import org.junit.Test;
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Set;
+
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Test case for {@link ToolRunner}.
+ *
+ * @author Michele Mostarda (mostarda@fbk.eu)
+ */
+public class ToolRunnerTest {
+    private final Set<Class<? extends Tool>> coreTools = new HashSet<Class<? extends Tool>>(){{
+        add(ExtractorDocumentation.class);
+        add(MicrodataParser.class);
+        add(MimeDetector.class);
+        add(PluginVerifier.class);
+        add(Rover.class);
+        add(VocabPrinter.class);
+    }};
+
+    /** The detected tools must be exactly the core tools: none missing and none unexpected. */
+    @Test
+    public void testGetToolsInClasspath() throws IOException {
+        final Set<Class<? extends Tool>> detected = new HashSet<Class<? extends Tool>>();
+        for (Iterator<Tool> tools = new ToolRunner().getToolsInClasspath(); tools.hasNext(); ) {
+            detected.add(tools.next().getClass());
+        }
+        // Fully qualified: the file-level import binds Assert to the deprecated junit.framework.Assert.
+        org.junit.Assert.assertEquals("Detected tools differ from the core tools.", coreTools, detected);
+    }
+
+    @Test
+    public void testGetVersion() throws Exception {
+        org.junit.Assert.assertEquals(0, new ToolRunner().execute("-v") );
+    }
+
+    @Test
+    public void testGetHelp() throws Exception {
+        org.junit.Assert.assertEquals(0, new ToolRunner().execute("-h") );
+    }
+}
http://git-wip-us.apache.org/repos/asf/any23/blob/242b130b/cli/src/test/java/org/apache/any23/cli/ToolTestBase.java
----------------------------------------------------------------------
diff --git a/cli/src/test/java/org/apache/any23/cli/ToolTestBase.java b/cli/src/test/java/org/apache/any23/cli/ToolTestBase.java
new file mode 100644
index 0000000..fef49cd
--- /dev/null
+++ b/cli/src/test/java/org/apache/any23/cli/ToolTestBase.java
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.cli;
+
+import com.beust.jcommander.Parameters;
+import org.apache.any23.Any23OnlineTestBase;
+
+import java.util.Arrays;
+
+import static java.lang.String.format;
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Base class for <i>CLI</i> related tests.
+ *
+ * @author Michele Mostarda (mostarda@fbk.eu)
+ */
+// TODO: improve support for Tool testing, intercept i/o streams.
+public abstract class ToolTestBase extends Any23OnlineTestBase {
+
+    public static final String TOOL_RUN_METHOD = "run";
+
+    private final Class<? extends Tool> toolClazz;
+
+    protected ToolTestBase(Class<? extends Tool> tool) {
+        if (tool == null) throw new NullPointerException("tool class must not be null");
+        toolClazz = tool;
+    }
+
+    /**
+     * Runs the underlying tool through a new {@link ToolRunner}, prepending the tool command name.
+     *
+     * @param args tool arguments, one per element.
+     * @return the tool exit code.
+     * @throws Exception if the tool class declares no command name, or thrown by the runner.
+     */
+    protected int runTool(String... args) throws Exception {
+        final Parameters parameters = toolClazz.getAnnotation( Parameters.class );
+        if (parameters == null || parameters.commandNames().length == 0) {
+            throw new IllegalStateException("No command name declared by " + toolClazz.getName());
+        }
+
+        final String[] enhancedArgs = new String[args.length + 1];
+        enhancedArgs[0] = parameters.commandNames()[0];
+        System.arraycopy( args, 0, enhancedArgs, 1, args.length );
+        return new ToolRunner().execute( enhancedArgs );
+    }
+
+    /**
+     * Runs the underlying tool with arguments given as one whitespace separated string.
+     *
+     * @param args tool arguments; leading, trailing and repeated whitespace is ignored. Arguments
+     *             containing whitespace must be passed to {@link #runTool(String...)} instead.
+     * @return the tool exit code.
+     * @throws Exception see {@link #runTool(String...)}.
+     */
+    protected int runTool(String args) throws Exception {
+        final String trimmed = args.trim();
+        return trimmed.isEmpty() ? runTool(new String[0]) : runTool(trimmed.split("\\s+"));
+    }
+
+    /**
+     * Runs the underlying tool and verifies that the exit code is <code>0</code>.
+     *
+     * @param args tool arguments.
+     * @throws Exception see {@link #runTool(String...)}.
+     */
+    protected void runToolCheckExit0(String... args) throws Exception {
+        final String message = format(
+                "Unexpected exit code for tool [%s] invoked with %s",
+                toolClazz.getSimpleName(),
+                Arrays.asList(args)
+        );
+        assertEquals(message, 0, runTool(args));
+    }
+}
http://git-wip-us.apache.org/repos/asf/any23/blob/242b130b/cli/src/test/java/org/apache/any23/cli/VocabPrinterTest.java
----------------------------------------------------------------------
diff --git a/cli/src/test/java/org/apache/any23/cli/VocabPrinterTest.java b/cli/src/test/java/org/apache/any23/cli/VocabPrinterTest.java
new file mode 100644
index 0000000..1c841dc
--- /dev/null
+++ b/cli/src/test/java/org/apache/any23/cli/VocabPrinterTest.java
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.cli;
+
+import org.junit.Test;
+
+/**
+ * Smoke test for the {@link VocabPrinter} command line tool.
+ *
+ * @author Michele Mostarda (mostarda@fbk.eu)
+ */
+public class VocabPrinterTest extends ToolTestBase {
+    public VocabPrinterTest() {
+        super(VocabPrinter.class);
+    }
+
+    /** Runs the tool without any argument and expects exit code 0. */
+    @Test
+    public void testRun() throws Exception {
+        final String[] noArguments = new String[0];
+        runToolCheckExit0(noArguments);
+    }
+}
http://git-wip-us.apache.org/repos/asf/any23/blob/242b130b/core/src/main/java/org/apache/any23/cli/ExtractorDocumentation.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/cli/ExtractorDocumentation.java b/core/src/main/java/org/apache/any23/cli/ExtractorDocumentation.java
deleted file mode 100644
index 9a0410b..0000000
--- a/core/src/main/java/org/apache/any23/cli/ExtractorDocumentation.java
+++ /dev/null
@@ -1,186 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.any23.cli;
-
-import com.beust.jcommander.Parameter;
-import com.beust.jcommander.Parameters;
-import org.apache.any23.extractor.ExampleInputOutput;
-import org.apache.any23.extractor.ExtractionException;
-import org.apache.any23.extractor.Extractor;
-import org.apache.any23.extractor.ExtractorRegistryImpl;
-import org.apache.any23.extractor.Extractor.BlindExtractor;
-import org.apache.any23.extractor.Extractor.ContentExtractor;
-import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
-import org.apache.any23.extractor.ExtractorFactory;
-import org.apache.any23.extractor.ExtractorRegistry;
-
-import java.io.IOException;
-import java.util.LinkedList;
-import java.util.List;
-
-/**
- * This class provides some command-line documentation
- * about available extractors and their usage.
- */
-@Parameters( commandNames = { "extractor" }, commandDescription= "Utility for obtaining documentation about metadata extractors.")
-public class ExtractorDocumentation implements Tool {
-
- @Parameter( names = { "-l", "--list" }, description = "shows the names of all available extractors" )
- private boolean showList;
-
- @Parameter( names = { "-i", "--input" }, description = "shows example input for the given extractor" )
- private boolean showInput;
-
- @Parameter( names = { "-o", "--outut" }, description = "shows example output for the given extractor" )
- private boolean showOutput;
-
- @Parameter( names = { "-a", "--all" }, description = "shows a report about all available extractors" )
- private boolean showAll;
-
- @Parameter( arity = 1, description = "Extractor name" )
- private List<String> extractor = new LinkedList<String>();
-
- public void run() throws Exception {
- if (showList) {
- printExtractorList(ExtractorRegistryImpl.getInstance());
- } else if (showInput) {
- if (extractor.isEmpty()) {
- throw new IllegalArgumentException("Required argument for -i: extractor name");
- }
-
- printExampleInput(extractor.get(0), ExtractorRegistryImpl.getInstance());
- } else if (showOutput) {
- if (extractor.isEmpty()) {
- throw new IllegalArgumentException("Required argument for -o: extractor name");
- }
-
- printExampleOutput(extractor.get(0), ExtractorRegistryImpl.getInstance());
- } else if (showAll) {
- printReport(ExtractorRegistryImpl.getInstance());
- }
- }
-
- /**
- * Print an error message.
- *
- * @param msg the error message to be printed
- */
- public void printError(String msg) {
- System.err.println(msg);
- }
-
- /**
- * Prints the list of all the available extractors.
- * @param registry the {@link org.apache.any23.extractor.ExtractorRegistry}
- * containing all extractors
- */
- public void printExtractorList(ExtractorRegistry registry) {
- for (ExtractorFactory factory : registry.getExtractorGroup()) {
- System.out.println( String.format("%25s [%15s]", factory.getExtractorName(), factory.getExtractorLabel()));
- }
- }
-
- /**
- * Prints an example of input for the provided extractor.
- *
- * @param extractorName the name of the extractor
- * @param registry the {@link org.apache.any23.extractor.ExtractorRegistry}
- * containing all extractors
- * @throws IOException raised if no extractor is found with that name
- */
- public void printExampleInput(String extractorName, ExtractorRegistry registry) throws IOException {
- ExtractorFactory<?> factory = getFactory(registry, extractorName);
- ExampleInputOutput example = new ExampleInputOutput(factory);
- String input = example.getExampleInput();
- if (input == null) {
- throw new IllegalArgumentException("Extractor " + extractorName + " provides no example input");
- }
- System.out.println(input);
- }
-
- /**
- * Prints an output example for the given extractor.
- *
- * @param extractorName the extractor name
- * @param registry the {@link org.apache.any23.extractor.ExtractorRegistry}
- * containing all extractors
- * @throws IOException raised if no extractor is found with that name
- * @throws ExtractionException if there is an error duing extraction
- */
- public void printExampleOutput(String extractorName, ExtractorRegistry registry) throws IOException, ExtractionException {
- ExtractorFactory<?> factory = getFactory(registry, extractorName);
- ExampleInputOutput example = new ExampleInputOutput(factory);
- String output = example.getExampleOutput();
- if (output == null) {
- throw new IllegalArgumentException("Extractor " + extractorName + " provides no example output");
- }
- System.out.println(output);
- }
-
- /**
- * Prints a complete report on all the available extractors.
- *
- * @param registry the {@link org.apache.any23.extractor.ExtractorRegistry}
- * containing all extractors
- * @throws IOException raised if no extractor is found with that name
- * @throws ExtractionException if there is an error duing extraction
- */
- public void printReport(ExtractorRegistry registry) throws IOException, ExtractionException {
- for (String extractorName : registry.getAllNames()) {
- ExtractorFactory<?> factory = registry.getFactory(extractorName);
- ExampleInputOutput example = new ExampleInputOutput(factory);
- System.out.println("Extractor: " + extractorName);
- System.out.println("\ttype: " + getType(factory));
- System.out.println();
- final String exampleInput = example.getExampleInput();
- if(exampleInput == null) {
- System.out.println("(No Example Available)");
- } else {
- System.out.println("-------- Example Input --------");
- System.out.println(exampleInput);
- System.out.println("-------- Example Output --------");
- String output = example.getExampleOutput();
- System.out.println(output == null || output.trim().length() == 0 ? "(No Output Generated)" : output);
- }
- System.out.println("================================");
- System.out.println();
- }
- }
-
- private ExtractorFactory<?> getFactory(ExtractorRegistry registry, String name) {
- if (!registry.isRegisteredName(name)) {
- throw new IllegalArgumentException("Unknown extractor name: " + name);
- }
- return registry.getFactory(name);
- }
-
- private String getType(ExtractorFactory<?> factory) {
- Extractor<?> extractor = factory.createExtractor();
- if (extractor instanceof BlindExtractor) {
- return BlindExtractor.class.getSimpleName();
- }
- if (extractor instanceof TagSoupDOMExtractor) {
- return TagSoupDOMExtractor.class.getSimpleName();
- }
- if (extractor instanceof ContentExtractor) {
- return ContentExtractor.class.getSimpleName();
- }
- return "?";
- }
-
-}