You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@any23.apache.org by le...@apache.org on 2018/02/28 04:43:11 UTC
[01/10] any23 git commit: ANY23-321 Add openie toggle functionality to service
Repository: any23
Updated Branches:
refs/heads/master 66ce1241a -> 394d36a0c
ANY23-321 Add openie toggle functionality to service
Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/706e891c
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/706e891c
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/706e891c
Branch: refs/heads/master
Commit: 706e891cf582736f90cfbe83bc1ef5d629e6dfd7
Parents: 0613280
Author: Lewis John McGibbney <le...@gmail.com>
Authored: Wed Jan 3 00:05:39 2018 +0000
Committer: Lewis John McGibbney <le...@gmail.com>
Committed: Wed Jan 3 00:05:39 2018 +0000
----------------------------------------------------------------------
.../apache/any23/extractor/ExtractorGroup.java | 1 +
.../apache/any23/plugin/Any23PluginManager.java | 23 +--
core/src/main/java/org/apache/any23/Any23.java | 8 +-
.../any23/extractor/ExtractorRegistryImpl.java | 11 +-
openie/pom.xml | 152 -----------------
.../any23/extractor/openie/OpenIEExtractor.java | 130 ---------------
.../openie/OpenIEExtractorFactory.java | 52 ------
.../org.apache.any23.extractor.ExtractorFactory | 1 -
.../any23/openie/OpenIEExtractorTest.java | 88 ----------
.../htmlscraper/HTMLScraperExtractor.java | 12 +-
plugins/integration-test/pom.xml | 5 +
.../java/org/apache/any23/plugin/PluginIT.java | 11 +-
plugins/openie/pom.xml | 165 +++++++++++++++++++
.../extractor/openie/OpenIEExtractor.java | 137 +++++++++++++++
.../openie/OpenIEExtractorFactory.java | 52 ++++++
.../org.apache.any23.extractor.ExtractorFactory | 1 +
.../any23/openie/OpenIEExtractorTest.java | 88 ++++++++++
pom.xml | 6 +-
service/README.md | 49 ++++++
service/README.txt | 50 ------
service/pom.xml | 84 +++++++++-
.../java/org/apache/any23/servlet/Servlet.java | 48 +++++-
service/src/main/resources/form.html | 59 ++++++-
.../main/webapp/resources/js/bootstrap-modal.js | 22 ++-
src/site/apt/any23-plugins.apt | 9 +-
25 files changed, 734 insertions(+), 530 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/api/src/main/java/org/apache/any23/extractor/ExtractorGroup.java
----------------------------------------------------------------------
diff --git a/api/src/main/java/org/apache/any23/extractor/ExtractorGroup.java b/api/src/main/java/org/apache/any23/extractor/ExtractorGroup.java
index 9242ea6..4e77690 100644
--- a/api/src/main/java/org/apache/any23/extractor/ExtractorGroup.java
+++ b/api/src/main/java/org/apache/any23/extractor/ExtractorGroup.java
@@ -61,6 +61,7 @@ public class ExtractorGroup implements Iterable<ExtractorFactory<?>> {
return new ExtractorGroup(matching);
}
+ @Override
public Iterator<ExtractorFactory<?>> iterator() {
return factories.iterator();
}
http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/api/src/main/java/org/apache/any23/plugin/Any23PluginManager.java
----------------------------------------------------------------------
diff --git a/api/src/main/java/org/apache/any23/plugin/Any23PluginManager.java b/api/src/main/java/org/apache/any23/plugin/Any23PluginManager.java
index 5898210..3cd0829 100644
--- a/api/src/main/java/org/apache/any23/plugin/Any23PluginManager.java
+++ b/api/src/main/java/org/apache/any23/plugin/Any23PluginManager.java
@@ -116,7 +116,7 @@ public class Any23PluginManager {
* @return list of exceptions raised during the loading.
*/
public synchronized Throwable[] loadJARs(File... jars) {
- final List<Throwable> result = new ArrayList<Throwable>();
+ final List<Throwable> result = new ArrayList<>();
for (File jar : jars) {
try {
loadJAR(jar);
@@ -158,7 +158,7 @@ public class Any23PluginManager {
* @return list of exceptions raised during the loading.
*/
public synchronized Throwable[] loadClassDirs(File... classDirs) {
- final List<Throwable> result = new ArrayList<Throwable>();
+ final List<Throwable> result = new ArrayList<>();
for (File classDir : classDirs) {
try {
loadClassDir(classDir);
@@ -178,14 +178,15 @@ public class Any23PluginManager {
* Loads all the JARs detected in a given directory.
*
* @param jarDir directory containing the JARs to be loaded.
+ * Example '/usr/local/apache-tomcat-7.0.72/webapps/apache-any23-service-2.2-SNAPSHOT/WEB-INF/lib/apache-any23-openie'
* @return <code>true</code> if all JARs in dir are loaded.
*/
public synchronized boolean loadJARDir(File jarDir) {
if(jarDir == null)
throw new NullPointerException("JAR dir must be not null.");
- if( ! jarDir.exists() )
+ if(!jarDir.exists() )
throw new IllegalArgumentException("Given directory doesn't exist:" + jarDir.getAbsolutePath());
- if(! jarDir.isDirectory() )
+ if(!jarDir.isDirectory() )
throw new IllegalArgumentException(
"given file exists and it is not a directory: " + jarDir.getAbsolutePath()
);
@@ -210,7 +211,7 @@ public class Any23PluginManager {
* @return list of errors occurred during loading.
*/
public synchronized Throwable[] loadFiles(File... files) {
- final List<Throwable> errors = new ArrayList<Throwable>();
+ final List<Throwable> errors = new ArrayList<>();
for(File file : files) {
try {
if (file.isFile() && file.getName().endsWith(".jar")) {
@@ -263,6 +264,7 @@ public class Any23PluginManager {
* @return not <code>null</code> list of plugin classes.
* @throws IOException if there is an error obtaining Extractors.
*/
+ @SuppressWarnings("rawtypes")
public synchronized Iterator<ExtractorFactory> getExtractors() throws IOException {
return getPlugins(ExtractorFactory.class);
}
@@ -312,7 +314,8 @@ public class Any23PluginManager {
final StringBuilder report = new StringBuilder();
try {
- final List<ExtractorFactory<?>> newFactoryList = new ArrayList<ExtractorFactory<?>>();
+ final List<ExtractorFactory<?>> newFactoryList = new ArrayList<>();
+ @SuppressWarnings("rawtypes")
Iterator<ExtractorFactory> extractors = getExtractors();
while (extractors.hasNext()) {
ExtractorFactory<?> factory = extractors.next();
@@ -386,7 +389,7 @@ public class Any23PluginManager {
*/
private File[] getPluginLocations(String pluginDirsList) {
final String[] locationsStr = pluginDirsList.split(PLUGIN_DIRS_LIST_SEPARATOR);
- final List<File> locations = new ArrayList<File>();
+ final List<File> locations = new ArrayList<>();
for(String locationStr : locationsStr) {
final File location = new File(locationStr);
if( ! location.exists()) {
@@ -404,7 +407,7 @@ public class Any23PluginManager {
*/
private static final class DynamicClassLoader extends URLClassLoader {
- private final Set<String> addedURLs = new HashSet<String>();
+ private final Set<String> addedURLs = new HashSet<>();
private final List<File> jars;
@@ -412,8 +415,8 @@ public class Any23PluginManager {
public DynamicClassLoader(URL[] urls) {
super(urls, Any23PluginManager.class.getClassLoader());
- jars = new ArrayList<File>();
- dirs = new ArrayList<File>();
+ jars = new ArrayList<>();
+ dirs = new ArrayList<>();
}
public DynamicClassLoader() {
http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/core/src/main/java/org/apache/any23/Any23.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/Any23.java b/core/src/main/java/org/apache/any23/Any23.java
index 9be8a28..cba13d8 100644
--- a/core/src/main/java/org/apache/any23/Any23.java
+++ b/core/src/main/java/org/apache/any23/Any23.java
@@ -98,7 +98,8 @@ public class Any23 {
* @param extractorGroup the group of extractors to be applied.
*/
public Any23(Configuration configuration, ExtractorGroup extractorGroup) {
- if(configuration == null) throw new NullPointerException("configuration must be not null.");
+ if(configuration == null)
+ throw new NullPointerException("configuration must be not null.");
this.configuration = configuration;
logger.debug( configuration.getConfigurationDump() );
@@ -259,7 +260,8 @@ public class Any23 {
* @throws IOException if an error occurs while initializing the internal {@link org.apache.any23.http.HTTPClient}.
*/
public DocumentSource createDocumentSource(String documentIRI) throws URISyntaxException, IOException {
- if(documentIRI == null) throw new NullPointerException("documentIRI cannot be null.");
+ if(documentIRI == null)
+ throw new NullPointerException("documentIRI cannot be null.");
if (documentIRI.toLowerCase().startsWith("file:")) {
return new FileDocumentSource( new File(new URI(documentIRI)) );
}
@@ -453,7 +455,7 @@ public class Any23 {
}
private String getAcceptHeader() {
- Collection<MIMEType> mimeTypes = new ArrayList<MIMEType>();
+ Collection<MIMEType> mimeTypes = new ArrayList<>();
for (ExtractorFactory<?> factory : factories) {
mimeTypes.addAll(factory.getSupportedMIMETypes());
}
http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/core/src/main/java/org/apache/any23/extractor/ExtractorRegistryImpl.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/ExtractorRegistryImpl.java b/core/src/main/java/org/apache/any23/extractor/ExtractorRegistryImpl.java
index 86dc982..ca3bb98 100644
--- a/core/src/main/java/org/apache/any23/extractor/ExtractorRegistryImpl.java
+++ b/core/src/main/java/org/apache/any23/extractor/ExtractorRegistryImpl.java
@@ -30,9 +30,15 @@ import java.util.List;
* Singleton class acting as a register for all the various
* {@link Extractor}.
*/
+@SuppressWarnings("rawtypes")
public class ExtractorRegistryImpl extends org.eclipse.rdf4j.common.lang.service.ServiceRegistry<String, ExtractorFactory> implements ExtractorRegistry {
/**
+ * The instance.
+ */
+ private static ExtractorRegistry instance = null;
+
+ /**
* Public constructor for ExtractorRegistryImpl. Should normally call getInstance.
*/
public ExtractorRegistryImpl() {
@@ -40,11 +46,6 @@ public class ExtractorRegistryImpl extends org.eclipse.rdf4j.common.lang.service
}
/**
- * The instance.
- */
- private static ExtractorRegistry instance = null;
-
- /**
* @return returns the {@link ExtractorRegistry} instance.
*/
public static ExtractorRegistry getInstance() {
http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/openie/pom.xml
----------------------------------------------------------------------
diff --git a/openie/pom.xml b/openie/pom.xml
deleted file mode 100644
index 7440812..0000000
--- a/openie/pom.xml
+++ /dev/null
@@ -1,152 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <artifactId>apache-any23</artifactId>
- <groupId>org.apache.any23</groupId>
- <version>2.2-SNAPSHOT</version>
- <relativePath>../</relativePath>
- </parent>
-
- <repositories>
- <repository>
- <snapshots>
- <enabled>false</enabled>
- </snapshots>
- <id>bintray-allenai-maven</id>
- <name>bintray</name>
- <url>http://allenai.bintray.com/maven</url>
- </repository>
- </repositories>
- <pluginRepositories>
- <pluginRepository>
- <snapshots>
- <enabled>false</enabled>
- </snapshots>
- <id>bintray-allenai-maven</id>
- <name>bintray-plugins</name>
- <url>http://allenai.bintray.com/maven</url>
- </pluginRepository>
- </pluginRepositories>
-
- <artifactId>apache-any23-openie</artifactId>
-
- <name>Apache Any23 :: OpenIE</name>
- <description>Open Information Extraction module.</description>
-
- <dependencies>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>apache-any23-core</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>apache-any23-test-resources</artifactId>
- <version>${project.version}</version>
- <scope>test</scope>
- <type>test-jar</type>
- </dependency>
- <dependency>
- <groupId>org.allenai.openie</groupId>
- <artifactId>openie_2.11</artifactId>
- <version>4.2.6</version>
- <scope>compile</scope>
- </dependency>
- <dependency>
- <groupId>org.allenai.openie</groupId>
- <artifactId>openie_2.11</artifactId>
- <version>4.2.6</version>
- <scope>compile</scope>
- <type>pom</type>
- </dependency>
- <dependency>
- <groupId>edu.washington.cs.knowitall</groupId>
- <artifactId>openregex</artifactId>
- <version>1.1.1</version>
- <scope>runtime</scope>
- </dependency>
- <dependency>
- <groupId>junit</groupId>
- <artifactId>junit</artifactId>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-log4j12</artifactId>
- <scope>test</scope>
- </dependency>
- </dependencies>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-surefire-plugin</artifactId>
- <configuration>
- <skipTests>true</skipTests>
- </configuration>
- </plugin>
- </plugins>
- <pluginManagement>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-assembly-plugin</artifactId>
- <version>${maven-assembly-plugin.version}</version>
- <executions>
- <execution>
- <id>assembly</id>
- <phase>package</phase>
- <goals>
- <goal>single</goal>
- </goals>
- </execution>
- </executions>
- <configuration>
- <attach>true</attach>
- <skipAssembly>true</skipAssembly>
- <tarLongFileMode>gnu</tarLongFileMode>
- </configuration>
- </plugin>
- </plugins>
- </pluginManagement>
- </build>
-
- <profiles>
- <profile>
- <id>release</id>
- <build>
- <resources>
- <resource>
- <directory>${basedir}/../</directory>
- <targetPath>${project.build.directory}/apidocs/META-INF</targetPath>
- <includes>
- <include>LICENSE.txt</include>
- <include>NOTICE.txt</include>
- </includes>
- </resource>
- </resources>
- </build>
- </profile>
-
- </profiles>
-
-</project>
http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/openie/src/main/java/org/apache/any23/extractor/openie/OpenIEExtractor.java
----------------------------------------------------------------------
diff --git a/openie/src/main/java/org/apache/any23/extractor/openie/OpenIEExtractor.java b/openie/src/main/java/org/apache/any23/extractor/openie/OpenIEExtractor.java
deleted file mode 100644
index 812ed9c..0000000
--- a/openie/src/main/java/org/apache/any23/extractor/openie/OpenIEExtractor.java
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.any23.extractor.openie;
-
-import java.io.IOException;
-import java.util.List;
-
-import javax.xml.transform.TransformerConfigurationException;
-import javax.xml.transform.TransformerFactoryConfigurationError;
-
-import org.apache.any23.extractor.Extractor;
-import org.apache.any23.configuration.Configuration;
-import org.apache.any23.configuration.DefaultConfiguration;
-import org.apache.any23.extractor.ExtractionContext;
-import org.apache.any23.extractor.ExtractorDescription;
-import org.apache.any23.rdf.RDFUtils;
-import org.apache.any23.util.StreamUtils;
-import org.apache.tika.Tika;
-import org.apache.tika.exception.TikaException;
-import org.eclipse.rdf4j.model.IRI;
-import org.eclipse.rdf4j.model.Resource;
-import org.eclipse.rdf4j.model.Value;
-import org.eclipse.rdf4j.model.vocabulary.RDF;
-import org.eclipse.rdf4j.model.vocabulary.RDFS;
-import org.apache.any23.extractor.ExtractionException;
-import org.apache.any23.extractor.ExtractionParameters;
-import org.apache.any23.extractor.ExtractionResult;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.w3c.dom.Document;
-
-import edu.knowitall.openie.Argument;
-import edu.knowitall.openie.Instance;
-import edu.knowitall.openie.OpenIE;
-import edu.knowitall.tool.parse.ClearParser;
-import edu.knowitall.tool.postag.ClearPostagger;
-import edu.knowitall.tool.srl.ClearSrl;
-import edu.knowitall.tool.tokenize.ClearTokenizer;
-import scala.collection.JavaConversions;
-import scala.collection.Seq;
-
-/**
- * An <a href="https://github.com/allenai/openie-standalone">OpenIE</a>
- * extractor able to generate <i>RDF</i> statements from
- * sentences representing relations in the text.
- */
-public class OpenIEExtractor implements Extractor.TagSoupDOMExtractor {
-
- private static final Logger LOG = LoggerFactory.getLogger(OpenIEExtractor.class);
-
- private IRI documentRoot;
-
- /**
- * default constructor
- */
- public OpenIEExtractor() {
- // default constructor
- }
-
- /**
- * @see org.apache.any23.extractor.Extractor#getDescription()
- */
- @Override
- public ExtractorDescription getDescription() {
- return OpenIEExtractorFactory.getDescriptionInstance();
- }
-
- @Override
- public void run(ExtractionParameters extractionParameters,
- ExtractionContext context, Document in, ExtractionResult out)
- throws IOException, ExtractionException {
-
- IRI documentIRI = context.getDocumentIRI();
- documentRoot = RDFUtils.iri(documentIRI.toString() + "root");
- out.writeNamespace(RDF.PREFIX, RDF.NAMESPACE);
- out.writeNamespace(RDFS.PREFIX, RDFS.NAMESPACE);
- LOG.debug("Processing: {}", documentIRI.toString());
-
- OpenIE openIE = new OpenIE(
- new ClearParser(
- new ClearPostagger(
- new ClearTokenizer())), new ClearSrl(), false, false);
-
- Seq<Instance> extractions = null;
- Tika tika = new Tika();
- try {
- extractions = openIE.extract(tika.parseToString(StreamUtils.documentToInputStream(in)));
- } catch (TransformerConfigurationException | TransformerFactoryConfigurationError e) {
- LOG.error("Encountered error during OpenIE extraction.", e);
- } catch (TikaException e) {
- LOG.error("Encountered error whilst parsing InputStream with Tika.", e);
- }
-
- List<Instance> listExtractions = JavaConversions.seqAsJavaList(extractions);
- // for each extraction instance we can obtain a number of extraction elements
- // instance.confidence() - a confidence value for the extraction itself
- // instance.extr().context() - an optional representation of the context for this extraction
- // instance.extr().arg1().text() - subject
- // instance.extr().rel().text() - predicate
- // instance.extr().arg2s().text() - object
- final Configuration immutableConf = DefaultConfiguration.singleton();
- Double threshold = Double.parseDouble(immutableConf.getProperty("any23.extraction.openie.confidence.threshold", "0.5"));
- for(Instance instance : listExtractions) {
- if (instance.confidence() > threshold) {
- List<Argument> listArg2s = JavaConversions.seqAsJavaList(instance.extr().arg2s());
- for(Argument argument : listArg2s) {
- Resource subject = RDFUtils.makeIRI(instance.extr().arg1().text(), documentIRI);
- IRI predicate = (IRI) RDFUtils.makeIRI(instance.extr().rel().text(), documentIRI);
- Value object = RDFUtils.toValue(argument.text());
- out.writeTriple(subject, predicate, object);
- }
- }
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/openie/src/main/java/org/apache/any23/extractor/openie/OpenIEExtractorFactory.java
----------------------------------------------------------------------
diff --git a/openie/src/main/java/org/apache/any23/extractor/openie/OpenIEExtractorFactory.java b/openie/src/main/java/org/apache/any23/extractor/openie/OpenIEExtractorFactory.java
deleted file mode 100644
index 31760d2..0000000
--- a/openie/src/main/java/org/apache/any23/extractor/openie/OpenIEExtractorFactory.java
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.any23.extractor.openie;
-
-import java.util.Arrays;
-
-import org.apache.any23.extractor.ExtractorDescription;
-import org.apache.any23.extractor.ExtractorFactory;
-import org.apache.any23.extractor.SimpleExtractorFactory;
-import org.apache.any23.rdf.Prefixes;
-
-/**
- * @author lewismc
- *
- */
-public class OpenIEExtractorFactory extends SimpleExtractorFactory<OpenIEExtractor>
- implements ExtractorFactory<OpenIEExtractor> {
-
- public static final String NAME = "openie";
-
- public static final Prefixes prefixes = null;
-
- private static final ExtractorDescription descriptionInstance = new OpenIEExtractorFactory();
-
- public OpenIEExtractorFactory() {
- super(NAME, prefixes, Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"), "example-openie.html");
- }
-
- @Override
- public OpenIEExtractor createExtractor() {
- return new OpenIEExtractor();
- }
-
- public static ExtractorDescription getDescriptionInstance() {
- return descriptionInstance;
- }
-
-}
http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/openie/src/main/resources/META-INF/services/org.apache.any23.extractor.ExtractorFactory
----------------------------------------------------------------------
diff --git a/openie/src/main/resources/META-INF/services/org.apache.any23.extractor.ExtractorFactory b/openie/src/main/resources/META-INF/services/org.apache.any23.extractor.ExtractorFactory
deleted file mode 100644
index 4faf7ce..0000000
--- a/openie/src/main/resources/META-INF/services/org.apache.any23.extractor.ExtractorFactory
+++ /dev/null
@@ -1 +0,0 @@
-org.apache.any23.extractor.openie.OpenIEExtractorFactory
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/openie/src/test/java/org/apache/any23/openie/OpenIEExtractorTest.java
----------------------------------------------------------------------
diff --git a/openie/src/test/java/org/apache/any23/openie/OpenIEExtractorTest.java b/openie/src/test/java/org/apache/any23/openie/OpenIEExtractorTest.java
deleted file mode 100644
index 9455311..0000000
--- a/openie/src/test/java/org/apache/any23/openie/OpenIEExtractorTest.java
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.any23.openie;
-
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.IOException;
-
-import org.apache.any23.extractor.ExtractionContext;
-import org.apache.any23.extractor.ExtractionException;
-import org.apache.any23.extractor.ExtractionParameters;
-import org.apache.any23.extractor.ExtractionResult;
-import org.apache.any23.extractor.ExtractionResultImpl;
-import org.apache.any23.extractor.openie.OpenIEExtractor;
-import org.apache.any23.rdf.RDFUtils;
-import org.apache.any23.util.StreamUtils;
-import org.apache.any23.writer.RDFXMLWriter;
-import org.apache.any23.writer.TripleHandler;
-import org.apache.any23.writer.TripleHandlerException;
-import org.eclipse.rdf4j.model.IRI;
-import org.junit.After;
-import org.junit.Before;
-import org.junit.Test;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * @author lewismc
- *
- */
-public class OpenIEExtractorTest {
-
- private static final Logger logger = LoggerFactory.getLogger(OpenIEExtractorTest.class);
-
- private OpenIEExtractor extractor;
-
- @Before
- public void setUp() throws Exception {
- extractor = new OpenIEExtractor();
- }
-
- @After
- public void tearDown() throws Exception {
- extractor = null;
- }
-
- @Test
- public void testExtractFromHTMLDocument()
- throws IOException, ExtractionException, TripleHandlerException {
- final IRI uri = RDFUtils.iri("http://podaac.jpl.nasa.gov/aquarius");
- extract(uri, "/org/apache/any23/extractor/openie/example-openie.html");
- }
-
- public void extract(IRI uri, String filePath)
- throws IOException, ExtractionException, TripleHandlerException {
- FileOutputStream fos = new FileOutputStream(File.createTempFile("OpenIEExtractorTest", "tmp"));
- final TripleHandler tHandler = new RDFXMLWriter(fos);
- final ExtractionContext extractionContext = new ExtractionContext("rdf-openie", uri);
- final ExtractionResult result = new ExtractionResultImpl(extractionContext, extractor, tHandler);
- try {
- extractor.run(
- ExtractionParameters.newDefault(),
- extractionContext,
- StreamUtils.inputStreamToDocument(this.getClass().getResourceAsStream(filePath)),
- result
- );
- } finally {
- logger.debug(fos.toString());
- tHandler.close();
- result.close();
- }
- }
-
-}
http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/plugins/html-scraper/src/main/java/org/apache/any23/plugin/htmlscraper/HTMLScraperExtractor.java
----------------------------------------------------------------------
diff --git a/plugins/html-scraper/src/main/java/org/apache/any23/plugin/htmlscraper/HTMLScraperExtractor.java b/plugins/html-scraper/src/main/java/org/apache/any23/plugin/htmlscraper/HTMLScraperExtractor.java
index ab7d34a..94a3210 100644
--- a/plugins/html-scraper/src/main/java/org/apache/any23/plugin/htmlscraper/HTMLScraperExtractor.java
+++ b/plugins/html-scraper/src/main/java/org/apache/any23/plugin/htmlscraper/HTMLScraperExtractor.java
@@ -46,16 +46,16 @@ import java.util.List;
*/
public class HTMLScraperExtractor implements Extractor.ContentExtractor {
- public final static IRI PAGE_CONTENT_DE_PROPERTY =
+ public static final IRI PAGE_CONTENT_DE_PROPERTY =
SimpleValueFactory.getInstance().createIRI(SINDICE.NS + "pagecontent/de");
- public final static IRI PAGE_CONTENT_AE_PROPERTY =
+ public static final IRI PAGE_CONTENT_AE_PROPERTY =
SimpleValueFactory.getInstance().createIRI(SINDICE.NS + "pagecontent/ae");
- public final static IRI PAGE_CONTENT_LCE_PROPERTY =
+ public static final IRI PAGE_CONTENT_LCE_PROPERTY =
SimpleValueFactory.getInstance().createIRI(SINDICE.NS + "pagecontent/lce");
- public final static IRI PAGE_CONTENT_CE_PROPERTY =
+ public static final IRI PAGE_CONTENT_CE_PROPERTY =
SimpleValueFactory.getInstance().createIRI(SINDICE.NS + "pagecontent/ce");
- private final List<ExtractionRule> extractionRules = new ArrayList<ExtractionRule>();
+ private final List<ExtractionRule> extractionRules = new ArrayList<>();
public HTMLScraperExtractor() {
loadDefaultRules();
@@ -66,7 +66,7 @@ public class HTMLScraperExtractor implements Extractor.ContentExtractor {
}
public String[] getTextExtractors() {
- final List<String> extractors = new ArrayList<String>();
+ final List<String> extractors = new ArrayList<>();
for(ExtractionRule er : extractionRules) {
extractors.add(er.name);
}
http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/plugins/integration-test/pom.xml
----------------------------------------------------------------------
diff --git a/plugins/integration-test/pom.xml b/plugins/integration-test/pom.xml
index ab062cd..c99a7e0 100644
--- a/plugins/integration-test/pom.xml
+++ b/plugins/integration-test/pom.xml
@@ -57,6 +57,11 @@
<artifactId>apache-any23-basic-crawler</artifactId>
<version>1.0.6-SNAPSHOT</version>
</dependency>
+ <dependency>
+ <groupId>org.apache.any23.plugins</groupId>
+ <artifactId>apache-any23-openie</artifactId>
+ <version>${project.parent.version}</version>
+ </dependency>
<!-- BEGIN: Test Dependencies -->
<dependency>
http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/plugins/integration-test/src/test/java/org/apache/any23/plugin/PluginIT.java
----------------------------------------------------------------------
diff --git a/plugins/integration-test/src/test/java/org/apache/any23/plugin/PluginIT.java b/plugins/integration-test/src/test/java/org/apache/any23/plugin/PluginIT.java
index 1b69463..e8e4505 100644
--- a/plugins/integration-test/src/test/java/org/apache/any23/plugin/PluginIT.java
+++ b/plugins/integration-test/src/test/java/org/apache/any23/plugin/PluginIT.java
@@ -56,6 +56,9 @@ public class PluginIT {
private static final File CRAWLER_TARGET_DIR = new File(PLUGIN_DIR + "basic-crawler/target/classes");
private static final File CRAWLER_DEPENDENCY_DIR = new File(PLUGIN_DIR + "basic-crawler/target/dependency");
+ private static final File OPENIE_TARGET_DIR = new File(PLUGIN_DIR + "openie/target/classes");
+ private static final File OPENIE_DEPENDENCY_DIR = new File(PLUGIN_DIR + "openie/target/dependency");
+
private Any23PluginManager manager;
@Before
@@ -79,13 +82,15 @@ public class PluginIT {
public void testDetectExtractorPlugins() throws IOException, InstantiationException, IllegalAccessException {
final ExtractorGroup extractorGroup = manager.getApplicableExtractors(
new ExtractorRegistryImpl(),
- HTML_SCRAPER_TARGET_DIR, // Required to satisfy class dependencies.
+ HTML_SCRAPER_TARGET_DIR,
HTML_SCRAPER_DEPENDENCY_DIR,
OFFICE_SCRAPER_TARGET_DIR,
- OFFICE_SCRAPER_DEPENDENCY_DIR // Required to satisfy class dependencies.
+ OFFICE_SCRAPER_DEPENDENCY_DIR,
+ OPENIE_TARGET_DIR,
+ OPENIE_DEPENDENCY_DIR
);
try {
- Class.forName("org.apache.any23.extractor.openie.OpenIEExtractor", false, this.getClass().getClassLoader());
+ Class.forName("org.apache.any23.plugin.extractor.openie.OpenIEExtractor", false, this.getClass().getClassLoader());
assertEquals("Did not find the number of expected extractors", NUM_OF_EXTRACTORS_INCL_OPENIE ,
extractorGroup.getNumOfExtractors()
);
http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/plugins/openie/pom.xml
----------------------------------------------------------------------
diff --git a/plugins/openie/pom.xml b/plugins/openie/pom.xml
new file mode 100644
index 0000000..64c6806
--- /dev/null
+++ b/plugins/openie/pom.xml
@@ -0,0 +1,165 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.any23</groupId>
+ <artifactId>apache-any23</artifactId>
+ <version>2.2-SNAPSHOT</version>
+ <relativePath>../../pom.xml</relativePath>
+ </parent>
+
+ <groupId>org.apache.any23.plugins</groupId>
+ <artifactId>apache-any23-openie</artifactId>
+
+ <name>Apache Any23 :: Plugins :: OpenIE</name>
+ <description>Open Information Extraction module.</description>
+
+ <repositories>
+ <repository>
+ <snapshots>
+ <enabled>false</enabled>
+ </snapshots>
+ <id>bintray-allenai-maven</id>
+ <name>bintray</name>
+ <url>http://allenai.bintray.com/maven</url>
+ </repository>
+ </repositories>
+ <pluginRepositories>
+ <pluginRepository>
+ <snapshots>
+ <enabled>false</enabled>
+ </snapshots>
+ <id>bintray-allenai-maven</id>
+ <name>bintray-plugins</name>
+ <url>http://allenai.bintray.com/maven</url>
+ </pluginRepository>
+ </pluginRepositories>
+
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.any23</groupId>
+ <artifactId>apache-any23-core</artifactId>
+ <version>${project.version}</version>
+ <scope>provided</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.any23</groupId>
+ <artifactId>apache-any23-test-resources</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ <type>test-jar</type>
+ </dependency>
+ <dependency>
+ <groupId>org.allenai.openie</groupId>
+ <artifactId>openie_2.11</artifactId>
+ <version>${openie_2.11.version}</version>
+ <scope>compile</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.allenai.openie</groupId>
+ <artifactId>openie_2.11</artifactId>
+ <version>${openie_2.11.version}</version>
+ <scope>compile</scope>
+ <type>pom</type>
+ </dependency>
+ <dependency>
+ <groupId>edu.washington.cs.knowitall</groupId>
+ <artifactId>openregex</artifactId>
+ <version>${openregex.version}</version>
+ <scope>runtime</scope>
+ </dependency>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-log4j12</artifactId>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-surefire-plugin</artifactId>
+ <configuration>
+ <skipTests>true</skipTests>
+ </configuration>
+ </plugin>
+ <!-- Generates the distribution package -->
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-assembly-plugin</artifactId>
+ <configuration>
+ <appendAssemblyId>false</appendAssemblyId>
+ <descriptors>
+ <descriptor>${basedir}/src/main/assembly/bin.xml</descriptor>
+ </descriptors>
+ </configuration>
+ </plugin>
+ </plugins>
+ <pluginManagement>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-assembly-plugin</artifactId>
+ <version>${maven-assembly-plugin.version}</version>
+ <executions>
+ <execution>
+ <id>assembly</id>
+ <phase>package</phase>
+ <goals>
+ <goal>single</goal>
+ </goals>
+ </execution>
+ </executions>
+ <configuration>
+ <attach>true</attach>
+ <skipAssembly>true</skipAssembly>
+ <tarLongFileMode>gnu</tarLongFileMode>
+ </configuration>
+ </plugin>
+ </plugins>
+ </pluginManagement>
+ </build>
+
+ <profiles>
+ <profile>
+ <id>release</id>
+ <build>
+ <resources>
+ <resource>
+ <directory>${basedir}/../</directory>
+ <targetPath>${project.build.directory}/apidocs/META-INF</targetPath>
+ <includes>
+ <include>LICENSE.txt</include>
+ <include>NOTICE.txt</include>
+ </includes>
+ </resource>
+ </resources>
+ </build>
+ </profile>
+
+ </profiles>
+
+</project>
http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/plugins/openie/src/main/java/org/apache/any23/plugin/extractor/openie/OpenIEExtractor.java
----------------------------------------------------------------------
diff --git a/plugins/openie/src/main/java/org/apache/any23/plugin/extractor/openie/OpenIEExtractor.java b/plugins/openie/src/main/java/org/apache/any23/plugin/extractor/openie/OpenIEExtractor.java
new file mode 100644
index 0000000..1b6a9cf
--- /dev/null
+++ b/plugins/openie/src/main/java/org/apache/any23/plugin/extractor/openie/OpenIEExtractor.java
@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.any23.plugin.extractor.openie;
+
+import java.io.IOException;
+import java.util.List;
+
+import javax.xml.transform.TransformerConfigurationException;
+import javax.xml.transform.TransformerFactoryConfigurationError;
+
+import org.apache.any23.extractor.Extractor;
+import org.apache.any23.configuration.Configuration;
+import org.apache.any23.configuration.DefaultConfiguration;
+import org.apache.any23.extractor.ExtractionContext;
+import org.apache.any23.extractor.ExtractorDescription;
+import org.apache.any23.extractor.ExtractorFactory;
+import org.apache.any23.plugin.Author;
+import org.apache.any23.plugin.ExtractorPlugin;
+import org.apache.any23.rdf.RDFUtils;
+import org.apache.any23.util.StreamUtils;
+import org.apache.tika.Tika;
+import org.apache.tika.exception.TikaException;
+import org.eclipse.rdf4j.model.IRI;
+import org.eclipse.rdf4j.model.Resource;
+import org.eclipse.rdf4j.model.Value;
+import org.eclipse.rdf4j.model.vocabulary.RDF;
+import org.eclipse.rdf4j.model.vocabulary.RDFS;
+import org.apache.any23.extractor.ExtractionException;
+import org.apache.any23.extractor.ExtractionParameters;
+import org.apache.any23.extractor.ExtractionResult;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.Document;
+
+import edu.knowitall.openie.Argument;
+import edu.knowitall.openie.Instance;
+import edu.knowitall.openie.OpenIE;
+import edu.knowitall.tool.parse.ClearParser;
+import edu.knowitall.tool.postag.ClearPostagger;
+import edu.knowitall.tool.srl.ClearSrl;
+import edu.knowitall.tool.tokenize.ClearTokenizer;
+import scala.collection.JavaConversions;
+import scala.collection.Seq;
+
+/**
+ * An <a href="https://github.com/allenai/openie-standalone">OpenIE</a>
+ * extractor able to generate <i>RDF</i> statements from
+ * sentences representing relations in the text.
+ *
+ * <p>The DOM passed to {@link #run} is serialized, reduced to plain text with
+ * Tika and handed to OpenIE. Every extraction whose confidence exceeds the
+ * configured threshold is written as one triple per second argument.</p>
+ */
+@Author(name="Lewis John McGibbney (lewismc@apache.org)")
+public class OpenIEExtractor implements Extractor.TagSoupDOMExtractor, ExtractorPlugin {
+
+    private static final Logger LOG = LoggerFactory.getLogger(OpenIEExtractor.class);
+
+    /** Configuration key holding the minimum confidence an extraction must exceed. */
+    private static final String CONFIDENCE_THRESHOLD_KEY = "any23.extraction.openie.confidence.threshold";
+
+    /** Threshold used when {@link #CONFIDENCE_THRESHOLD_KEY} is not configured. */
+    private static final String DEFAULT_CONFIDENCE_THRESHOLD = "0.5";
+
+    /**
+     * default constructor
+     */
+    public OpenIEExtractor() {
+        // default constructor
+    }
+
+    /**
+     * @see org.apache.any23.extractor.Extractor#getDescription()
+     */
+    @Override
+    public ExtractorDescription getDescription() {
+        return OpenIEExtractorFactory.getDescriptionInstance();
+    }
+
+    /**
+     * Runs OpenIE over the textual content of the given document and writes
+     * one triple per (subject, relation, second argument) of every extraction
+     * whose confidence is above the configured threshold.
+     *
+     * <p>If the document cannot be converted to text, the failure is logged
+     * and no triples are written.</p>
+     *
+     * @param extractionParameters extraction parameters (not read by this extractor)
+     * @param context supplies the IRI of the document being processed
+     * @param in the DOM of the document to extract from
+     * @param out receives the namespaces and the generated triples
+     * @throws IOException declared by the extractor contract
+     * @throws ExtractionException declared by the extractor contract
+     */
+    @Override
+    public void run(ExtractionParameters extractionParameters,
+            ExtractionContext context, Document in, ExtractionResult out)
+            throws IOException, ExtractionException {
+
+        final IRI documentIRI = context.getDocumentIRI();
+        out.writeNamespace(RDF.PREFIX, RDF.NAMESPACE);
+        out.writeNamespace(RDFS.PREFIX, RDFS.NAMESPACE);
+        LOG.debug("Processing: {}", documentIRI);
+
+        final OpenIE openIE = new OpenIE(
+                new ClearParser(
+                        new ClearPostagger(
+                                new ClearTokenizer())), new ClearSrl(), false, false);
+
+        final Seq<Instance> extractions;
+        final Tika tika = new Tika();
+        try {
+            extractions = openIE.extract(tika.parseToString(StreamUtils.documentToInputStream(in)));
+        } catch (TransformerConfigurationException | TransformerFactoryConfigurationError e) {
+            LOG.error("Encountered error during OpenIE extraction.", e);
+            // Nothing was extracted: stop here instead of iterating a null sequence.
+            return;
+        } catch (TikaException e) {
+            LOG.error("Encountered error whilst parsing InputStream with Tika.", e);
+            // Nothing was extracted: stop here instead of iterating a null sequence.
+            return;
+        }
+
+        final List<Instance> listExtractions = JavaConversions.seqAsJavaList(extractions);
+        // for each extraction instance we can obtain a number of extraction elements
+        // instance.confidence() - a confidence value for the extraction itself
+        // instance.extr().context() - an optional representation of the context for this extraction
+        // instance.extr().arg1().text() - subject
+        // instance.extr().rel().text() - predicate
+        // instance.extr().arg2s().text() - object
+        final Configuration immutableConf = DefaultConfiguration.singleton();
+        final double threshold = Double.parseDouble(
+                immutableConf.getProperty(CONFIDENCE_THRESHOLD_KEY, DEFAULT_CONFIDENCE_THRESHOLD));
+        for (Instance instance : listExtractions) {
+            if (instance.confidence() > threshold) {
+                final List<Argument> listArg2s = JavaConversions.seqAsJavaList(instance.extr().arg2s());
+                for (Argument argument : listArg2s) {
+                    Resource subject = RDFUtils.makeIRI(instance.extr().arg1().text(), documentIRI);
+                    IRI predicate = (IRI) RDFUtils.makeIRI(instance.extr().rel().text(), documentIRI);
+                    Value object = RDFUtils.toValue(argument.text());
+                    out.writeTriple(subject, predicate, object);
+                }
+            }
+        }
+    }
+
+    /**
+     * Returns the factory for this extractor. The shared description instance
+     * exposed by {@link OpenIEExtractorFactory} is cast to the factory type.
+     */
+    @Override
+    public ExtractorFactory<?> getExtractorFactory() {
+        return (ExtractorFactory<?>) OpenIEExtractorFactory.getDescriptionInstance();
+    }
+}
http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/plugins/openie/src/main/java/org/apache/any23/plugin/extractor/openie/OpenIEExtractorFactory.java
----------------------------------------------------------------------
diff --git a/plugins/openie/src/main/java/org/apache/any23/plugin/extractor/openie/OpenIEExtractorFactory.java b/plugins/openie/src/main/java/org/apache/any23/plugin/extractor/openie/OpenIEExtractorFactory.java
new file mode 100644
index 0000000..1c86c62
--- /dev/null
+++ b/plugins/openie/src/main/java/org/apache/any23/plugin/extractor/openie/OpenIEExtractorFactory.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.any23.plugin.extractor.openie;
+
+import java.util.Arrays;
+
+import org.apache.any23.extractor.ExtractorDescription;
+import org.apache.any23.extractor.ExtractorFactory;
+import org.apache.any23.extractor.SimpleExtractorFactory;
+import org.apache.any23.rdf.Prefixes;
+
+/**
+ * Factory and description for the {@link OpenIEExtractor}: registers it under
+ * the name {@value #NAME} for HTML and XHTML content.
+ *
+ * @author lewismc
+ */
+public class OpenIEExtractorFactory extends SimpleExtractorFactory<OpenIEExtractor>
+        implements ExtractorFactory<OpenIEExtractor> {
+
+    /** Name under which the extractor is registered. */
+    public static final String NAME = "openie";
+
+    /** No prefixes are declared for this extractor. */
+    public static final Prefixes prefixes = null;
+
+    /** Shared instance returned by {@link #getDescriptionInstance()}. */
+    private static final ExtractorDescription descriptionInstance = new OpenIEExtractorFactory();
+
+    /**
+     * Creates the factory with its name, prefixes, supported MIME types and
+     * example input file name.
+     */
+    public OpenIEExtractorFactory() {
+        super(
+                NAME,
+                prefixes,
+                Arrays.asList(
+                        "text/html;q=0.1",
+                        "application/xhtml+xml;q=0.1"),
+                "example-openie.html");
+    }
+
+    /**
+     * @return the shared description instance of this factory.
+     */
+    public static ExtractorDescription getDescriptionInstance() {
+        return descriptionInstance;
+    }
+
+    /**
+     * @return a new {@link OpenIEExtractor} on every call.
+     */
+    @Override
+    public OpenIEExtractor createExtractor() {
+        return new OpenIEExtractor();
+    }
+
+}
http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/plugins/openie/src/main/resources/META-INF/services/org.apache.any23.extractor.ExtractorFactory
----------------------------------------------------------------------
diff --git a/plugins/openie/src/main/resources/META-INF/services/org.apache.any23.extractor.ExtractorFactory b/plugins/openie/src/main/resources/META-INF/services/org.apache.any23.extractor.ExtractorFactory
new file mode 100644
index 0000000..10ebf16
--- /dev/null
+++ b/plugins/openie/src/main/resources/META-INF/services/org.apache.any23.extractor.ExtractorFactory
@@ -0,0 +1 @@
+org.apache.any23.plugin.extractor.openie.OpenIEExtractorFactory
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/plugins/openie/src/test/java/org/apache/any23/openie/OpenIEExtractorTest.java
----------------------------------------------------------------------
diff --git a/plugins/openie/src/test/java/org/apache/any23/openie/OpenIEExtractorTest.java b/plugins/openie/src/test/java/org/apache/any23/openie/OpenIEExtractorTest.java
new file mode 100644
index 0000000..dcc4e8f
--- /dev/null
+++ b/plugins/openie/src/test/java/org/apache/any23/openie/OpenIEExtractorTest.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.any23.openie;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+
+import org.apache.any23.extractor.ExtractionContext;
+import org.apache.any23.extractor.ExtractionException;
+import org.apache.any23.extractor.ExtractionParameters;
+import org.apache.any23.extractor.ExtractionResult;
+import org.apache.any23.extractor.ExtractionResultImpl;
+import org.apache.any23.plugin.extractor.openie.OpenIEExtractor;
+import org.apache.any23.rdf.RDFUtils;
+import org.apache.any23.util.StreamUtils;
+import org.apache.any23.writer.RDFXMLWriter;
+import org.apache.any23.writer.TripleHandler;
+import org.apache.any23.writer.TripleHandlerException;
+import org.eclipse.rdf4j.model.IRI;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Smoke test for the {@link OpenIEExtractor}: runs the extractor over a sample
+ * HTML document and serializes the produced triples as RDF/XML into a
+ * temporary file.
+ *
+ * @author lewismc
+ */
+public class OpenIEExtractorTest {
+
+    private static final Logger logger = LoggerFactory.getLogger(OpenIEExtractorTest.class);
+
+    private OpenIEExtractor extractor;
+
+    @Before
+    public void setUp() throws Exception {
+        extractor = new OpenIEExtractor();
+    }
+
+    @After
+    public void tearDown() throws Exception {
+        extractor = null;
+    }
+
+    @Test
+    public void testExtractFromHTMLDocument()
+            throws IOException, ExtractionException, TripleHandlerException {
+        final IRI uri = RDFUtils.iri("http://podaac.jpl.nasa.gov/aquarius");
+        extract(uri, "/org/apache/any23/extractor/openie/example-openie.html");
+    }
+
+    /**
+     * Runs the extractor over the classpath resource {@code filePath}, using
+     * {@code uri} as the document IRI, and writes the result as RDF/XML to a
+     * temporary file.
+     *
+     * @param uri the IRI to associate with the document
+     * @param filePath classpath location of the HTML document to process
+     * @throws IOException if the temporary file or the document cannot be read or written
+     * @throws ExtractionException if the extractor fails
+     * @throws TripleHandlerException if the RDF/XML writer cannot be closed
+     */
+    public void extract(IRI uri, String filePath)
+            throws IOException, ExtractionException, TripleHandlerException {
+        final File output = File.createTempFile("OpenIEExtractorTest", "tmp");
+        // Do not leave the temporary file behind once the JVM exits.
+        output.deleteOnExit();
+        // try-with-resources guarantees the stream is released even when extraction fails.
+        try (FileOutputStream fos = new FileOutputStream(output)) {
+            final TripleHandler tHandler = new RDFXMLWriter(fos);
+            final ExtractionContext extractionContext = new ExtractionContext("rdf-openie", uri);
+            final ExtractionResult result = new ExtractionResultImpl(extractionContext, extractor, tHandler);
+            try {
+                extractor.run(
+                        ExtractionParameters.newDefault(),
+                        extractionContext,
+                        StreamUtils.inputStreamToDocument(this.getClass().getResourceAsStream(filePath)),
+                        result
+                );
+            } finally {
+                logger.debug("RDF/XML output written to {}", output);
+                try {
+                    tHandler.close();
+                } finally {
+                    // Close the result even if closing the handler failed.
+                    result.close();
+                }
+            }
+        }
+    }
+
+}
http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 9f69936..df1059e 100644
--- a/pom.xml
+++ b/pom.xml
@@ -204,10 +204,10 @@
<module>encoding</module>
<module>core</module>
<module>cli</module>
- <module>openie</module>
<module>plugins/basic-crawler</module>
<module>plugins/html-scraper</module>
<module>plugins/office-scraper</module>
+ <module>plugins/openie</module>
<module>plugins/integration-test</module>
<module>service</module>
</modules>
@@ -248,6 +248,8 @@
<semargl.version>0.7</semargl.version>
<slf4j.logger.version>1.7.25</slf4j.logger.version>
<tika.version>1.17</tika.version>
+ <openie_2.11.version>4.2.6</openie_2.11.version>
+ <openregex.version>1.1.1</openregex.version>
<!-- Overridden in profiles to add JDK specific arguments to surefire -->
<surefire-extra-args />
@@ -270,7 +272,7 @@
<buildnumber-maven-plugin.version>1.4</buildnumber-maven-plugin.version>
<maven-compiler-plugin.version>3.6.1</maven-compiler-plugin.version>
<maven-jar-plugin.version>3.0.2</maven-jar-plugin.version>
- <maven-surefire-plugin.version>2.20</maven-surefire-plugin.version>
+ <maven-surefire-plugin.version>2.20.1</maven-surefire-plugin.version>
<jacoco-maven-plugin.version>0.7.9</jacoco-maven-plugin.version>
<maven-site-plugin.version>3.6</maven-site-plugin.version>
<maven-changes-plugin.version>2.12.1</maven-changes-plugin.version>
http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/service/README.md
----------------------------------------------------------------------
diff --git a/service/README.md b/service/README.md
new file mode 100644
index 0000000..0de9b8a
--- /dev/null
+++ b/service/README.md
@@ -0,0 +1,49 @@
+# Any23 Web Service
+
+This is the root dir of the Any23 Web-Service module.
+
+Apache Any23 provides a Web-Service that can be used to extract RDF from Web documents.
+
+## Generate Web-Service Packaging
+
+To generate the desired Web-service package, execute 'mvn package' from this directory.
+
+```
+$ cd $ANY23-HOME/service
+$ mvn package
+```
+From this directory it generates roughly the following...
+```
+.
+├── pom.xml
+├── README.md
+├── src
+│ ├── main
+│ │ ├── assembly
+│ │ ├── bin
+│ │ ├── java
+│ │ ├── resources
+│ │ └── webapp
+│ └── test
+│ ├── java
+│ └── resources
+└── target
+ ├── any23-service-${version}.war
+ ├── any23-service-${version}-without-deps.war
+ ├── apache-any23-service-${version}-bin-server-embedded.tar.gz <<<
+ ├── apache-any23-service-${version}-bin-server-embedded.zip <<<
+ ├── apache-any23-service-${version}-bin.tar.gz <<<
+ ├── apache-any23-service-${version}-bin-without-deps.tar.gz <<<
+ ├── apache-any23-service-${version}-bin-without-deps.zip <<<
+ ├── apache-any23-service-${version}-bin.zip <<<
+ ├── archive-tmp
+ ├── classes
+ ├── generated-sources
+ ├── maven-archiver
+ ├── maven-shared-archive-resources
+ ├── surefire
+ ├── surefire-reports
+ └── test-classes
+```
+
+Specific READMEs for each of the artifacts can be found inside the ./target/*.tar.gz and ./target/*.zip archives (annotated above with '<<<'), which contain much more detailed information.
http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/service/README.txt
----------------------------------------------------------------------
diff --git a/service/README.txt b/service/README.txt
deleted file mode 100644
index a4d26d0..0000000
--- a/service/README.txt
+++ /dev/null
@@ -1,50 +0,0 @@
-=============
-Any23 Web Service
-=============
-
-This is the root dir of the Any23 Web-Service module.
-
-Apache Any23 provides a Web-Service that can be used to extract RDF from Web documents.
-
-Generate Web-Service Packaging
-===============================
-
-To generate the desired Web-service package, execute 'mvn package' from this directory.
-
-$cd $ANY23-HOME/service
-$ mvn package
-
-From this directory it generates:
-.
-├── pom.xml
-├── README.txt
-├── src
-│ ├── main
-│ │ ├── assembly
-│ │ ├── bin
-│ │ ├── java
-│ │ ├── resources
-│ │ └── webapp
-│ └── test
-│ ├── java
-│ └── resources
-└── target
- ├── any23-service-${version}.war
- ├── any23-service-${version}-without-deps.war
- ├── apache-any23-service-${version}-bin-server-embedded.tar.gz <<<
- ├── apache-any23-service-${version}-bin-server-embedded.zip <<<
- ├── apache-any23-service-${version}-bin.tar.gz <<<
- ├── apache-any23-service-${version}-bin-without-deps.tar.gz <<<
- ├── apache-any23-service-${version}-bin-without-deps.zip <<<
- ├── apache-any23-service-${version}-bin.zip <<<
- ├── archive-tmp
- ├── classes
- ├── generated-sources
- ├── maven-archiver
- ├── maven-shared-archive-resources
- ├── surefire
- ├── surefire-reports
- └── test-classes
-...
-
-Specific README's can be found in either ./target/*.tar.gz || ./target/*.zip (annotated above with '<<<'), where much more detailed information sources can be located.
http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/service/pom.xml
----------------------------------------------------------------------
diff --git a/service/pom.xml b/service/pom.xml
index fe4911f..d5b275f 100644
--- a/service/pom.xml
+++ b/service/pom.xml
@@ -34,15 +34,41 @@
<properties>
<!-- the following property is used in the bash script as well, don't remove it! -->
<jetty.runner.version>8.1.16.v20140903</jetty.runner.version>
+ <output.directory>${project.build.directory}/${project.artifactId}-${project.version}/WEB-INF/lib/apache-any23-openie</output.directory>
</properties>
<dependencies>
- <!-- Core Module -->
+ <!-- Any23 Modules -->
<dependency>
<groupId>org.apache.any23</groupId>
<artifactId>apache-any23-core</artifactId>
<version>${project.version}</version>
</dependency>
+ <dependency>
+ <groupId>org.apache.any23.plugins</groupId>
+ <artifactId>apache-any23-openie</artifactId>
+ <version>${project.version}</version>
+ <scope>provided</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.allenai.openie</groupId>
+ <artifactId>openie_2.11</artifactId>
+ <version>${openie_2.11.version}</version>
+ <scope>provided</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.allenai.openie</groupId>
+ <artifactId>openie_2.11</artifactId>
+ <version>${openie_2.11.version}</version>
+ <scope>provided</scope>
+ <type>pom</type>
+ </dependency>
+ <dependency>
+ <groupId>edu.washington.cs.knowitall</groupId>
+ <artifactId>openregex</artifactId>
+ <version>${openregex.version}</version>
+ <scope>provided</scope>
+ </dependency>
<!-- Logging -->
<dependency>
@@ -181,6 +207,62 @@
</descriptors>
</configuration>
</plugin>
+
+ <!-- Used to provide dynamic OpenIE toggling within service -->
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ <executions>
+ <execution>
+ <id>copy</id>
+ <phase>prepare-package</phase>
+ <goals>
+ <goal>copy</goal>
+ </goals>
+ <configuration>
+ <artifactItems>
+ <artifactItem>
+ <groupId>org.apache.any23.plugins</groupId>
+ <artifactId>apache-any23-openie</artifactId>
+ <version>${project.version}</version>
+ <outputDirectory>${output.directory}</outputDirectory>
+ </artifactItem>
+ <artifactItem>
+ <groupId>org.allenai.openie</groupId>
+ <artifactId>openie_2.11</artifactId>
+ <version>${openie_2.11.version}</version>
+ <outputDirectory>${output.directory}</outputDirectory>
+ </artifactItem>
+ <artifactItem>
+ <groupId>org.allenai.openie</groupId>
+ <artifactId>openie_2.11</artifactId>
+ <version>${openie_2.11.version}</version>
+ <type>pom</type>
+ <outputDirectory>${output.directory}</outputDirectory>
+ </artifactItem>
+ <artifactItem>
+ <groupId>edu.washington.cs.knowitall</groupId>
+ <artifactId>openregex</artifactId>
+ <version>${openregex.version}</version>
+ <outputDirectory>${output.directory}</outputDirectory>
+ </artifactItem>
+ </artifactItems>
+ <!-- other configurations here -->
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-surefire-plugin</artifactId>
+ <configuration>
+ <classpathDependencyExcludes>
+ <classpathDependencyExclude>org.apache.any23.plugins:apache-any23-openie</classpathDependencyExclude>
+ </classpathDependencyExcludes>
+ </configuration>
+ </plugin>
+
</plugins>
</build>
http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/service/src/main/java/org/apache/any23/servlet/Servlet.java
----------------------------------------------------------------------
diff --git a/service/src/main/java/org/apache/any23/servlet/Servlet.java b/service/src/main/java/org/apache/any23/servlet/Servlet.java
index b63d052..1ab542c 100644
--- a/service/src/main/java/org/apache/any23/servlet/Servlet.java
+++ b/service/src/main/java/org/apache/any23/servlet/Servlet.java
@@ -20,6 +20,7 @@ package org.apache.any23.servlet;
import org.apache.any23.configuration.DefaultConfiguration;
import org.apache.any23.extractor.ExtractionParameters;
import org.apache.any23.http.HTTPClient;
+import org.apache.any23.plugin.Any23PluginManager;
import org.apache.any23.servlet.conneg.Any23Negotiator;
import org.apache.any23.servlet.conneg.MediaRangeSpec;
import org.apache.any23.source.ByteArrayDocumentSource;
@@ -35,6 +36,8 @@ import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
+
+import java.io.File;
import java.io.IOException;
import java.net.URISyntaxException;
import java.util.regex.Pattern;
@@ -69,17 +72,43 @@ public class Servlet extends HttpServlet {
final String format = getFormatFromRequestOrNegotiation(req);
final boolean report = isReport(req);
final boolean annotate = isAnnotated(req);
+ final boolean openie = isOpenIE(req);
if (format == null) {
- responder.sendError(406, "Client accept header does not include a supported output format", report);
- return;
+ try {
+ responder.sendError(406, "Client accept header does not include a supported output format", report);
+ return;
+ } catch (IOException e) {
+ LOG.error("Unable to send error for null request format.", e);
+ }
}
final String uri = getInputIRIFromRequest(req);
if (uri == null) {
- responder.sendError(404, "Missing IRI in GET request. Try /format/http://example.com/myfile", report);
- return;
+ try {
+ responder.sendError(404, "Missing IRI in GET request. Try /format/http://example.com/myfile", report);
+ return;
+ } catch (Exception e) {
+ LOG.error("Unable to send error for null request IRI.", e);
+ }
+ }
+ if (openie) {
+ Any23PluginManager pManager = Any23PluginManager.getInstance();
+ //Dynamically adding JARs to the classpath via the following logic
+ //is entirely dependent on the 'apache-any23-openie' directory being
+ //present within the webapp /lib directory. That directory is populated
+ //by the maven-dependency-plugin configuration in the service pom.xml.
+ File webappClasspath = new File(getClass().getClassLoader().getResource("").getPath());
+ File openIEJarPath = new File(webappClasspath.getParentFile().getPath() + "/lib/apache-any23-openie");
+ boolean loadedJars = pManager.loadJARDir(openIEJarPath);
+ if (loadedJars) {
+ LOG.info("Successful dynamic classloading of apache-any23-openie directory from webapp lib.");
+ }
}
final ExtractionParameters eps = getExtractionParameters(req);
- responder.runExtraction(createHTTPDocumentSource(responder, uri, report), eps, format, report, annotate);
+ try {
+ responder.runExtraction(createHTTPDocumentSource(responder, uri, report), eps, format, report, annotate);
+ } catch (IOException e) {
+ LOG.error("Unable to run extraction on HTTPDocumentSource.", e);
+ }
}
@Override
@@ -87,6 +116,7 @@ public class Servlet extends HttpServlet {
final WebResponder responder = new WebResponder(this, resp);
final boolean report = isReport(req);
final boolean annotate = isAnnotated(req);
+ final boolean openie = isOpenIE(req);
if (req.getContentType() == null) {
responder.sendError(400, "Invalid POST request, no Content-Type for the message body specified", report);
return;
@@ -97,6 +127,10 @@ public class Servlet extends HttpServlet {
responder.sendError(406, "Client accept header does not include a supported output format", report);
return;
}
+ if (openie) {
+ Any23PluginManager pManager = Any23PluginManager.getInstance();
+ pManager.loadJARDir(new File(getClass().getResource("apache-any23-openie").getPath()));
+ }
final ExtractionParameters eps = getExtractionParameters(req);
if ("application/x-www-form-urlencoded".equals(getContentTypeHeader(req))) {
if (uri != null) {
@@ -283,4 +317,8 @@ public class Servlet extends HttpServlet {
return request.getParameter("annotate") != null;
}
+    /**
+     * Tells whether OpenIE extraction was requested.
+     *
+     * @param request the incoming HTTP request
+     * @return {@code true} when the request carries an {@code openie}
+     *         parameter, whatever its value; {@code false} otherwise
+     */
+    private boolean isOpenIE(HttpServletRequest request) {
+        final String openieParameter = request.getParameter("openie");
+        return openieParameter != null;
+    }
+
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/service/src/main/resources/form.html
----------------------------------------------------------------------
diff --git a/service/src/main/resources/form.html b/service/src/main/resources/form.html
index 374d017..a5cf937 100644
--- a/service/src/main/resources/form.html
+++ b/service/src/main/resources/form.html
@@ -80,6 +80,15 @@ function showModal( id )
</div>
</div>
<div class="control-group">
+ <label class="control-label" for="url-openie-get">OpenIE</label>
+ <div class="controls">
+ <label class="checkbox">
+ <input id="url-openie-get" type="checkbox" name="openie">
+ <a href="javascript:showModal( '#sPopup-openie' );">[?]</a>
+ </label>
+ </div>
+ </div>
+ <div class="control-group">
<label class="control-label" for="url-validation-get">Validation</label>
<div class="controls">
<select id="url-validation-get" name="validation-mode" onclick="if(document.getElementById('url-validation-get').value.indexOf('validate') == 0) { document.getElementById('url-report-get').checked = true; }">
@@ -103,7 +112,7 @@ function showModal( id )
<label class="control-label" for="url-annotate-get">Annotate</label>
<div class="controls">
<label class="checkbox">
- <input id="url-annotate-get" type="checkbox" type="checkbox" name="annotate">
+ <input id="url-annotate-get" type="checkbox" name="annotate">
<a href="javascript:showModal( '#sPopup-annotate' );">[?]</a>
</label>
</div>
@@ -149,6 +158,15 @@ function showModal( id )
</div>
</div>
<div class="control-group">
+ <label class="control-label" for="url-openie-post">OpenIE</label>
+ <div class="controls">
+ <label class="checkbox">
+ <input id="url-openie-post" type="checkbox" name="openie">
+ <a href="javascript:showModal( '#sPopup-openie' );">[?]</a>
+ </label>
+ </div>
+ </div>
+ <div class="control-group">
<label class="control-label" for="url-validation-post">Validation</label>
<div class="controls">
<select id="url-validation-post" name="validation-mode" onclick="if(document.getElementById('url-validation-post').value.indexOf('validate') == 0) { document.getElementById('url-report-post').checked = true; }">
@@ -172,7 +190,7 @@ function showModal( id )
<label class="control-label" for="url-annotate-post">Annotate</label>
<div class="controls">
<label class="checkbox">
- <input id="url-annotate-post" type="checkbox" type="checkbox" name="annotate">
+ <input id="url-annotate-post" type="checkbox" name="annotate">
<a href="javascript:showModal( '#sPopup-annotate' );">[?]</a>
</label>
</div>
@@ -224,8 +242,10 @@ function showModal( id )
</tr>
<tr><th>annotate</th><td>If specified the output RDF will contain extractor specific scope comments.<br/>Possible values: <code>on</code>/<code>off</code></td></tr>
<tr><th>report</th><td>If specified will produce a full XML report containing extraction and validation issues other than produced metadata.<br/>Possible values: <code>on</code>/<code>off</code></td></tr>
+ <tr><th>openie</th><td>If specified the <a href="https://github.com/allenai/openie-standalone" target="_blank">
+ Open Information Extraction (Open IE) system</a> will be activated (default off).<br/>Possible values: <code>on</code>/<code>off</code></td></tr>
</table>
- Such URL will return an HTTP <i>302</i> redirect to <code><span class="app-base-uri">http://...</span>any23/<em>format</em></code>.<br/>
+ Formatting the URL according to the above will return an HTTP <i>302</i> redirect to <code><span class="app-base-uri">http://...</span>any23/<em>format</em></code>.<br/>
<p>The response is the input document converted to the desired output format.</p>
<h3>Direct POST API</h3>
@@ -278,6 +298,8 @@ Content-Length: 174
</tr>
<tr><th>annotate</th><td>If specified the output RDF will contain extractor specific scope comments.<br/>Possible values: <code>on</code>/<code>off</code></td></tr>
<tr><th>report</th><td>If specified will produce a full XML report containing extraction and validation issues other than produced metadata.<br/>Possible values: <code>on</code>/<code>off</code></td></tr>
+ <tr><th>openie</th><td>If specified the <a href="https://github.com/allenai/openie-standalone" target="_blank">
+ Open Information Extraction (Open IE) system</a> will be activated (default off).<br/>Possible values: <code>on</code>/<code>off</code></td></tr>
</table>
<h3>Output formats</h3>
@@ -285,11 +307,11 @@ Content-Length: 174
<ul>
<li><code>best</code> for content negotiation according to the client's <code>Accept</code> HTTP header</li>
<li><code>turtle</code>, <code>ttl</code>, <code>n3</code> for
- <a href="http://www.w3.org/TeamSubmission/turtle/" target="_blank">Turtle</a>/<a href="http://www.w3.org/DesignIssues/Notation3" target="_blank">N3</a></li>
+ <a href="https://www.w3.org/TR/turtle/" target="_blank">Turtle</a>/<a href="https://www.w3.org/TeamSubmission/n3/" target="_blank">N3</a></li>
<li><code>ntriples</code>, <code>nt</code> for
- <a href="http://www.w3.org/TR/rdf-testcases/#ntriples" target="_blank">N-Triples</a></li>
+ <a href="https://www.w3.org/TR/n-triples/" target="_blank">N-Triples</a></li>
<li><code>nquads</code>, <code>nq</code> for
- <a href="http://sw.deri.org/2008/07/n-quads/" target="_blank">N-Quads</a></li>
+ <a href="https://www.w3.org/TR/n-quads/" target="_blank">N-Quads</a></li>
<li><code>trix</code> for
<a href="http://www.w3.org/2004/03/trix/" target="_blank">TriX</a></li>
<li><code>rdfxml</code>, <code>rdf</code>, <code>xml</code> for
@@ -323,6 +345,27 @@ Content-Length: 174
<p><b>Apache Any23 v.${project.version} (${implementation.build.tstamp})</b></p>
<p><a href="http://any23.apache.org/" target="_blank">Any23 project homepage</a> | Hosted at <a href="http://apache.org/" target="_blank">Apache Software Foundation</a></p>
+ <div id="sPopup-openie" class="modal hide fade">
+ <div class="modal-header">
+ <button type="button" class="close">×</button>
+ <h3>Open Information Extraction</h3>
+ </div>
+ <div class="modal-body">
+ <p>
+ If the <i>OpenIE</i> checkbox is selected, the <b>Any23</b> service will activate the
+ <a href="https://github.com/allenai/openie-standalone" target="_blank">Open Information Extraction (Open IE) system</a>,
+ enhancing extraction results.</p>
+ <p>The Open IE system runs over sentences and creates extractions that represent relations in text; in the case
+ of Any23, this results in triples. The confidence of relationships extracted from text is based on a
+ configurable threshold established in
+ <code>https://github.com/apache/any23/blob/master/api/src/main/resources/default-configuration.properties</code>.
+ </p>
+ </div>
+ <div class="modal-footer">
+ <a href="#" class="btn">Close</a>
+ </div>
+ </div>
+
<div id="sPopup-fix" class="modal hide fade">
<div class="modal-header">
<button type="button" class="close" >×</button>
@@ -330,8 +373,8 @@ Content-Length: 174
</div>
<div class="modal-body">
<p>
- The <b>Any23</b> service tries to fix some <a href="http://www.deri.ie/fileadmin/documents/DERI-TR-2009-07-28.pdf" target="_blank">common issues</a>
- before performing a metadata extraction. The fixing is performed according a set of fully customizable rules.
+ The <b>Any23</b> service tries to fix some common issues before performing a metadata
+ extraction. The fixing is performed according to a set of fully customizable rules.
</p>
<p>
The following <i>Validation</i> options are available.
http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/service/src/main/webapp/resources/js/bootstrap-modal.js
----------------------------------------------------------------------
diff --git a/service/src/main/webapp/resources/js/bootstrap-modal.js b/service/src/main/webapp/resources/js/bootstrap-modal.js
index 38fd0c8..11b951e 100644
--- a/service/src/main/webapp/resources/js/bootstrap-modal.js
+++ b/service/src/main/webapp/resources/js/bootstrap-modal.js
@@ -17,11 +17,9 @@
* limitations under the License.
* ========================================================= */
-
!function ($) {
- "use strict"; // jshint ;_;
-
+ "use strict";
/* MODAL CLASS DEFINITION
* ====================== */
@@ -46,7 +44,8 @@
this.$element.trigger(e)
- if (this.isShown || e.isDefaultPrevented()) return
+ if (this.isShown || e.isDefaultPrevented())
+ return
$('body').addClass('modal-open')
@@ -85,7 +84,8 @@
this.$element.trigger(e)
- if (!this.isShown || e.isDefaultPrevented()) return
+ if (!this.isShown || e.isDefaultPrevented())
+ return
this.isShown = false
@@ -141,7 +141,8 @@
this.$backdrop.click($.proxy(this.hide, this))
}
- if (doAnimate) this.$backdrop[0].offsetWidth // force reflow
+ if (doAnimate)
+ this.$backdrop[0].offsetWidth // force reflow
this.$backdrop.addClass('in')
@@ -186,9 +187,12 @@
var $this = $(this)
, data = $this.data('modal')
, options = $.extend({}, $.fn.modal.defaults, $this.data(), typeof option == 'object' && option)
- if (!data) $this.data('modal', (data = new Modal(this, options)))
- if (typeof option == 'string') data[option]()
- else if (options.show) data.show()
+ if (!data)
+ $this.data('modal', (data = new Modal(this, options)))
+ if (typeof option == 'string')
+ data[option]()
+ else if (options.show)
+ data.show()
})
}
http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/src/site/apt/any23-plugins.apt
----------------------------------------------------------------------
diff --git a/src/site/apt/any23-plugins.apt b/src/site/apt/any23-plugins.apt
index f429e2d..b79a27a 100644
--- a/src/site/apt/any23-plugins.apt
+++ b/src/site/apt/any23-plugins.apt
@@ -49,11 +49,10 @@ export CLASSPATH_PREFIX=../../../plugins/basic-crawler/target/any23-basic-crawle
* adding its <JAR> to the <$HOME/.any23/plugins> directory.
- A plugin can be added to the <Apache Any23 library API> by using the
- {{{./apidocs/org/apache/any23/plugin/Any23PluginManager.html}Any23PluginManager}}#createInstance(Configuration configuration, File... pluginLocations)
- method.
-
- TODO: plugin support in Apache Any23 Service
+ A plugin can be added to the <Apache Any23 library API> by first creating a static instance of
+ {{{./apidocs/org/apache/any23/plugin/Any23PluginManager.html}Any23PluginManager}}#getInstance().
+ Once this is done there is a variety of options to configure and register plugins. An example
+ of dynamic plugin loading can be seen via the OpenIE toggle in the Any23 Service.
Any implementation of <ExtractorPlugin> will automatically be registered to the
{{{./apidocs/org/apache/any23/extractor/ExtractorRegistry.html}ExtractorRegistry}}.
[04/10] any23 git commit: Merge branch 'master' into ANy23-321
Posted by le...@apache.org.
Merge branch 'master' into ANy23-321
Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/4f40fe02
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/4f40fe02
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/4f40fe02
Branch: refs/heads/master
Commit: 4f40fe0214ecaabb3c3d20cb44d425b196d0f782
Parents: 6660ed8 f36c5e1
Author: Lewis John McGibbney <le...@gmail.com>
Authored: Mon Jan 8 09:42:26 2018 -0500
Committer: Lewis John McGibbney <le...@gmail.com>
Committed: Mon Jan 8 09:42:26 2018 -0500
----------------------------------------------------------------------
src/site/site.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
----------------------------------------------------------------------
[08/10] any23 git commit: ANY23-321 Add openie toggle functionality
to service
Posted by le...@apache.org.
ANY23-321 Add openie toggle functionality to service
Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/b3806d3c
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/b3806d3c
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/b3806d3c
Branch: refs/heads/master
Commit: b3806d3c86422831fc08ac9068fac5984b772399
Parents: 69109f3
Author: Lewis John McGibbney <le...@gmail.com>
Authored: Fri Feb 23 17:56:20 2018 -0800
Committer: Lewis John McGibbney <le...@gmail.com>
Committed: Fri Feb 23 17:56:20 2018 -0800
----------------------------------------------------------------------
service/pom.xml | 32 +++-----------------------------
1 file changed, 3 insertions(+), 29 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/any23/blob/b3806d3c/service/pom.xml
----------------------------------------------------------------------
diff --git a/service/pom.xml b/service/pom.xml
index d5b275f..b59706b 100644
--- a/service/pom.xml
+++ b/service/pom.xml
@@ -217,37 +217,11 @@
<id>copy</id>
<phase>prepare-package</phase>
<goals>
- <goal>copy</goal>
+ <goal>copy-dependencies</goal>
</goals>
<configuration>
- <artifactItems>
- <artifactItem>
- <groupId>org.apache.any23.plugins</groupId>
- <artifactId>apache-any23-openie</artifactId>
- <version>${project.version}</version>
- <outputDirectory>${output.directory}</outputDirectory>
- </artifactItem>
- <artifactItem>
- <groupId>org.allenai.openie</groupId>
- <artifactId>openie_2.11</artifactId>
- <version>${openie_2.11.version}</version>
- <outputDirectory>${output.directory}</outputDirectory>
- </artifactItem>
- <artifactItem>
- <groupId>org.allenai.openie</groupId>
- <artifactId>openie_2.11</artifactId>
- <version>${openie_2.11.version}</version>
- <type>pom</type>
- <outputDirectory>${output.directory}</outputDirectory>
- </artifactItem>
- <artifactItem>
- <groupId>edu.washington.cs.knowitall</groupId>
- <artifactId>openregex</artifactId>
- <version>${openregex.version}</version>
- <outputDirectory>${output.directory}</outputDirectory>
- </artifactItem>
- </artifactItems>
- <!-- other configurations here -->
+ <includeScope>provided</includeScope>
+ <outputDirectory>${output.directory}</outputDirectory>
</configuration>
</execution>
</executions>
[05/10] any23 git commit: ANY23-321 Add openie toggle functionality
to service
Posted by le...@apache.org.
ANY23-321 Add openie toggle functionality to service
Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/63ffc9e3
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/63ffc9e3
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/63ffc9e3
Branch: refs/heads/master
Commit: 63ffc9e3e8a8da0b4af7ca5b227f1e199e545227
Parents: 4f40fe0 482e780
Author: Lewis John McGibbney <le...@gmail.com>
Authored: Fri Feb 2 21:55:12 2018 -0800
Committer: Lewis John McGibbney <le...@gmail.com>
Committed: Fri Feb 2 21:55:12 2018 -0800
----------------------------------------------------------------------
LICENSE-header.txt | 41 --
NOTICE.txt | 2 +-
RELEASE-NOTES.txt | 66 ++
api/pom.xml | 2 +-
.../main/java/org/apache/any23/vocab/YAML.java | 13 +-
.../resources/default-configuration.properties | 4 +
cli/pom.xml | 2 +-
.../org/apache/any23/cli/SimpleRoverTest.java | 13 +-
.../org/apache/any23/cli/YAMLRoverTest.java | 13 +-
cli/src/test/resources/log4j.properties | 15 +-
core/pom.xml | 6 +-
.../extractor/html/EmbeddedJSONLDExtractor.java | 8 +-
.../any23/extractor/html/HCardExtractor.java | 3 +-
.../any23/extractor/html/HTMLMetaExtractor.java | 6 +-
.../apache/any23/extractor/html/JsoupUtils.java | 103 ++++
.../any23/extractor/html/TagSoupParser.java | 172 +++---
.../html/TagSoupParsingConfiguration.java | 181 ++++++
.../any23/extractor/rdf/BaseRDFExtractor.java | 46 +-
.../html/EmbeddedJSONLDExtractorTest.java | 14 +
.../microdata/MicrodataParserTest.java | 5 +-
.../extractor/rdfa/RDFa11ExtractorTest.java | 41 +-
csvutils/pom.xml | 2 +-
encoding/pom.xml | 2 +-
mime/pom.xml | 2 +-
plugins/basic-crawler/pom.xml | 10 +-
plugins/html-scraper/pom.xml | 4 +-
plugins/integration-test/pom.xml | 2 +-
plugins/office-scraper/pom.xml | 4 +-
plugins/openie/pom.xml | 2 +-
pom.xml | 14 +-
service/pom.xml | 2 +-
.../main/assembly/NOTICE-server-embedded.txt | 2 +-
service/src/main/assembly/NOTICE-with-deps.txt | 2 +-
.../src/main/assembly/NOTICE-without-deps.txt | 2 +-
test-resources/pom.xml | 2 +-
.../html-body-embedded-jsonld-extractor.html | 37 ++
...head-and-body-embedded-jsonld-extractor.html | 47 ++
.../test/resources/html/rdfa/rdfa-issue227.html | 40 ++
.../html/rdfa/rdfa-issue268-and-317.html | 613 +++++++++++++++++++
.../html/rdfa/rdfa-issue271-and-317.html | 251 ++++++++
.../html/rdfa/rdfa-issue273-and-317.html | 143 +++++
.../html/rdfa/rdfa-issue326-and-267.html | 20 +
42 files changed, 1777 insertions(+), 182 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/any23/blob/63ffc9e3/plugins/integration-test/pom.xml
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/any23/blob/63ffc9e3/plugins/openie/pom.xml
----------------------------------------------------------------------
diff --cc plugins/openie/pom.xml
index 64c6806,0000000..0f34ad5
mode 100644,000000..100644
--- a/plugins/openie/pom.xml
+++ b/plugins/openie/pom.xml
@@@ -1,165 -1,0 +1,165 @@@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.any23</groupId>
+ <artifactId>apache-any23</artifactId>
- <version>2.2-SNAPSHOT</version>
++ <version>2.3-SNAPSHOT</version>
+ <relativePath>../../pom.xml</relativePath>
+ </parent>
+
+ <groupId>org.apache.any23.plugins</groupId>
+ <artifactId>apache-any23-openie</artifactId>
+
+ <name>Apache Any23 :: Plugins :: OpenIE</name>
+ <description>Open Information Extraction module.</description>
+
+ <repositories>
+ <repository>
+ <snapshots>
+ <enabled>false</enabled>
+ </snapshots>
+ <id>bintray-allenai-maven</id>
+ <name>bintray</name>
+ <url>http://allenai.bintray.com/maven</url>
+ </repository>
+ </repositories>
+ <pluginRepositories>
+ <pluginRepository>
+ <snapshots>
+ <enabled>false</enabled>
+ </snapshots>
+ <id>bintray-allenai-maven</id>
+ <name>bintray-plugins</name>
+ <url>http://allenai.bintray.com/maven</url>
+ </pluginRepository>
+ </pluginRepositories>
+
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.any23</groupId>
+ <artifactId>apache-any23-core</artifactId>
+ <version>${project.version}</version>
+ <scope>provided</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.any23</groupId>
+ <artifactId>apache-any23-test-resources</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ <type>test-jar</type>
+ </dependency>
+ <dependency>
+ <groupId>org.allenai.openie</groupId>
+ <artifactId>openie_2.11</artifactId>
+ <version>${openie_2.11.version}</version>
+ <scope>compile</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.allenai.openie</groupId>
+ <artifactId>openie_2.11</artifactId>
+ <version>${openie_2.11.version}</version>
+ <scope>compile</scope>
+ <type>pom</type>
+ </dependency>
+ <dependency>
+ <groupId>edu.washington.cs.knowitall</groupId>
+ <artifactId>openregex</artifactId>
+ <version>${openregex.version}</version>
+ <scope>runtime</scope>
+ </dependency>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-log4j12</artifactId>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-surefire-plugin</artifactId>
+ <configuration>
+ <skipTests>true</skipTests>
+ </configuration>
+ </plugin>
+ <!-- Generates the distribution package -->
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-assembly-plugin</artifactId>
+ <configuration>
+ <appendAssemblyId>false</appendAssemblyId>
+ <descriptors>
+ <descriptor>${basedir}/src/main/assembly/bin.xml</descriptor>
+ </descriptors>
+ </configuration>
+ </plugin>
+ </plugins>
+ <pluginManagement>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-assembly-plugin</artifactId>
+ <version>${maven-assembly-plugin.version}</version>
+ <executions>
+ <execution>
+ <id>assembly</id>
+ <phase>package</phase>
+ <goals>
+ <goal>single</goal>
+ </goals>
+ </execution>
+ </executions>
+ <configuration>
+ <attach>true</attach>
+ <skipAssembly>true</skipAssembly>
+ <tarLongFileMode>gnu</tarLongFileMode>
+ </configuration>
+ </plugin>
+ </plugins>
+ </pluginManagement>
+ </build>
+
+ <profiles>
+ <profile>
+ <id>release</id>
+ <build>
+ <resources>
+ <resource>
+ <directory>${basedir}/../</directory>
+ <targetPath>${project.build.directory}/apidocs/META-INF</targetPath>
+ <includes>
+ <include>LICENSE.txt</include>
+ <include>NOTICE.txt</include>
+ </includes>
+ </resource>
+ </resources>
+ </build>
+ </profile>
+
+ </profiles>
+
+</project>
http://git-wip-us.apache.org/repos/asf/any23/blob/63ffc9e3/pom.xml
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/any23/blob/63ffc9e3/service/pom.xml
----------------------------------------------------------------------
[02/10] any23 git commit: Merge branch 'master' into ANY23-321
Posted by le...@apache.org.
Merge branch 'master' into ANY23-321
Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/b71142f1
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/b71142f1
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/b71142f1
Branch: refs/heads/master
Commit: b71142f14179d0feee53dde32f570e98fa9edbdc
Parents: 706e891 d2ace9c
Author: Lewis John McGibbney <le...@gmail.com>
Authored: Wed Jan 3 00:16:07 2018 +0000
Committer: Lewis John McGibbney <le...@gmail.com>
Committed: Wed Jan 3 00:16:07 2018 +0000
----------------------------------------------------------------------
api/src/main/java/org/apache/any23/vocab/YAML.java | 7 +++++--
.../apache/any23/extractor/yaml/ElementsProcessor.java | 6 ++----
core/src/main/java/org/apache/any23/rdf/RDFUtils.java | 10 ++++------
.../any23/extractor/yaml/ElementsProcessorTest.java | 1 -
.../apache/any23/extractor/yaml/YAMLExtractorTest.java | 7 +------
.../java/org/apache/any23/vocab/RDFSchemaUtilsTest.java | 4 ++--
.../org/apache/any23/extractor/yaml/test-null.yml | 4 +++-
7 files changed, 17 insertions(+), 22 deletions(-)
----------------------------------------------------------------------
[07/10] any23 git commit: ANY23-321 Add openie toggle functionality
to service
Posted by le...@apache.org.
ANY23-321 Add openie toggle functionality to service
Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/69109f36
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/69109f36
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/69109f36
Branch: refs/heads/master
Commit: 69109f36a2ee279c5f031423e16af6b49ea8dbfd
Parents: 073190b
Author: Lewis John McGibbney <le...@gmail.com>
Authored: Fri Feb 23 09:58:54 2018 -0800
Committer: Lewis John McGibbney <le...@gmail.com>
Committed: Fri Feb 23 09:58:54 2018 -0800
----------------------------------------------------------------------
.../src/main/java/org/apache/any23/servlet/Servlet.java | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/any23/blob/69109f36/service/src/main/java/org/apache/any23/servlet/Servlet.java
----------------------------------------------------------------------
diff --git a/service/src/main/java/org/apache/any23/servlet/Servlet.java b/service/src/main/java/org/apache/any23/servlet/Servlet.java
index 1ab542c..154f41d 100644
--- a/service/src/main/java/org/apache/any23/servlet/Servlet.java
+++ b/service/src/main/java/org/apache/any23/servlet/Servlet.java
@@ -19,6 +19,8 @@ package org.apache.any23.servlet;
import org.apache.any23.configuration.DefaultConfiguration;
import org.apache.any23.extractor.ExtractionParameters;
+import org.apache.any23.extractor.ExtractorRegistry;
+import org.apache.any23.extractor.ExtractorRegistryImpl;
import org.apache.any23.http.HTTPClient;
import org.apache.any23.plugin.Any23PluginManager;
import org.apache.any23.servlet.conneg.Any23Negotiator;
@@ -100,7 +102,13 @@ public class Servlet extends HttpServlet {
File openIEJarPath = new File(webappClasspath.getParentFile().getPath() + "/lib/apache-any23-openie");
boolean loadedJars = pManager.loadJARDir(openIEJarPath);
if (loadedJars) {
- LOG.info("Successful dynamic classloading of apache-any23-openie directory from webapp lib.");
+ ExtractorRegistry r = ExtractorRegistryImpl.getInstance();
+ try {
+ pManager.getExtractors().forEachRemaining(r::register);
+ } catch (IOException e) {
+ LOG.error("Error during dynamic classloading of JARs from OpenIE runtime directory {}", openIEJarPath.toString(), e);
+ }
+ LOG.info("Successful dynamic classloading of JARs from OpenIE runtime directory {}", openIEJarPath.toString());
}
}
final ExtractionParameters eps = getExtractionParameters(req);
[09/10] any23 git commit: ANY23-321 Add openie toggle functionality
to service
Posted by le...@apache.org.
ANY23-321 Add openie toggle functionality to service
Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/71bf171a
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/71bf171a
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/71bf171a
Branch: refs/heads/master
Commit: 71bf171a3b64fbb0388373aabad645e812bf3a0f
Parents: b3806d3
Author: Lewis John McGibbney <le...@gmail.com>
Authored: Tue Feb 27 10:11:57 2018 -0800
Committer: Lewis John McGibbney <le...@gmail.com>
Committed: Tue Feb 27 10:11:57 2018 -0800
----------------------------------------------------------------------
.../plugin/extractor/openie/OpenIEExtractor.java | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/any23/blob/71bf171a/plugins/openie/src/main/java/org/apache/any23/plugin/extractor/openie/OpenIEExtractor.java
----------------------------------------------------------------------
diff --git a/plugins/openie/src/main/java/org/apache/any23/plugin/extractor/openie/OpenIEExtractor.java b/plugins/openie/src/main/java/org/apache/any23/plugin/extractor/openie/OpenIEExtractor.java
index 1b6a9cf..d02b5a2 100644
--- a/plugins/openie/src/main/java/org/apache/any23/plugin/extractor/openie/OpenIEExtractor.java
+++ b/plugins/openie/src/main/java/org/apache/any23/plugin/extractor/openie/OpenIEExtractor.java
@@ -23,13 +23,12 @@ import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerFactoryConfigurationError;
import org.apache.any23.extractor.Extractor;
+import org.apache.any23.extractor.IssueReport;
import org.apache.any23.configuration.Configuration;
import org.apache.any23.configuration.DefaultConfiguration;
import org.apache.any23.extractor.ExtractionContext;
import org.apache.any23.extractor.ExtractorDescription;
-import org.apache.any23.extractor.ExtractorFactory;
import org.apache.any23.plugin.Author;
-import org.apache.any23.plugin.ExtractorPlugin;
import org.apache.any23.rdf.RDFUtils;
import org.apache.any23.util.StreamUtils;
import org.apache.tika.Tika;
@@ -63,7 +62,7 @@ import scala.collection.Seq;
* sentences representing relations in the text.
*/
@Author(name="Lewis John McGibbney (lewismc@apache.org)")
-public class OpenIEExtractor implements Extractor.TagSoupDOMExtractor, ExtractorPlugin {
+public class OpenIEExtractor implements Extractor.TagSoupDOMExtractor {
private static final Logger LOG = LoggerFactory.getLogger(OpenIEExtractor.class);
@@ -106,7 +105,13 @@ public class OpenIEExtractor implements Extractor.TagSoupDOMExtractor, Extractor
LOG.error("Encountered error during OpenIE extraction.", e);
} catch (TikaException e) {
LOG.error("Encountered error whilst parsing InputStream with Tika.", e);
- }
+ } catch (OutOfMemoryError e) {
+ //let the gc do its thing
+ openIE = null;
+ out.notifyIssue(IssueReport.IssueLevel.FATAL, "Not enough memory available to perform OpenIE extraction.", -1, -1);
+ LOG.error("Encountered OutOfMemoryError... increase JVM heap when running OpenIEExtractor.", e);
+ return;
+ }
List<Instance> listExtractions = JavaConversions.seqAsJavaList(extractions);
// for each extraction instance we can obtain a number of extraction elements
@@ -129,9 +134,4 @@ public class OpenIEExtractor implements Extractor.TagSoupDOMExtractor, Extractor
}
}
}
-
- @Override
- public ExtractorFactory<?> getExtractorFactory() {
- return (ExtractorFactory<?>) OpenIEExtractorFactory.getDescriptionInstance();
- }
}
[03/10] any23 git commit: Merge branch 'master' into ANY23-321
Posted by le...@apache.org.
Merge branch 'master' into ANY23-321
Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/6660ed81
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/6660ed81
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/6660ed81
Branch: refs/heads/master
Commit: 6660ed8142c57274ce51b40af097e4cabf158d3b
Parents: b71142f 6d0606f
Author: Lewis John McGibbney <le...@gmail.com>
Authored: Mon Jan 8 09:26:05 2018 -0500
Committer: Lewis John McGibbney <le...@gmail.com>
Committed: Mon Jan 8 09:26:05 2018 -0500
----------------------------------------------------------------------
.../any23/cli/ExtractorDocumentationTest.java | 2 -
.../java/org/apache/any23/cli/RoverTest.java | 2 -
.../org/apache/any23/cli/SimpleRoverTest.java | 2 -
.../any23/extractor/csv/CSVExtractor.java | 23 +-
.../extractor/html/EmbeddedJSONLDExtractor.java | 363 ++--
.../any23/extractor/html/HTMLMetaExtractor.java | 58 +-
.../apache/any23/extractor/microdata/Item.java | 10 +-
.../extractor/microdata/ItemPropValue.java | 31 +-
.../any23/extractor/microdata/ItemScope.java | 29 +-
.../extractor/microdata/MicrodataExtractor.java | 35 +-
.../extractor/microdata/MicrodataParser.java | 136 +-
.../any23/extractor/xpath/QuadTemplate.java | 1 +
.../any23/extractor/xpath/TemplateObject.java | 39 +-
.../any23/extractor/xpath/TemplateSubject.java | 13 +-
.../any23/extractor/yaml/ElementsProcessor.java | 24 +-
.../any23/rdf/Any23ValueFactoryWrapper.java | 61 +-
.../java/org/apache/any23/rdf/RDFUtils.java | 92 +-
.../XMLValidationReportSerializer.java | 21 +-
.../any23/validator/rule/AboutNotURIRule.java | 1 +
.../validator/rule/MetaNameMisuseRule.java | 1 +
.../org/apache/any23/vocab/RDFSchemaUtils.java | 24 +-
.../any23/extractor/csv/CSVExtractorTest.java | 178 +-
.../html/AbstractExtractorTestCase.java | 1592 ++++++++-------
.../extractor/html/HCardExtractorTest.java | 1852 +++++++++---------
.../extractor/html/HListingExtractorTest.java | 3 -
.../microdata/MicrodataParserTest.java | 81 +-
.../any23/extractor/rdfa/RDFaExtractorTest.java | 2 -
.../TemplateXPathExtractorRuleImplTest.java | 24 +-
.../any23/filter/IgnoreAccidentalRDFaTest.java | 2 +-
.../org/apache/any23/writer/JSONWriterTest.java | 4 -
pom.xml | 2 +-
.../org/apache/any23/servlet/ServletTest.java | 4 -
....2.1-non-normative-example-1-expected.nquads | 8 +-
.../5.2.1-non-normative-example-1.html | 48 +-
....2.1-non-normative-example-2-expected.nquads | 33 +-
.../5.2.1-non-normative-example-2.html | 16 +-
.../microdata-basic-expected.properties | 6 +-
.../resources/microdata/microdata-basic.html | 15 +-
.../microdata-itemref-expected.properties | 20 +-
.../resources/microdata/microdata-itemref.html | 46 +-
.../microdata/microdata-json-serialization.json | 2 +-
.../microdata/microdata-nested-expected.nquads | 19 +-
.../microdata-nested-expected.properties | 4 +-
.../resources/microdata/microdata-nested.html | 33 +-
.../microdata-richsnippet-expected.nquads | 27 +-
45 files changed, 2469 insertions(+), 2520 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/any23/blob/6660ed81/pom.xml
----------------------------------------------------------------------
[10/10] any23 git commit: Merge branch 'master' of
https://git-wip-us.apache.org/repos/asf/any23
Posted by le...@apache.org.
Merge branch 'master' of https://git-wip-us.apache.org/repos/asf/any23
Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/394d36a0
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/394d36a0
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/394d36a0
Branch: refs/heads/master
Commit: 394d36a0c15b4a5d07e808603ba108d92ff4df1c
Parents: 71bf171 66ce124
Author: Lewis John McGibbney <le...@gmail.com>
Authored: Tue Feb 27 20:43:05 2018 -0800
Committer: Lewis John McGibbney <le...@gmail.com>
Committed: Tue Feb 27 20:43:05 2018 -0800
----------------------------------------------------------------------
pom.xml | 11 ++++++++++-
1 file changed, 10 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/any23/blob/394d36a0/pom.xml
----------------------------------------------------------------------
[06/10] any23 git commit: Merge into master
Posted by le...@apache.org.
Merge into master
Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/073190bd
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/073190bd
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/073190bd
Branch: refs/heads/master
Commit: 073190bd7cb948ce1faf5df7dae61eb8257416ce
Parents: 778d05e 63ffc9e
Author: Lewis John McGibbney <le...@gmail.com>
Authored: Fri Feb 23 09:23:10 2018 -0800
Committer: Lewis John McGibbney <le...@gmail.com>
Committed: Fri Feb 23 09:23:10 2018 -0800
----------------------------------------------------------------------
.../apache/any23/extractor/ExtractorGroup.java | 1 +
.../apache/any23/plugin/Any23PluginManager.java | 23 +--
core/src/main/java/org/apache/any23/Any23.java | 8 +-
.../any23/extractor/ExtractorRegistryImpl.java | 11 +-
openie/pom.xml | 152 -----------------
.../any23/extractor/openie/OpenIEExtractor.java | 130 ---------------
.../openie/OpenIEExtractorFactory.java | 52 ------
.../org.apache.any23.extractor.ExtractorFactory | 1 -
.../any23/openie/OpenIEExtractorTest.java | 88 ----------
.../htmlscraper/HTMLScraperExtractor.java | 12 +-
plugins/integration-test/pom.xml | 5 +
.../java/org/apache/any23/plugin/PluginIT.java | 11 +-
plugins/openie/pom.xml | 165 +++++++++++++++++++
.../extractor/openie/OpenIEExtractor.java | 137 +++++++++++++++
.../openie/OpenIEExtractorFactory.java | 52 ++++++
.../org.apache.any23.extractor.ExtractorFactory | 1 +
.../any23/openie/OpenIEExtractorTest.java | 88 ++++++++++
pom.xml | 6 +-
service/README.md | 49 ++++++
service/README.txt | 50 ------
service/pom.xml | 84 +++++++++-
.../java/org/apache/any23/servlet/Servlet.java | 48 +++++-
service/src/main/resources/form.html | 59 ++++++-
.../main/webapp/resources/js/bootstrap-modal.js | 22 ++-
src/site/apt/any23-plugins.apt | 9 +-
25 files changed, 734 insertions(+), 530 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/any23/blob/073190bd/plugins/integration-test/pom.xml
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/any23/blob/073190bd/plugins/openie/pom.xml
----------------------------------------------------------------------
diff --cc plugins/openie/pom.xml
index 0000000,0f34ad5..64c6806
mode 000000,100644..100644
--- a/plugins/openie/pom.xml
+++ b/plugins/openie/pom.xml
@@@ -1,0 -1,165 +1,165 @@@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ -->
+ <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.any23</groupId>
+ <artifactId>apache-any23</artifactId>
- <version>2.3-SNAPSHOT</version>
++ <version>2.2-SNAPSHOT</version>
+ <relativePath>../../pom.xml</relativePath>
+ </parent>
+
+ <groupId>org.apache.any23.plugins</groupId>
+ <artifactId>apache-any23-openie</artifactId>
+
+ <name>Apache Any23 :: Plugins :: OpenIE</name>
+ <description>Open Information Extraction module.</description>
+
+ <repositories>
+ <repository>
+ <snapshots>
+ <enabled>false</enabled>
+ </snapshots>
+ <id>bintray-allenai-maven</id>
+ <name>bintray</name>
+ <url>http://allenai.bintray.com/maven</url>
+ </repository>
+ </repositories>
+ <pluginRepositories>
+ <pluginRepository>
+ <snapshots>
+ <enabled>false</enabled>
+ </snapshots>
+ <id>bintray-allenai-maven</id>
+ <name>bintray-plugins</name>
+ <url>http://allenai.bintray.com/maven</url>
+ </pluginRepository>
+ </pluginRepositories>
+
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.any23</groupId>
+ <artifactId>apache-any23-core</artifactId>
+ <version>${project.version}</version>
+ <scope>provided</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.any23</groupId>
+ <artifactId>apache-any23-test-resources</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ <type>test-jar</type>
+ </dependency>
+ <dependency>
+ <groupId>org.allenai.openie</groupId>
+ <artifactId>openie_2.11</artifactId>
+ <version>${openie_2.11.version}</version>
+ <scope>compile</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.allenai.openie</groupId>
+ <artifactId>openie_2.11</artifactId>
+ <version>${openie_2.11.version}</version>
+ <scope>compile</scope>
+ <type>pom</type>
+ </dependency>
+ <dependency>
+ <groupId>edu.washington.cs.knowitall</groupId>
+ <artifactId>openregex</artifactId>
+ <version>${openregex.version}</version>
+ <scope>runtime</scope>
+ </dependency>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-log4j12</artifactId>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-surefire-plugin</artifactId>
+ <configuration>
+ <skipTests>true</skipTests>
+ </configuration>
+ </plugin>
+ <!-- Generates the distribution package -->
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-assembly-plugin</artifactId>
+ <configuration>
+ <appendAssemblyId>false</appendAssemblyId>
+ <descriptors>
+ <descriptor>${basedir}/src/main/assembly/bin.xml</descriptor>
+ </descriptors>
+ </configuration>
+ </plugin>
+ </plugins>
+ <pluginManagement>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-assembly-plugin</artifactId>
+ <version>${maven-assembly-plugin.version}</version>
+ <executions>
+ <execution>
+ <id>assembly</id>
+ <phase>package</phase>
+ <goals>
+ <goal>single</goal>
+ </goals>
+ </execution>
+ </executions>
+ <configuration>
+ <attach>true</attach>
+ <skipAssembly>true</skipAssembly>
+ <tarLongFileMode>gnu</tarLongFileMode>
+ </configuration>
+ </plugin>
+ </plugins>
+ </pluginManagement>
+ </build>
+
+ <profiles>
+ <profile>
+ <id>release</id>
+ <build>
+ <resources>
+ <resource>
+ <directory>${basedir}/../</directory>
+ <targetPath>${project.build.directory}/apidocs/META-INF</targetPath>
+ <includes>
+ <include>LICENSE.txt</include>
+ <include>NOTICE.txt</include>
+ </includes>
+ </resource>
+ </resources>
+ </build>
+ </profile>
+
+ </profiles>
+
+ </project>
http://git-wip-us.apache.org/repos/asf/any23/blob/073190bd/pom.xml
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/any23/blob/073190bd/service/pom.xml
----------------------------------------------------------------------