You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@any23.apache.org by le...@apache.org on 2017/08/23 20:26:59 UTC
[02/15] any23 git commit: ANY23-304 Add extractor for OpenIE
ANY23-304 Add extractor for OpenIE
Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/2ecfbff1
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/2ecfbff1
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/2ecfbff1
Branch: refs/heads/master
Commit: 2ecfbff1dddaf57689b725feddba47c7921f726d
Parents: bc46c72
Author: Lewis John McGibbney <le...@gmail.com>
Authored: Thu Feb 23 17:26:03 2017 -0800
Committer: Lewis John McGibbney <le...@gmail.com>
Committed: Thu Feb 23 17:26:03 2017 -0800
----------------------------------------------------------------------
.../configuration/DefaultConfiguration.java | 23 +-
.../DefaultModifiableConfiguration.java | 4 +-
.../java/org/apache/any23/vocab/Vocabulary.java | 26 +-
.../resources/default-configuration.properties | 4 +
.../extractor/SingleDocumentExtraction.java | 6 +-
.../extractor/html/EmbeddedJSONLDExtractor.java | 4 +-
.../any23/extractor/html/GeoExtractor.java | 7 +-
.../any23/extractor/html/TagSoupParser.java | 2 -
.../any23/extractor/xpath/XPathExtractor.java | 3 +-
.../any23/extractor/yaml/YAMLExtractor.java | 58 +-
.../java/org/apache/any23/rdf/RDFUtils.java | 50 +-
.../java/org/apache/any23/util/StreamUtils.java | 69 +-
.../any23/extractor/yaml/YAMLExtractorTest.java | 1 -
openie/pom.xml | 154 +++++
.../apache/any23/openie/OpenIEExtractor.java | 129 ++++
.../any23/openie/OpenIEExtractorFactory.java | 52 ++
.../any23/openie/OpenIEExtractorTest.java | 87 +++
pom.xml | 1 +
.../any23/extractor/openie/example-openie.html | 638 +++++++++++++++++++
19 files changed, 1230 insertions(+), 88 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/api/src/main/java/org/apache/any23/configuration/DefaultConfiguration.java
----------------------------------------------------------------------
diff --git a/api/src/main/java/org/apache/any23/configuration/DefaultConfiguration.java b/api/src/main/java/org/apache/any23/configuration/DefaultConfiguration.java
index 6edaf34..170548e 100644
--- a/api/src/main/java/org/apache/any23/configuration/DefaultConfiguration.java
+++ b/api/src/main/java/org/apache/any23/configuration/DefaultConfiguration.java
@@ -48,6 +48,14 @@ public class DefaultConfiguration implements Configuration {
protected final Properties properties;
+ protected DefaultConfiguration(Properties properties) {
+ this.properties = properties;
+ }
+
+ private DefaultConfiguration() {
+ this( loadDefaultProperties() );
+ }
+
/**
* @return the singleton configuration instance.
* Such instance is unmodifiable.
@@ -74,22 +82,17 @@ public class DefaultConfiguration implements Configuration {
return properties;
}
- protected DefaultConfiguration(Properties properties) {
- this.properties = properties;
- }
-
- private DefaultConfiguration() {
- this( loadDefaultProperties() );
- }
-
+ @Override
public synchronized String[] getProperties() {
return properties.keySet().toArray( new String[properties.size()] );
}
+ @Override
public synchronized boolean defineProperty(String propertyName) {
return properties.containsKey(propertyName);
}
+ @Override
public synchronized String getProperty(String propertyName, String defaultValue) {
final String value = getPropertyValue(propertyName);
if(value == null) {
@@ -98,6 +101,7 @@ public class DefaultConfiguration implements Configuration {
return value;
}
+ @Override
public synchronized String getPropertyOrFail(String propertyName) {
final String propertyValue = getPropertyValue(propertyName);
if(propertyValue == null) {
@@ -111,6 +115,7 @@ public class DefaultConfiguration implements Configuration {
return propertyValue;
}
+ @Override
public synchronized int getPropertyIntOrFail(String propertyName) {
final String value = getPropertyOrFail(propertyName);
final String trimValue = value.trim();
@@ -121,6 +126,7 @@ public class DefaultConfiguration implements Configuration {
}
}
+ @Override
public synchronized boolean getFlagProperty(final String propertyName) {
final String value = getPropertyOrFail(propertyName);
if(value == null) {
@@ -140,6 +146,7 @@ public class DefaultConfiguration implements Configuration {
);
}
+ @Override
public synchronized String getConfigurationDump() {
final String[] defaultProperties = getProperties();
final StringBuilder sb = new StringBuilder();
http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/api/src/main/java/org/apache/any23/configuration/DefaultModifiableConfiguration.java
----------------------------------------------------------------------
diff --git a/api/src/main/java/org/apache/any23/configuration/DefaultModifiableConfiguration.java b/api/src/main/java/org/apache/any23/configuration/DefaultModifiableConfiguration.java
index 82ceaad..055d39c 100644
--- a/api/src/main/java/org/apache/any23/configuration/DefaultModifiableConfiguration.java
+++ b/api/src/main/java/org/apache/any23/configuration/DefaultModifiableConfiguration.java
@@ -30,8 +30,10 @@ public class DefaultModifiableConfiguration extends DefaultConfiguration impleme
super(properties);
}
+ @Override
public synchronized String setProperty(String propertyName, String propertyValue) {
- if( ! defineProperty(propertyName) ) throw new IllegalArgumentException(
+ if( ! defineProperty(propertyName) )
+ throw new IllegalArgumentException(
String.format("Property '%s' is not defined in configuration.", propertyName)
);
return (String) properties.setProperty(propertyName, propertyValue);
http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/api/src/main/java/org/apache/any23/vocab/Vocabulary.java
----------------------------------------------------------------------
diff --git a/api/src/main/java/org/apache/any23/vocab/Vocabulary.java b/api/src/main/java/org/apache/any23/vocab/Vocabulary.java
index 8c8204f..718f514 100644
--- a/api/src/main/java/org/apache/any23/vocab/Vocabulary.java
+++ b/api/src/main/java/org/apache/any23/vocab/Vocabulary.java
@@ -157,8 +157,8 @@ public abstract class Vocabulary {
if(classes == null) {
return new IRI[0];
}
- final Collection<IRI> IRIs = classes.values();
- return IRIs.toArray( new IRI[ IRIs.size() ] );
+ final Collection<IRI> iris = classes.values();
+ return iris.toArray( new IRI[ iris.size() ] );
}
/**
@@ -168,8 +168,8 @@ public abstract class Vocabulary {
if(properties == null) {
return new IRI[0];
}
- final Collection<IRI> IRIs = properties.values();
- return IRIs.toArray( new IRI[ IRIs.size() ] );
+ final Collection<IRI> iris = properties.values();
+ return iris.toArray( new IRI[ iris.size() ] );
}
/**
@@ -197,11 +197,11 @@ public abstract class Vocabulary {
/**
* Creates an IRI.
*
- * @param IRIStr the IRI string
+ * @param iriStr the IRI string
* @return the IRI instance.
*/
- protected IRI createIRI(String IRIStr) {
- return SimpleValueFactory.getInstance().createIRI(IRIStr);
+ protected IRI createIRI(String iriStr) {
+ return SimpleValueFactory.getInstance().createIRI(iriStr);
}
/**
@@ -214,7 +214,7 @@ public abstract class Vocabulary {
protected IRI createClass(String namespace, String resource) {
IRI res = createIRI(namespace, resource);
if(classes == null) {
- classes = new HashMap<String, IRI>(10);
+ classes = new HashMap<>(10);
}
classes.put(resource, res);
return res;
@@ -230,7 +230,7 @@ public abstract class Vocabulary {
protected IRI createProperty(String namespace, String property) {
IRI res = createIRI(namespace, property);
if(properties == null) {
- properties = new HashMap<String, IRI>(10);
+ properties = new HashMap<>(10);
}
properties.put(property, res);
return res;
@@ -248,14 +248,16 @@ public abstract class Vocabulary {
}
private void fillResourceToCommentMap() {
- if(resourceToCommentMap != null) return;
- final Map<IRI,String> newMap = new HashMap<IRI, String>();
+ if(resourceToCommentMap != null)
+ return;
+ final Map<IRI,String> newMap = new HashMap<>();
for (Field field : this.getClass().getFields()) {
try {
final Object value = field.get(this);
if(value instanceof IRI) {
final Comment comment = field.getAnnotation(Comment.class);
- if(comment != null) newMap.put((IRI) value, comment.value());
+ if(comment != null)
+ newMap.put((IRI) value, comment.value());
}
} catch (IllegalAccessException iae) {
throw new RuntimeException("Error while creating resource to comment map.", iae);
http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/api/src/main/resources/default-configuration.properties
----------------------------------------------------------------------
diff --git a/api/src/main/resources/default-configuration.properties b/api/src/main/resources/default-configuration.properties
index d047a83..4f68586 100644
--- a/api/src/main/resources/default-configuration.properties
+++ b/api/src/main/resources/default-configuration.properties
@@ -72,3 +72,7 @@ any23.extraction.head.meta=on
# Allows specifying a CSV file separator and comment delimiter
any23.extraction.csv.field=,
any23.extraction.csv.comment=#
+
+# A confidence threshold for the OpenIE extractions
+# Any extractions below this value will not be processed.
+any23.extraction.openie.confidence.threshold=0.5
http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java b/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java
index d88edf7..295f4e9 100644
--- a/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java
+++ b/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java
@@ -231,10 +231,10 @@ public class SingleDocumentExtraction {
log.debug(sb.toString());
}
- final List<ResourceRoot> resourceRoots = new ArrayList<ResourceRoot>();
- final List<PropertyPath> propertyPaths = new ArrayList<PropertyPath>();
+ final List<ResourceRoot> resourceRoots = new ArrayList<>();
+ final List<PropertyPath> propertyPaths = new ArrayList<>();
final Map<String,Collection<IssueReport.Issue>> extractorToIssues =
- new HashMap<String,Collection<IssueReport.Issue>>();
+ new HashMap<>();
// Invoke all extractors.
try {
http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java
index 818fc98..db58586 100644
--- a/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java
+++ b/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java
@@ -56,7 +56,7 @@ public class EmbeddedJSONLDExtractor implements Extractor.TagSoupDOMExtractor {
private IRI profile;
- private Map<String, IRI> prefixes = new HashMap<String, IRI>();
+ private Map<String, IRI> prefixes = new HashMap<>();
private String documentLang;
@@ -137,7 +137,7 @@ public class EmbeddedJSONLDExtractor implements Extractor.TagSoupDOMExtractor {
ExtractionContext extractionContext, ExtractionResult out)
throws IOException, ExtractionException {
List<Node> scriptNodes = DomUtils.findAll(in, "/HTML/HEAD/SCRIPT");
- Set<JSONLDScript> result = new HashSet<JSONLDScript>();
+ Set<JSONLDScript> result = new HashSet<>();
extractor = new JSONLDExtractorFactory().createExtractor();
for (Node jsonldNode : scriptNodes) {
NamedNodeMap attributes = jsonldNode.getAttributes();
http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/core/src/main/java/org/apache/any23/extractor/html/GeoExtractor.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/html/GeoExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/GeoExtractor.java
index d85af79..ed7e5d3 100644
--- a/core/src/main/java/org/apache/any23/extractor/html/GeoExtractor.java
+++ b/core/src/main/java/org/apache/any23/extractor/html/GeoExtractor.java
@@ -50,7 +50,8 @@ public class GeoExtractor extends EntityBasedMicroformatExtractor {
}
protected boolean extractEntity(Node node, ExtractionResult out) {
- if (null == node) return false;
+ if (null == node)
+ return false;
//try lat & lon
final HTMLDocument document = new HTMLDocument(node);
HTMLDocument.TextField latNode = document.getSingularTextField("latitude" );
@@ -59,13 +60,13 @@ public class GeoExtractor extends EntityBasedMicroformatExtractor {
String lon = lonNode.value();
if ("".equals(lat) || "".equals(lon)) {
String[] both = document.getSingularUrlField("geo").value().split(";");
- if (both.length != 2) return false;
+ if (both.length != 2)
+ return false;
lat = both[0];
lon = both[1];
}
BNode geo = getBlankNodeFor(node);
out.writeTriple(geo, RDF.TYPE, vVCARD.Location);
- final String extractorName = getDescription().getExtractorName();
conditionallyAddStringProperty(
latNode.source(),
geo, vVCARD.latitude , lat
http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java b/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java
index e6eb9cd..9ef72f4 100644
--- a/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java
+++ b/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java
@@ -25,8 +25,6 @@ import org.apache.xerces.xni.QName;
import org.apache.xerces.xni.XMLAttributes;
import org.apache.xerces.xni.XNIException;
import org.cyberneko.html.parsers.DOMParser;
-import org.eclipse.rdf4j.model.IRI;
-import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/core/src/main/java/org/apache/any23/extractor/xpath/XPathExtractor.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/xpath/XPathExtractor.java b/core/src/main/java/org/apache/any23/extractor/xpath/XPathExtractor.java
index b04533c..1fe1b02 100644
--- a/core/src/main/java/org/apache/any23/extractor/xpath/XPathExtractor.java
+++ b/core/src/main/java/org/apache/any23/extractor/xpath/XPathExtractor.java
@@ -39,9 +39,10 @@ import java.util.List;
*/
public class XPathExtractor implements Extractor.TagSoupDOMExtractor {
- private final List<XPathExtractionRule> xPathExtractionRules = new ArrayList<XPathExtractionRule>();
+ private final List<XPathExtractionRule> xPathExtractionRules = new ArrayList<>();
public XPathExtractor() {
+ //default constructor
}
public XPathExtractor(List<XPathExtractionRule> rules) {
http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/core/src/main/java/org/apache/any23/extractor/yaml/YAMLExtractor.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/yaml/YAMLExtractor.java b/core/src/main/java/org/apache/any23/extractor/yaml/YAMLExtractor.java
index 64548f1..19bccd1 100644
--- a/core/src/main/java/org/apache/any23/extractor/yaml/YAMLExtractor.java
+++ b/core/src/main/java/org/apache/any23/extractor/yaml/YAMLExtractor.java
@@ -17,8 +17,6 @@ package org.apache.any23.extractor.yaml;
import java.io.IOException;
import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
@@ -29,9 +27,7 @@ import org.apache.any23.extractor.ExtractionResult;
import org.apache.any23.extractor.Extractor;
import org.apache.any23.extractor.ExtractorDescription;
import org.apache.any23.rdf.RDFUtils;
-import org.apache.any23.util.StringUtils;
import org.apache.any23.vocab.YAML;
-import org.apache.commons.lang.WordUtils;
import org.eclipse.rdf4j.model.Resource;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Value;
@@ -64,10 +60,10 @@ public class YAMLExtractor implements Extractor.ContentExtractor {
public void run(ExtractionParameters extractionParameters, ExtractionContext context, InputStream in,
ExtractionResult out)
throws IOException, ExtractionException {
- IRI documentURI = context.getDocumentIRI();
- documentRoot = RDFUtils.uri(documentURI.toString() + "root");
+ IRI documentIRI = context.getDocumentIRI();
+ documentRoot = RDFUtils.iri(documentIRI.toString() + "root");
- log.debug("process: {}", documentURI.toString());
+ log.debug("Processing: {}", documentIRI.toString());
out.writeNamespace(vocab.PREFIX, vocab.NS);
out.writeNamespace(RDF.PREFIX, RDF.NAMESPACE);
out.writeNamespace(RDFS.PREFIX, RDFS.NAMESPACE);
@@ -77,10 +73,10 @@ public class YAMLExtractor implements Extractor.ContentExtractor {
// Iterate over page(s)
for (Object p : docIterate) {
- Resource pageNode = YAMLExtractor.this.makeUri("document", documentURI);
+ Resource pageNode = RDFUtils.makeIRI("document", documentIRI, true);
out.writeTriple(documentRoot, vocab.contains, pageNode);
out.writeTriple(pageNode, RDF.TYPE, vocab.document);
- out.writeTriple(pageNode, vocab.contains, buildNode(documentURI, p, out));
+ out.writeTriple(pageNode, vocab.contains, buildNode(documentIRI, p, out));
}
}
@@ -99,9 +95,9 @@ public class YAMLExtractor implements Extractor.ContentExtractor {
if (treeData == null) {
return RDF.NIL;
} else if (treeData instanceof Map) {
- return processMap(fileURI, (Map) treeData, out);
+ return processMap(fileURI, (Map<String, Object>) treeData, out);
} else if (treeData instanceof List) {
- return processList(fileURI, (List) treeData, out);
+ return processList(fileURI, (List<?>) treeData, out);
} else if (treeData instanceof Long) {
return RDFUtils.literal(((Long) treeData));
} else if (treeData instanceof Integer) {
@@ -120,9 +116,9 @@ public class YAMLExtractor implements Extractor.ContentExtractor {
}
private Value processMap(IRI file, Map<String, Object> node, ExtractionResult out) {
- Resource nodeURI = YAMLExtractor.this.makeUri(file);
+ Resource nodeURI = RDFUtils.makeIRI(file);
for (String k : node.keySet()) {
- Resource predicate = makeUri(k, file, false);
+ Resource predicate = RDFUtils.makeIRI(k, file, true);
Value value = buildNode(file, node.get(k), out);
out.writeTriple(nodeURI, RDF.TYPE, vocab.node);
out.writeTriple(nodeURI, (IRI) predicate, value);
@@ -132,13 +128,13 @@ public class YAMLExtractor implements Extractor.ContentExtractor {
return nodeURI;
}
- private Value processList(IRI fileURI, Iterable iter, ExtractionResult out) {
+ private Value processList(IRI fileURI, Iterable<?> iter, ExtractionResult out) {
Resource node = YAMLExtractor.this.makeUri();
out.writeTriple(node, RDF.TYPE, RDF.LIST);
Resource pList = null; // previous RDF iter node
Resource cList = node; // current RDF iter node
- Iterator listIter = iter.iterator();
+ Iterator<?> listIter = iter.iterator();
while (listIter.hasNext()) {
// If previous RDF iter node is given, link it with the current one
if (pList != null) {
@@ -161,36 +157,4 @@ public class YAMLExtractor implements Extractor.ContentExtractor {
nodeId++;
return bnode;
}
-
- private Resource makeUri(IRI docUri) {
- return makeUri("node", docUri);
- }
-
- private Resource makeUri(String type, IRI docUri) {
- return makeUri(type, docUri, true);
- }
-
- private Resource makeUri(String type, IRI docUri, boolean addId) {
-
- // preprocess string: converts - -> _
- // converts <space>: word1 word2 -> word1Word2
- String newType = StringUtils.implementJavaNaming(type);
-
- String uriString;
- if (docUri.toString().endsWith("/")) {
- uriString = docUri.toString() + newType;
- } else {
- uriString = docUri.toString() + "#" + newType;
- }
-
- if (addId) {
- uriString = uriString + "_" + Integer.toString(nodeId);
- }
-
- Resource node = RDFUtils.uri(uriString);
- if (addId) {
- nodeId++;
- }
- return node;
- }
}
http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/core/src/main/java/org/apache/any23/rdf/RDFUtils.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/rdf/RDFUtils.java b/core/src/main/java/org/apache/any23/rdf/RDFUtils.java
index bbfe5ec..f6e3a8c 100644
--- a/core/src/main/java/org/apache/any23/rdf/RDFUtils.java
+++ b/core/src/main/java/org/apache/any23/rdf/RDFUtils.java
@@ -18,7 +18,9 @@
package org.apache.any23.rdf;
import org.apache.any23.util.MathUtils;
+import org.apache.any23.util.StringUtils;
import org.eclipse.rdf4j.model.BNode;
+import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Literal;
import org.eclipse.rdf4j.model.Resource;
import org.eclipse.rdf4j.model.Statement;
@@ -60,6 +62,8 @@ import java.util.Optional;
*/
public class RDFUtils {
+ private static int nodeId = 0;
+
private static final ValueFactory valueFactory = SimpleValueFactory.getInstance();
/**
@@ -71,7 +75,8 @@ public class RDFUtils {
*/
public static String fixAbsoluteIRI(String uri) {
String fixed = fixIRIWithException(uri);
- if (!fixed.matches("[a-zA-Z0-9]+:/.*")) throw new IllegalArgumentException("not a absolute org.eclipse.rdf4j.model.IRI: " + uri);
+ if (!fixed.matches("[a-zA-Z0-9]+:/.*"))
+ throw new IllegalArgumentException("not a absolute org.eclipse.rdf4j.model.IRI: " + uri);
// Add trailing slash if org.eclipse.rdf4j.model.IRI has only authority but no path.
if (fixed.matches("https?://[a-zA-Z0-9.-]+(:[0-9+])?")) {
fixed = fixed + "/";
@@ -129,7 +134,8 @@ public class RDFUtils {
* @return the unescaped string.
*/
public static String fixIRIWithException(String unescapedIRI) {
- if (unescapedIRI == null) throw new IllegalArgumentException("org.eclipse.rdf4j.model.IRI was null");
+ if (unescapedIRI == null)
+ throw new IllegalArgumentException("org.eclipse.rdf4j.model.IRI was null");
// Remove starting and ending whitespace
String escapedIRI = unescapedIRI.trim();
@@ -141,7 +147,8 @@ public class RDFUtils {
escapedIRI = escapedIRI.replaceAll("\n", "");
//Remove starting "\" or '"'
- if (escapedIRI.startsWith("\\") || escapedIRI.startsWith("\"")) escapedIRI = escapedIRI.substring(1);
+ if (escapedIRI.startsWith("\\") || escapedIRI.startsWith("\""))
+ escapedIRI = escapedIRI.substring(1);
//Remove ending "\" or '"'
if (escapedIRI.endsWith("\\") || escapedIRI.endsWith("\""))
escapedIRI = escapedIRI.substring(0, escapedIRI.length() - 1);
@@ -406,7 +413,8 @@ public class RDFUtils {
* @return a value instance.
*/
public static Value toValue(String s) {
- if ("a".equals(s)) return RDF.TYPE;
+ if ("a".equals(s))
+ return RDF.TYPE;
if (s.matches("[a-z0-9]+:.*")) {
return PopularPrefixes.get().expand(s);
}
@@ -466,7 +474,8 @@ public class RDFUtils {
* @throws IllegalArgumentException if no extension matches.
*/
public static Optional<RDFFormat> getFormatByExtension(String ext) {
- if( ! ext.startsWith(".") ) ext = "." + ext;
+ if( ! ext.startsWith(".") )
+ ext = "." + ext;
return Rio.getParserFormatForFileName(ext);
}
@@ -564,6 +573,37 @@ public class RDFUtils {
}
}
+ public static Resource makeIRI(IRI docUri) {
+ return makeIRI("node", docUri);
+ }
+
+ public static Resource makeIRI(String type, IRI docIRI) {
+ return makeIRI(type, docIRI, false);
+ }
+
+ public static Resource makeIRI(String type, IRI docIRI, boolean addId) {
+
+ // preprocess string: converts - -> _
+ // converts <space>: word1 word2 -> word1Word2
+ String newType = StringUtils.implementJavaNaming(type);
+
+ String iriString;
+ if (docIRI.toString().endsWith("/")) {
+ iriString = docIRI.toString() + newType;
+ } else {
+ iriString = docIRI.toString() + "#" + newType;
+ }
+
+ if (addId) {
+ iriString = iriString + "_" + Integer.toString(nodeId);
+ }
+
+ Resource node = RDFUtils.iri(iriString);
+ if (addId) {
+ nodeId++;
+ }
+ return node;
+ }
private RDFUtils() {}
}
http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/core/src/main/java/org/apache/any23/util/StreamUtils.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/util/StreamUtils.java b/core/src/main/java/org/apache/any23/util/StreamUtils.java
index 2022f0e..a456655 100644
--- a/core/src/main/java/org/apache/any23/util/StreamUtils.java
+++ b/core/src/main/java/org/apache/any23/util/StreamUtils.java
@@ -17,10 +17,17 @@
package org.apache.any23.util;
+import org.apache.commons.io.ByteOrderMark;
+import org.apache.commons.io.input.BOMInputStream;
+import org.apache.xerces.impl.io.MalformedByteSequenceException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import org.w3c.dom.Document;
+import org.xml.sax.SAXException;
import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
@@ -28,6 +35,18 @@ import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.transform.Result;
+import javax.xml.transform.Source;
+import javax.xml.transform.TransformerConfigurationException;
+import javax.xml.transform.TransformerException;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.TransformerFactoryConfigurationError;
+import javax.xml.transform.dom.DOMSource;
+import javax.xml.transform.stream.StreamResult;
+
/**
* Contains general utility functions for handling streams.
*
@@ -93,9 +112,9 @@ public class StreamUtils {
* @return the string content.
* @throws IOException if an error occurs while consuming the <code>is</code> stream.
*/
- public static String asString(InputStream is) throws IOException {
- return asString(is, false);
- }
+ public static String asString(InputStream is) throws IOException {
+ return asString(is, false);
+ }
/**
* Closes the closable interface and reports error if any.
@@ -112,4 +131,48 @@ public class StreamUtils {
}
}
+ /**
+ * Converts a {@link org.w3c.dom.Document} to an
+ * {@link java.io.InputStream}
+ * @throws TransformerFactoryConfigurationError
+ * @throws TransformerConfigurationException
+ */
+ public static InputStream documentToInputStream(Document doc)
+ throws TransformerConfigurationException, TransformerFactoryConfigurationError {
+ ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
+ Source xmlSource = new DOMSource(doc);
+ Result outputTarget = new StreamResult(outputStream);
+ try {
+ TransformerFactory.newInstance().newTransformer().transform(xmlSource, outputTarget);
+ } catch (TransformerException e) {
+ logger.error("Error during transformation: {}", e);
+ }
+ return new ByteArrayInputStream(outputStream.toByteArray());
+ }
+
+ public static Document inputStreamToDocument(InputStream is) throws MalformedByteSequenceException {
+ DocumentBuilderFactory factory = null;
+ DocumentBuilder builder = null;
+ Document doc = null;
+
+ try {
+ factory = DocumentBuilderFactory.newInstance();
+ builder = factory.newDocumentBuilder();
+ } catch (ParserConfigurationException e) {
+ logger.error("Error converting InputStream to Document: {}", e);
+ }
+
+ try {
+ BOMInputStream bomIn = new BOMInputStream(is, ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE,
+ ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_32BE, ByteOrderMark.UTF_32LE);
+ if (bomIn.hasBOM()) {
+ @SuppressWarnings("unused")
+ int firstNonBOMByte = bomIn.read(); // Skips BOM
+ }
+ doc = builder.parse(bomIn);
+ } catch (SAXException | IOException e) {
+ logger.error("Error converting InputStream to Document: {}", e);
+ }
+ return doc;
+ }
}
http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/core/src/test/java/org/apache/any23/extractor/yaml/YAMLExtractorTest.java
----------------------------------------------------------------------
diff --git a/core/src/test/java/org/apache/any23/extractor/yaml/YAMLExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/yaml/YAMLExtractorTest.java
index 0cf8d14..f2c85ba 100644
--- a/core/src/test/java/org/apache/any23/extractor/yaml/YAMLExtractorTest.java
+++ b/core/src/test/java/org/apache/any23/extractor/yaml/YAMLExtractorTest.java
@@ -27,7 +27,6 @@ import org.eclipse.rdf4j.model.Statement;
import org.eclipse.rdf4j.model.vocabulary.RDF;
import org.eclipse.rdf4j.model.vocabulary.RDFS;
import org.eclipse.rdf4j.repository.RepositoryResult;
-import org.semarglproject.vocab.XSD;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/openie/pom.xml
----------------------------------------------------------------------
diff --git a/openie/pom.xml b/openie/pom.xml
new file mode 100644
index 0000000..799684d
--- /dev/null
+++ b/openie/pom.xml
@@ -0,0 +1,154 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <artifactId>apache-any23</artifactId>
+ <groupId>org.apache.any23</groupId>
+ <version>2.1-SNAPSHOT</version>
+ <relativePath></relativePath>
+ </parent>
+
+ <repositories>
+ <repository>
+ <snapshots>
+ <enabled>false</enabled>
+ </snapshots>
+ <id>bintray-allenai-maven</id>
+ <name>bintray</name>
+ <url>http://allenai.bintray.com/maven</url>
+ </repository>
+ </repositories>
+ <pluginRepositories>
+ <pluginRepository>
+ <snapshots>
+ <enabled>false</enabled>
+ </snapshots>
+ <id>bintray-allenai-maven</id>
+ <name>bintray-plugins</name>
+ <url>http://allenai.bintray.com/maven</url>
+ </pluginRepository>
+ </pluginRepositories>
+
+ <artifactId>apache-any23-openie</artifactId>
+
+ <name>Apache Any23 :: OpenIE</name>
+ <description>Open Information Extraction module.</description>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>apache-any23-core</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>apache-any23-test-resources</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ <type>test-jar</type>
+ </dependency>
+ <dependency>
+ <groupId>org.allenai.openie</groupId>
+ <artifactId>openie_2.11</artifactId>
+ <version>4.2.6</version>
+ <scope>compile</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.allenai.openie</groupId>
+ <artifactId>openie_2.11</artifactId>
+ <version>4.2.6</version>
+ <scope>compile</scope>
+ <type>pom</type>
+ </dependency>
+ <dependency>
+ <groupId>edu.washington.cs.knowitall</groupId>
+ <artifactId>openregex</artifactId>
+ <version>1.1.1</version>
+ <scope>runtime</scope>
+ </dependency>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-log4j12</artifactId>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <resources>
+ <resource>
+ <directory>${basedir}/../</directory>
+ <targetPath>META-INF</targetPath>
+ <includes>
+ <include>LICENSE.txt</include>
+ <include>NOTICE.txt</include>
+ </includes>
+ </resource>
+ </resources>
+ <pluginManagement>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-assembly-plugin</artifactId>
+ <version>${maven-assembly-plugin.version}</version>
+ <executions>
+ <execution>
+ <id>assembly</id>
+ <phase>package</phase>
+ <goals>
+ <goal>single</goal>
+ </goals>
+ </execution>
+ </executions>
+ <configuration>
+ <attach>true</attach>
+ <skipAssembly>true</skipAssembly>
+ <tarLongFileMode>gnu</tarLongFileMode>
+ </configuration>
+ </plugin>
+ </plugins>
+ </pluginManagement>
+ </build>
+
+ <profiles>
+ <profile>
+ <id>release</id>
+ <build>
+ <resources>
+ <resource>
+ <directory>${basedir}/../</directory>
+ <targetPath>${project.build.directory}/apidocs/META-INF</targetPath>
+ <includes>
+ <include>LICENSE.txt</include>
+ <include>NOTICE.txt</include>
+ </includes>
+ </resource>
+ </resources>
+ </build>
+ </profile>
+
+ </profiles>
+
+</project>
http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/openie/src/main/java/org/apache/any23/openie/OpenIEExtractor.java
----------------------------------------------------------------------
diff --git a/openie/src/main/java/org/apache/any23/openie/OpenIEExtractor.java b/openie/src/main/java/org/apache/any23/openie/OpenIEExtractor.java
new file mode 100644
index 0000000..b8fda29
--- /dev/null
+++ b/openie/src/main/java/org/apache/any23/openie/OpenIEExtractor.java
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.any23.openie;
+
+import java.io.IOException;
+import java.util.List;
+
+import javax.xml.transform.TransformerConfigurationException;
+import javax.xml.transform.TransformerFactoryConfigurationError;
+
+import org.apache.any23.extractor.Extractor;
+import org.apache.any23.configuration.Configuration;
+import org.apache.any23.configuration.DefaultConfiguration;
+import org.apache.any23.extractor.ExtractionContext;
+import org.apache.any23.extractor.ExtractorDescription;
+import org.apache.any23.rdf.RDFUtils;
+import org.apache.any23.util.StreamUtils;
+import org.apache.tika.Tika;
+import org.apache.tika.exception.TikaException;
+import org.eclipse.rdf4j.model.IRI;
+import org.eclipse.rdf4j.model.Resource;
+import org.eclipse.rdf4j.model.Value;
+import org.eclipse.rdf4j.model.vocabulary.RDF;
+import org.eclipse.rdf4j.model.vocabulary.RDFS;
+import org.apache.any23.extractor.ExtractionException;
+import org.apache.any23.extractor.ExtractionParameters;
+import org.apache.any23.extractor.ExtractionResult;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.Document;
+
+import edu.knowitall.openie.Argument;
+import edu.knowitall.openie.Instance;
+import edu.knowitall.openie.OpenIE;
+import edu.knowitall.tool.parse.ClearParser;
+import edu.knowitall.tool.postag.ClearPostagger;
+import edu.knowitall.tool.srl.ClearSrl;
+import edu.knowitall.tool.tokenize.ClearTokenizer;
+import scala.collection.JavaConversions;
+import scala.collection.Seq;
+
+/**
+ * An <a href="https://github.com/allenai/openie-standalone">OpenIE</a>
+ * extractor able to generate <i>RDF</i> statements from
+ * sentences representing relations in the text.
+ *
+ * <p>The DOM handed to {@link #run} is serialised with
+ * {@code StreamUtils.documentToInputStream}, converted to a string with Tika and
+ * passed to OpenIE. For every extraction whose confidence is strictly greater than
+ * the configured threshold, one triple is written per second argument.</p>
+ */
+public class OpenIEExtractor implements Extractor.TagSoupDOMExtractor {
+
+    private static final Logger LOG = LoggerFactory.getLogger(OpenIEExtractor.class);
+
+    /** Configuration property holding the minimum confidence an extraction must exceed. */
+    private static final String CONFIDENCE_THRESHOLD_PROPERTY =
+            "any23.extraction.openie.confidence.threshold";
+
+    /** Threshold used when {@link #CONFIDENCE_THRESHOLD_PROPERTY} is not configured. */
+    private static final String DEFAULT_CONFIDENCE_THRESHOLD = "0.5";
+
+    /**
+     * Package-private default constructor; instances are created by
+     * {@link OpenIEExtractorFactory#createExtractor()}.
+     */
+    OpenIEExtractor() {
+        // default constructor
+    }
+
+    /**
+     * @see org.apache.any23.extractor.Extractor#getDescription()
+     */
+    @Override
+    public ExtractorDescription getDescription() {
+        return OpenIEExtractorFactory.getDescriptionInstance();
+    }
+
+    /**
+     * Runs OpenIE over the text of the given document and writes the resulting triples.
+     *
+     * <p>If the document cannot be converted to text, the failure is logged and the
+     * method returns without writing any triple.</p>
+     *
+     * @param extractionParameters parameters of the current extraction (not read here)
+     * @param context context providing the document IRI
+     * @param in DOM of the document to analyse
+     * @param out sink receiving the namespaces and the generated triples
+     * @throws IOException declared by the extractor contract and by the stream/Tika calls
+     * @throws ExtractionException declared by the extractor contract
+     */
+    @Override
+    public void run(ExtractionParameters extractionParameters,
+            ExtractionContext context, Document in, ExtractionResult out)
+            throws IOException, ExtractionException {
+
+        IRI documentIRI = context.getDocumentIRI();
+        out.writeNamespace(RDF.PREFIX, RDF.NAMESPACE);
+        out.writeNamespace(RDFS.PREFIX, RDFS.NAMESPACE);
+        LOG.debug("Processing: {}", documentIRI);
+
+        OpenIE openIE = new OpenIE(
+                new ClearParser(
+                        new ClearPostagger(
+                                new ClearTokenizer())), new ClearSrl(), false, false);
+
+        Seq<Instance> extractions;
+        Tika tika = new Tika();
+        try {
+            extractions = openIE.extract(tika.parseToString(StreamUtils.documentToInputStream(in)));
+        } catch (TransformerConfigurationException | TransformerFactoryConfigurationError e) {
+            LOG.error("Encountered error during OpenIE extraction.", e);
+            // Nothing was extracted; continuing would dereference a null sequence.
+            return;
+        } catch (TikaException e) {
+            LOG.error("Encountered error whilst parsing InputStream with Tika.", e);
+            // Nothing was extracted; continuing would dereference a null sequence.
+            return;
+        }
+        if (extractions == null) {
+            return;
+        }
+
+        List<Instance> listExtractions = JavaConversions.seqAsJavaList(extractions);
+        if (listExtractions.isEmpty()) {
+            return;
+        }
+
+        // The threshold does not change between extractions, so read and parse it once.
+        final Configuration immutableConf = DefaultConfiguration.singleton();
+        final double confidenceThreshold = Double.parseDouble(
+                immutableConf.getProperty(CONFIDENCE_THRESHOLD_PROPERTY, DEFAULT_CONFIDENCE_THRESHOLD));
+
+        // for each extraction instance we can obtain a number of extraction elements
+        // instance.confidence() - a confidence value for the extraction itself
+        // instance.extr().context() - an optional representation of the context for this extraction
+        // instance.extr().arg1().text() - subject
+        // instance.extr().rel().text() - predicate
+        // instance.extr().arg2s().text() - object
+        for (Instance instance : listExtractions) {
+            if (instance.confidence() > confidenceThreshold) {
+                List<Argument> listArg2s = JavaConversions.seqAsJavaList(instance.extr().arg2s());
+                for (Argument argument : listArg2s) {
+                    Resource subject = RDFUtils.makeIRI(instance.extr().arg1().text(), documentIRI);
+                    // NOTE(review): the cast assumes RDFUtils.makeIRI always yields an IRI - confirm.
+                    IRI predicate = (IRI) RDFUtils.makeIRI(instance.extr().rel().text(), documentIRI);
+                    Value object = RDFUtils.toValue(argument.text());
+                    out.writeTriple(subject, predicate, object);
+                }
+            }
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/openie/src/main/java/org/apache/any23/openie/OpenIEExtractorFactory.java
----------------------------------------------------------------------
diff --git a/openie/src/main/java/org/apache/any23/openie/OpenIEExtractorFactory.java b/openie/src/main/java/org/apache/any23/openie/OpenIEExtractorFactory.java
new file mode 100644
index 0000000..4a1696a
--- /dev/null
+++ b/openie/src/main/java/org/apache/any23/openie/OpenIEExtractorFactory.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.any23.openie;
+
+import java.util.Arrays;
+
+import org.apache.any23.extractor.ExtractorDescription;
+import org.apache.any23.extractor.ExtractorFactory;
+import org.apache.any23.extractor.SimpleExtractorFactory;
+import org.apache.any23.rdf.Prefixes;
+
+/**
+ * Factory for {@link OpenIEExtractor} instances, registered under the name
+ * {@value #NAME} for HTML and XHTML content. A shared instance of this factory
+ * also serves as the extractor's {@link ExtractorDescription}.
+ *
+ * @author lewismc
+ */
+public class OpenIEExtractorFactory extends SimpleExtractorFactory<OpenIEExtractor>
+        implements ExtractorFactory<OpenIEExtractor> {
+
+    /** Name under which the extractor is registered. */
+    public static final String NAME = "openie";
+
+    /** Prefixes handed to the super constructor; this extractor declares none. */
+    public static final Prefixes prefixes = null;
+
+    /** Shared description returned by {@link #getDescriptionInstance()}. */
+    private static final ExtractorDescription descriptionInstance = new OpenIEExtractorFactory();
+
+    /**
+     * Creates the factory with its name, prefixes, supported MIME types and example input.
+     */
+    public OpenIEExtractorFactory() {
+        super(NAME, prefixes, supportedMimeTypes(), "example-openie.html");
+    }
+
+    /**
+     * @return the shared description of the OpenIE extractor
+     */
+    public static ExtractorDescription getDescriptionInstance() {
+        return descriptionInstance;
+    }
+
+    /**
+     * @return a new {@link OpenIEExtractor}
+     */
+    @Override
+    public OpenIEExtractor createExtractor() {
+        return new OpenIEExtractor();
+    }
+
+    /**
+     * @return the MIME types, with their quality values, passed to the super constructor
+     */
+    private static java.util.List<String> supportedMimeTypes() {
+        return Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1");
+    }
+
+}
http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/openie/src/test/java/org/apache/any23/openie/OpenIEExtractorTest.java
----------------------------------------------------------------------
diff --git a/openie/src/test/java/org/apache/any23/openie/OpenIEExtractorTest.java b/openie/src/test/java/org/apache/any23/openie/OpenIEExtractorTest.java
new file mode 100644
index 0000000..3561bdd
--- /dev/null
+++ b/openie/src/test/java/org/apache/any23/openie/OpenIEExtractorTest.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.any23.openie;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+
+import org.apache.any23.extractor.ExtractionContext;
+import org.apache.any23.extractor.ExtractionException;
+import org.apache.any23.extractor.ExtractionParameters;
+import org.apache.any23.extractor.ExtractionResult;
+import org.apache.any23.extractor.ExtractionResultImpl;
+import org.apache.any23.rdf.RDFUtils;
+import org.apache.any23.util.StreamUtils;
+import org.apache.any23.writer.RDFXMLWriter;
+import org.apache.any23.writer.TripleHandler;
+import org.apache.any23.writer.TripleHandlerException;
+import org.eclipse.rdf4j.model.IRI;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Smoke test for {@link OpenIEExtractor}: runs the extractor over an example HTML
+ * document and logs the RDF/XML it produces. The test passes when no exception is thrown.
+ *
+ * @author lewismc
+ */
+public class OpenIEExtractorTest {
+
+    private static final Logger logger = LoggerFactory.getLogger(OpenIEExtractorTest.class);
+
+    private OpenIEExtractor extractor;
+
+    @Before
+    public void setUp() throws Exception {
+        extractor = new OpenIEExtractor();
+    }
+
+    @After
+    public void tearDown() throws Exception {
+        extractor = null;
+    }
+
+    //@Ignore("This typically results in a JVM crash... disabled for the time being.")
+    @Test
+    public void testExtractFromHTMLDocument()
+            throws IOException, ExtractionException, TripleHandlerException {
+        final IRI uri = RDFUtils.iri("http://podaac.jpl.nasa.gov/aquarius");
+        extract(uri, "/org/apache/any23/extractor/openie/example-openie.html");
+    }
+
+    /**
+     * Runs the extractor over a classpath resource and logs the serialised result.
+     *
+     * @param uri document IRI used for the extraction context
+     * @param filePath classpath location of the document to process
+     * @throws IOException if reading the document or running the extractor fails
+     * @throws ExtractionException if the extractor reports a failure
+     * @throws TripleHandlerException if the RDF/XML writer cannot be closed
+     */
+    public void extract(IRI uri, String filePath)
+            throws IOException, ExtractionException, TripleHandlerException {
+        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        final TripleHandler tHandler = new RDFXMLWriter(baos);
+        final ExtractionContext extractionContext = new ExtractionContext("rdf-openie", uri);
+        final ExtractionResult result = new ExtractionResultImpl(extractionContext, extractor, tHandler);
+        try {
+            extractor.run(
+                    ExtractionParameters.newDefault(),
+                    extractionContext,
+                    StreamUtils.inputStreamToDocument(this.getClass().getResourceAsStream(filePath)),
+                    result
+            );
+        } finally {
+            // Close the result before the handler it writes to, and the handler even if
+            // closing the result fails.
+            try {
+                result.close();
+            } finally {
+                tHandler.close();
+            }
+            // Log only after the writer is closed so the logged document is complete.
+            logger.debug(baos.toString());
+        }
+    }
+
+}
http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 23ab57f..fffc7b5 100644
--- a/pom.xml
+++ b/pom.xml
@@ -204,6 +204,7 @@
<module>encoding</module>
<module>core</module>
<module>cli</module>
+ <module>openie</module>
<module>plugins/basic-crawler</module>
<module>plugins/html-scraper</module>
<module>plugins/office-scraper</module>