You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@any23.apache.org by le...@apache.org on 2017/08/23 20:26:59 UTC

[02/15] any23 git commit: ANY23-304 Add extractor for OpenIE

ANY23-304 Add extractor for OpenIE


Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/2ecfbff1
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/2ecfbff1
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/2ecfbff1

Branch: refs/heads/master
Commit: 2ecfbff1dddaf57689b725feddba47c7921f726d
Parents: bc46c72
Author: Lewis John McGibbney <le...@gmail.com>
Authored: Thu Feb 23 17:26:03 2017 -0800
Committer: Lewis John McGibbney <le...@gmail.com>
Committed: Thu Feb 23 17:26:03 2017 -0800

----------------------------------------------------------------------
 .../configuration/DefaultConfiguration.java     |  23 +-
 .../DefaultModifiableConfiguration.java         |   4 +-
 .../java/org/apache/any23/vocab/Vocabulary.java |  26 +-
 .../resources/default-configuration.properties  |   4 +
 .../extractor/SingleDocumentExtraction.java     |   6 +-
 .../extractor/html/EmbeddedJSONLDExtractor.java |   4 +-
 .../any23/extractor/html/GeoExtractor.java      |   7 +-
 .../any23/extractor/html/TagSoupParser.java     |   2 -
 .../any23/extractor/xpath/XPathExtractor.java   |   3 +-
 .../any23/extractor/yaml/YAMLExtractor.java     |  58 +-
 .../java/org/apache/any23/rdf/RDFUtils.java     |  50 +-
 .../java/org/apache/any23/util/StreamUtils.java |  69 +-
 .../any23/extractor/yaml/YAMLExtractorTest.java |   1 -
 openie/pom.xml                                  | 154 +++++
 .../apache/any23/openie/OpenIEExtractor.java    | 129 ++++
 .../any23/openie/OpenIEExtractorFactory.java    |  52 ++
 .../any23/openie/OpenIEExtractorTest.java       |  87 +++
 pom.xml                                         |   1 +
 .../any23/extractor/openie/example-openie.html  | 638 +++++++++++++++++++
 19 files changed, 1230 insertions(+), 88 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/api/src/main/java/org/apache/any23/configuration/DefaultConfiguration.java
----------------------------------------------------------------------
diff --git a/api/src/main/java/org/apache/any23/configuration/DefaultConfiguration.java b/api/src/main/java/org/apache/any23/configuration/DefaultConfiguration.java
index 6edaf34..170548e 100644
--- a/api/src/main/java/org/apache/any23/configuration/DefaultConfiguration.java
+++ b/api/src/main/java/org/apache/any23/configuration/DefaultConfiguration.java
@@ -48,6 +48,14 @@ public class DefaultConfiguration implements Configuration {
 
     protected final Properties properties;
 
+    protected DefaultConfiguration(Properties properties) {
+        this.properties = properties; // kept by reference, not copied
+    }
+
+    private DefaultConfiguration() {
+        this( loadDefaultProperties() ); // no-arg construction always starts from loadDefaultProperties()
+    }
+
     /**
      * @return the singleton configuration instance.
      *         Such instance is unmodifiable.
@@ -74,22 +82,17 @@ public class DefaultConfiguration implements Configuration {
         return properties;
     }
 
-    protected DefaultConfiguration(Properties properties) {
-        this.properties = properties;
-    }
-
-    private DefaultConfiguration() {
-        this( loadDefaultProperties() );
-    }
-
+    @Override
     public synchronized String[] getProperties() {
         return properties.keySet().toArray( new String[properties.size()] );
     }
 
+    @Override
     public synchronized boolean defineProperty(String propertyName) {
         return properties.containsKey(propertyName);
     }
 
+    @Override
     public synchronized String getProperty(String propertyName, String defaultValue) {
         final String value = getPropertyValue(propertyName);
         if(value == null) {
@@ -98,6 +101,7 @@ public class DefaultConfiguration implements Configuration {
         return value;
     }
 
+    @Override
     public synchronized String getPropertyOrFail(String propertyName) {
         final String propertyValue = getPropertyValue(propertyName);
         if(propertyValue == null) {
@@ -111,6 +115,7 @@ public class DefaultConfiguration implements Configuration {
         return propertyValue;
     }
 
+    @Override
     public synchronized int getPropertyIntOrFail(String propertyName) {
         final String value = getPropertyOrFail(propertyName);
         final String trimValue = value.trim();
@@ -121,6 +126,7 @@ public class DefaultConfiguration implements Configuration {
         }
     }
 
+    @Override
     public synchronized boolean getFlagProperty(final String propertyName) {
         final String value = getPropertyOrFail(propertyName);
         if(value == null) {
@@ -140,6 +146,7 @@ public class DefaultConfiguration implements Configuration {
         );
     }
 
+    @Override
     public synchronized String getConfigurationDump() {
         final String[] defaultProperties = getProperties();
         final StringBuilder sb = new StringBuilder();

http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/api/src/main/java/org/apache/any23/configuration/DefaultModifiableConfiguration.java
----------------------------------------------------------------------
diff --git a/api/src/main/java/org/apache/any23/configuration/DefaultModifiableConfiguration.java b/api/src/main/java/org/apache/any23/configuration/DefaultModifiableConfiguration.java
index 82ceaad..055d39c 100644
--- a/api/src/main/java/org/apache/any23/configuration/DefaultModifiableConfiguration.java
+++ b/api/src/main/java/org/apache/any23/configuration/DefaultModifiableConfiguration.java
@@ -30,8 +30,10 @@ public class DefaultModifiableConfiguration extends DefaultConfiguration impleme
         super(properties);
     }
 
+    @Override
     public synchronized String setProperty(String propertyName, String propertyValue) {
-        if( ! defineProperty(propertyName) ) throw new IllegalArgumentException(
+        if( ! defineProperty(propertyName) )
+            throw new IllegalArgumentException(
                 String.format("Property '%s' is not defined in configuration.", propertyName)
         );
         return (String) properties.setProperty(propertyName, propertyValue);

http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/api/src/main/java/org/apache/any23/vocab/Vocabulary.java
----------------------------------------------------------------------
diff --git a/api/src/main/java/org/apache/any23/vocab/Vocabulary.java b/api/src/main/java/org/apache/any23/vocab/Vocabulary.java
index 8c8204f..718f514 100644
--- a/api/src/main/java/org/apache/any23/vocab/Vocabulary.java
+++ b/api/src/main/java/org/apache/any23/vocab/Vocabulary.java
@@ -157,8 +157,8 @@ public abstract class Vocabulary {
         if(classes == null) {
             return new IRI[0];
         }
-        final Collection<IRI> IRIs = classes.values();
-        return IRIs.toArray( new IRI[ IRIs.size() ] );
+        final Collection<IRI> iris = classes.values();
+        return iris.toArray( new IRI[ iris.size() ] );
     }
 
     /**
@@ -168,8 +168,8 @@ public abstract class Vocabulary {
         if(properties == null) {
             return new IRI[0];
         }
-        final Collection<IRI> IRIs = properties.values();
-        return IRIs.toArray( new IRI[ IRIs.size() ] );
+        final Collection<IRI> iris = properties.values();
+        return iris.toArray( new IRI[ iris.size() ] );
     }
 
     /**
@@ -197,11 +197,11 @@ public abstract class Vocabulary {
     /**
      * Creates a IRI.
      *
-     * @param IRIStr the IRI string
+     * @param iriStr the IRI string
      * @return the IRI instance.
      */
-    protected IRI createIRI(String IRIStr) {
-        return SimpleValueFactory.getInstance().createIRI(IRIStr);
+    protected IRI createIRI(String iriStr) {
+        return SimpleValueFactory.getInstance().createIRI(iriStr);
     }
 
     /**
@@ -214,7 +214,7 @@ public abstract class Vocabulary {
     protected IRI createClass(String namespace, String resource) {
         IRI res = createIRI(namespace, resource);
         if(classes == null) {
-            classes = new HashMap<String, IRI>(10);
+            classes = new HashMap<>(10);
         }
         classes.put(resource, res);
         return res;
@@ -230,7 +230,7 @@ public abstract class Vocabulary {
     protected IRI createProperty(String namespace, String property) {
         IRI res = createIRI(namespace, property);
         if(properties == null) {
-            properties = new HashMap<String, IRI>(10);
+            properties = new HashMap<>(10);
         }
         properties.put(property, res);
         return res;
@@ -248,14 +248,16 @@ public abstract class Vocabulary {
     }
 
     private void fillResourceToCommentMap() {
-        if(resourceToCommentMap != null) return;
-        final Map<IRI,String> newMap = new HashMap<IRI, String>();
+        if(resourceToCommentMap != null)
+            return;
+        final Map<IRI,String> newMap = new HashMap<>();
         for (Field field : this.getClass().getFields()) {
             try {
                 final Object value = field.get(this);
                 if(value instanceof IRI) {
                     final Comment comment = field.getAnnotation(Comment.class);
-                    if(comment != null) newMap.put((IRI) value, comment.value());
+                    if(comment != null)
+                        newMap.put((IRI) value, comment.value());
                 }
             } catch (IllegalAccessException iae) {
                 throw new RuntimeException("Error while creating resource to comment map.", iae);

http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/api/src/main/resources/default-configuration.properties
----------------------------------------------------------------------
diff --git a/api/src/main/resources/default-configuration.properties b/api/src/main/resources/default-configuration.properties
index d047a83..4f68586 100644
--- a/api/src/main/resources/default-configuration.properties
+++ b/api/src/main/resources/default-configuration.properties
@@ -72,3 +72,7 @@ any23.extraction.head.meta=on
 # Allows to specify a CSV file separator and comment delimeter
 any23.extraction.csv.field=,
 any23.extraction.csv.comment=#
+
+# A confidence threshold for the OpenIE extractions
+# Any extractions below this value will not be processed.
+any23.extraction.openie.confidence.threshold=0.5

http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java b/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java
index d88edf7..295f4e9 100644
--- a/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java
+++ b/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java
@@ -231,10 +231,10 @@ public class SingleDocumentExtraction {
             log.debug(sb.toString());
         }
 
-        final List<ResourceRoot> resourceRoots = new ArrayList<ResourceRoot>();
-        final List<PropertyPath> propertyPaths = new ArrayList<PropertyPath>();
+        final List<ResourceRoot> resourceRoots = new ArrayList<>();
+        final List<PropertyPath> propertyPaths = new ArrayList<>();
         final Map<String,Collection<IssueReport.Issue>> extractorToIssues =
-            new HashMap<String,Collection<IssueReport.Issue>>();
+            new HashMap<>();
         
         // Invoke all extractors.
         try {

http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java
index 818fc98..db58586 100644
--- a/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java
+++ b/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java
@@ -56,7 +56,7 @@ public class EmbeddedJSONLDExtractor implements Extractor.TagSoupDOMExtractor {
 
 	private IRI profile;
 
-	private Map<String, IRI> prefixes = new HashMap<String, IRI>();
+	private Map<String, IRI> prefixes = new HashMap<>();
 
 	private String documentLang;
 
@@ -137,7 +137,7 @@ public class EmbeddedJSONLDExtractor implements Extractor.TagSoupDOMExtractor {
 			ExtractionContext extractionContext, ExtractionResult out)
 			throws IOException, ExtractionException {
 		List<Node> scriptNodes = DomUtils.findAll(in, "/HTML/HEAD/SCRIPT");
-		Set<JSONLDScript> result = new HashSet<JSONLDScript>();
+		Set<JSONLDScript> result = new HashSet<>();
 		extractor = new JSONLDExtractorFactory().createExtractor();
 		for (Node jsonldNode : scriptNodes) {
 			NamedNodeMap attributes = jsonldNode.getAttributes();

http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/core/src/main/java/org/apache/any23/extractor/html/GeoExtractor.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/html/GeoExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/GeoExtractor.java
index d85af79..ed7e5d3 100644
--- a/core/src/main/java/org/apache/any23/extractor/html/GeoExtractor.java
+++ b/core/src/main/java/org/apache/any23/extractor/html/GeoExtractor.java
@@ -50,7 +50,8 @@ public class GeoExtractor extends EntityBasedMicroformatExtractor {
     }
 
     protected boolean extractEntity(Node node, ExtractionResult out) {
-        if (null == node) return false;
+        if (null == node)
+            return false;
         //try lat & lon
         final HTMLDocument document = new HTMLDocument(node);
         HTMLDocument.TextField latNode = document.getSingularTextField("latitude" );
@@ -59,13 +60,13 @@ public class GeoExtractor extends EntityBasedMicroformatExtractor {
         String lon = lonNode.value();
         if ("".equals(lat) || "".equals(lon)) {
             String[] both = document.getSingularUrlField("geo").value().split(";");
-            if (both.length != 2) return false;
+            if (both.length != 2)
+                return false;
             lat = both[0];
             lon = both[1];
         }
         BNode geo = getBlankNodeFor(node);
         out.writeTriple(geo, RDF.TYPE, vVCARD.Location);
-        final String extractorName = getDescription().getExtractorName();
         conditionallyAddStringProperty(
                 latNode.source(),
                 geo, vVCARD.latitude , lat

http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java b/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java
index e6eb9cd..9ef72f4 100644
--- a/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java
+++ b/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java
@@ -25,8 +25,6 @@ import org.apache.xerces.xni.QName;
 import org.apache.xerces.xni.XMLAttributes;
 import org.apache.xerces.xni.XNIException;
 import org.cyberneko.html.parsers.DOMParser;
-import org.eclipse.rdf4j.model.IRI;
-import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.w3c.dom.Document;

http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/core/src/main/java/org/apache/any23/extractor/xpath/XPathExtractor.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/xpath/XPathExtractor.java b/core/src/main/java/org/apache/any23/extractor/xpath/XPathExtractor.java
index b04533c..1fe1b02 100644
--- a/core/src/main/java/org/apache/any23/extractor/xpath/XPathExtractor.java
+++ b/core/src/main/java/org/apache/any23/extractor/xpath/XPathExtractor.java
@@ -39,9 +39,10 @@ import java.util.List;
  */
 public class XPathExtractor implements Extractor.TagSoupDOMExtractor {
 
-    private final List<XPathExtractionRule> xPathExtractionRules = new ArrayList<XPathExtractionRule>();
+    private final List<XPathExtractionRule> xPathExtractionRules = new ArrayList<>();
 
     public XPathExtractor() {
+        //default constructor
     }
     
     public XPathExtractor(List<XPathExtractionRule> rules) {

http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/core/src/main/java/org/apache/any23/extractor/yaml/YAMLExtractor.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/yaml/YAMLExtractor.java b/core/src/main/java/org/apache/any23/extractor/yaml/YAMLExtractor.java
index 64548f1..19bccd1 100644
--- a/core/src/main/java/org/apache/any23/extractor/yaml/YAMLExtractor.java
+++ b/core/src/main/java/org/apache/any23/extractor/yaml/YAMLExtractor.java
@@ -17,8 +17,6 @@ package org.apache.any23.extractor.yaml;
 
 import java.io.IOException;
 import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.Arrays;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
@@ -29,9 +27,7 @@ import org.apache.any23.extractor.ExtractionResult;
 import org.apache.any23.extractor.Extractor;
 import org.apache.any23.extractor.ExtractorDescription;
 import org.apache.any23.rdf.RDFUtils;
-import org.apache.any23.util.StringUtils;
 import org.apache.any23.vocab.YAML;
-import org.apache.commons.lang.WordUtils;
 import org.eclipse.rdf4j.model.Resource;
 import org.eclipse.rdf4j.model.IRI;
 import org.eclipse.rdf4j.model.Value;
@@ -64,10 +60,10 @@ public class YAMLExtractor implements Extractor.ContentExtractor {
     public void run(ExtractionParameters extractionParameters, ExtractionContext context, InputStream in,
             ExtractionResult out)
             throws IOException, ExtractionException {
-        IRI documentURI = context.getDocumentIRI();
-        documentRoot = RDFUtils.uri(documentURI.toString() + "root");
+        IRI documentIRI = context.getDocumentIRI();
+        documentRoot = RDFUtils.iri(documentIRI.toString() + "root");
 
-        log.debug("process: {}", documentURI.toString());
+        log.debug("Processing: {}", documentIRI.toString());
         out.writeNamespace(vocab.PREFIX, vocab.NS);
         out.writeNamespace(RDF.PREFIX, RDF.NAMESPACE);
         out.writeNamespace(RDFS.PREFIX, RDFS.NAMESPACE);
@@ -77,10 +73,10 @@ public class YAMLExtractor implements Extractor.ContentExtractor {
 
         // Iterate over page(s)
         for (Object p : docIterate) {
-            Resource pageNode = YAMLExtractor.this.makeUri("document", documentURI);
+            Resource pageNode = RDFUtils.makeIRI("document", documentIRI, true);
             out.writeTriple(documentRoot, vocab.contains, pageNode);
             out.writeTriple(pageNode, RDF.TYPE, vocab.document);
-            out.writeTriple(pageNode, vocab.contains, buildNode(documentURI, p, out));
+            out.writeTriple(pageNode, vocab.contains, buildNode(documentIRI, p, out));
         }
 
     }
@@ -99,9 +95,9 @@ public class YAMLExtractor implements Extractor.ContentExtractor {
         if (treeData == null) {
             return RDF.NIL;
         } else if (treeData instanceof Map) {
-            return processMap(fileURI, (Map) treeData, out);
+            return processMap(fileURI, (Map<String, Object>) treeData, out);
         } else if (treeData instanceof List) {
-            return processList(fileURI, (List) treeData, out);
+            return processList(fileURI, (List<?>) treeData, out);
         } else if (treeData instanceof Long) {
             return RDFUtils.literal(((Long) treeData));
         } else if (treeData instanceof Integer) {
@@ -120,9 +116,9 @@ public class YAMLExtractor implements Extractor.ContentExtractor {
     }
 
     private Value processMap(IRI file, Map<String, Object> node, ExtractionResult out) {
-        Resource nodeURI = YAMLExtractor.this.makeUri(file);
+        Resource nodeURI = RDFUtils.makeIRI(file);
         for (String k : node.keySet()) {
-            Resource predicate = makeUri(k, file, false);
+            Resource predicate = RDFUtils.makeIRI(k, file, true);
             Value value = buildNode(file, node.get(k), out);
             out.writeTriple(nodeURI, RDF.TYPE, vocab.node);
             out.writeTriple(nodeURI, (IRI) predicate, value);
@@ -132,13 +128,13 @@ public class YAMLExtractor implements Extractor.ContentExtractor {
         return nodeURI;
     }
 
-    private Value processList(IRI fileURI, Iterable iter, ExtractionResult out) {
+    private Value processList(IRI fileURI, Iterable<?> iter, ExtractionResult out) {
         Resource node = YAMLExtractor.this.makeUri();
         out.writeTriple(node, RDF.TYPE, RDF.LIST);
 
         Resource pList = null; // previous RDF iter node
         Resource cList = node; // cutternt RDF iter node
-        Iterator listIter = iter.iterator();
+        Iterator<?> listIter = iter.iterator();
         while (listIter.hasNext()) {
             // If previous RDF iter node is given lint with current one
             if (pList != null) {
@@ -161,36 +157,4 @@ public class YAMLExtractor implements Extractor.ContentExtractor {
         nodeId++;
         return bnode;
     }
-
-    private Resource makeUri(IRI docUri) {
-        return makeUri("node", docUri);
-    }
-
-    private Resource makeUri(String type, IRI docUri) {
-        return makeUri(type, docUri, true);
-    }
-
-    private Resource makeUri(String type, IRI docUri, boolean addId) {
-
-        // preprocess string: converts - -> _
-        //                    converts <space>: word1 word2 -> word1Word2
-        String newType = StringUtils.implementJavaNaming(type);
-
-        String uriString;
-        if (docUri.toString().endsWith("/")) {
-            uriString = docUri.toString() + newType;
-        } else {
-            uriString = docUri.toString() + "#" + newType;
-        }
-
-        if (addId) {
-            uriString = uriString + "_" + Integer.toString(nodeId);
-        }
-
-        Resource node = RDFUtils.uri(uriString);
-        if (addId) {
-            nodeId++;
-        }
-        return node;
-    }
 }

http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/core/src/main/java/org/apache/any23/rdf/RDFUtils.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/rdf/RDFUtils.java b/core/src/main/java/org/apache/any23/rdf/RDFUtils.java
index bbfe5ec..f6e3a8c 100644
--- a/core/src/main/java/org/apache/any23/rdf/RDFUtils.java
+++ b/core/src/main/java/org/apache/any23/rdf/RDFUtils.java
@@ -18,7 +18,9 @@
 package org.apache.any23.rdf;
 
 import org.apache.any23.util.MathUtils;
+import org.apache.any23.util.StringUtils;
 import org.eclipse.rdf4j.model.BNode;
+import org.eclipse.rdf4j.model.IRI;
 import org.eclipse.rdf4j.model.Literal;
 import org.eclipse.rdf4j.model.Resource;
 import org.eclipse.rdf4j.model.Statement;
@@ -60,6 +62,8 @@ import java.util.Optional;
  */
 public class RDFUtils {
 
+    private static int nodeId = 0;
+
     private static final ValueFactory valueFactory = SimpleValueFactory.getInstance();
 
     /**
@@ -71,7 +75,8 @@ public class RDFUtils {
      */
     public static String fixAbsoluteIRI(String uri) {
         String fixed = fixIRIWithException(uri);
-        if (!fixed.matches("[a-zA-Z0-9]+:/.*")) throw new IllegalArgumentException("not a absolute org.eclipse.rdf4j.model.IRI: " + uri);
+        if (!fixed.matches("[a-zA-Z0-9]+:/.*"))
+            throw new IllegalArgumentException("not a absolute org.eclipse.rdf4j.model.IRI: " + uri);
         // Add trailing slash if org.eclipse.rdf4j.model.IRI has only authority but no path.
         if (fixed.matches("https?://[a-zA-Z0-9.-]+(:[0-9+])?")) {
             fixed = fixed + "/";
@@ -129,7 +134,8 @@ public class RDFUtils {
      * @return the unescaped string.
      */
     public static String fixIRIWithException(String unescapedIRI) {
-        if (unescapedIRI == null) throw new IllegalArgumentException("org.eclipse.rdf4j.model.IRI was null");
+        if (unescapedIRI == null)
+            throw new IllegalArgumentException("org.eclipse.rdf4j.model.IRI was null");
 
         //    Remove starting and ending whitespace
         String escapedIRI = unescapedIRI.trim();
@@ -141,7 +147,8 @@ public class RDFUtils {
         escapedIRI = escapedIRI.replaceAll("\n", "");
 
         //'Remove starting  "\" or '"'
-        if (escapedIRI.startsWith("\\") || escapedIRI.startsWith("\"")) escapedIRI = escapedIRI.substring(1);
+        if (escapedIRI.startsWith("\\") || escapedIRI.startsWith("\""))
+            escapedIRI = escapedIRI.substring(1);
         //Remove  ending   "\" or '"'
         if (escapedIRI.endsWith("\\") || escapedIRI.endsWith("\""))
             escapedIRI = escapedIRI.substring(0, escapedIRI.length() - 1);
@@ -406,7 +413,8 @@ public class RDFUtils {
      * @return a value instance.
      */
     public static Value toValue(String s) {
-        if ("a".equals(s)) return RDF.TYPE;
+        if ("a".equals(s))
+            return RDF.TYPE;
         if (s.matches("[a-z0-9]+:.*")) {
             return PopularPrefixes.get().expand(s);
         }
@@ -466,7 +474,8 @@ public class RDFUtils {
      * @throws IllegalArgumentException if no extension matches.
      */
     public static Optional<RDFFormat> getFormatByExtension(String ext) {
-        if( ! ext.startsWith(".") ) ext = "." + ext;
+        if( ! ext.startsWith(".") )
+            ext = "." + ext;
         return Rio.getParserFormatForFileName(ext);
     }
 
@@ -564,6 +573,37 @@ public class RDFUtils {
         }
     }
 
+    public static Resource makeIRI(IRI docUri) {
+        return makeIRI("node", docUri, true); // addId=true: every call must yield a distinct node IRI
+    }
+
+    public static Resource makeIRI(String type, IRI docIRI) {
+        return makeIRI(type, docIRI, false); // addId=false: no counter suffix is appended
+    }
+
+    /**
+     * Builds an IRI for <code>type</code> inside the document identified by <code>docIRI</code>.
+     *
+     * @param type local name; normalized with <code>StringUtils.implementJavaNaming</code>
+     *             ('-' becomes '_', "word1 word2" becomes "word1Word2").
+     * @param docIRI base IRI; the name is joined with '#' unless the IRI already ends with '/'.
+     * @param addId when <code>true</code>, "_" plus the shared node counter is appended and the
+     *              counter is advanced afterwards.
+     * @return the resource for the assembled IRI string.
+     */
+    public static Resource makeIRI(String type, IRI docIRI, boolean addId) {
+        final String localName = StringUtils.implementJavaNaming(type);
+        final String base = docIRI.toString();
+        final String separator = base.endsWith("/") ? "" : "#";
+        final StringBuilder iriString = new StringBuilder(base).append(separator).append(localName);
+        if (!addId) {
+            return RDFUtils.iri(iriString.toString());
+        }
+        iriString.append('_').append(nodeId);
+        final Resource node = RDFUtils.iri(iriString.toString());
+        nodeId++; // only advanced once the IRI was created without an exception
+        return node;
+    }
     private RDFUtils() {}
 
 }

http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/core/src/main/java/org/apache/any23/util/StreamUtils.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/util/StreamUtils.java b/core/src/main/java/org/apache/any23/util/StreamUtils.java
index 2022f0e..a456655 100644
--- a/core/src/main/java/org/apache/any23/util/StreamUtils.java
+++ b/core/src/main/java/org/apache/any23/util/StreamUtils.java
@@ -17,10 +17,17 @@
 
 package org.apache.any23.util;
 
+import org.apache.commons.io.ByteOrderMark;
+import org.apache.commons.io.input.BOMInputStream;
+import org.apache.xerces.impl.io.MalformedByteSequenceException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+import org.w3c.dom.Document;
+import org.xml.sax.SAXException;
 
 import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
 import java.io.Closeable;
 import java.io.IOException;
 import java.io.InputStream;
@@ -28,6 +35,18 @@ import java.io.InputStreamReader;
 import java.util.ArrayList;
 import java.util.List;
 
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.transform.Result;
+import javax.xml.transform.Source;
+import javax.xml.transform.TransformerConfigurationException;
+import javax.xml.transform.TransformerException;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.TransformerFactoryConfigurationError;
+import javax.xml.transform.dom.DOMSource;
+import javax.xml.transform.stream.StreamResult;
+
 /**
  * Contains general utility functions for handling streams.
  *
@@ -93,9 +112,9 @@ public class StreamUtils {
      * @return the string content.
      * @throws IOException if an error occurs while consuming the <code>is</code> stream.
      */
-     public static String asString(InputStream is) throws IOException {
-         return asString(is, false);
-     }
+    public static String asString(InputStream is) throws IOException {
+        return asString(is, false);
+    }
 
     /**
      * Closes the closable interface and reports error if any.
@@ -112,4 +131,48 @@ public class StreamUtils {
         }
     }
 
+    /**
+     * Serializes <code>doc</code> into an in-memory {@link java.io.InputStream}; a failed
+     * transformation is logged, not rethrown, so the stream may then be empty or partial.
+     * @throws TransformerFactoryConfigurationError if no {@link TransformerFactory} is available.
+     * @throws TransformerConfigurationException declared for callers; currently caught and logged.
+     */
+    public static InputStream documentToInputStream(Document doc)
+            throws TransformerConfigurationException, TransformerFactoryConfigurationError {
+        ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
+        Source xmlSource = new DOMSource(doc);
+        Result outputTarget = new StreamResult(outputStream);
+        try {
+            TransformerFactory.newInstance().newTransformer().transform(xmlSource, outputTarget);
+        } catch (TransformerException e) {
+            logger.error("Error during transformation", e); // no "{}": SLF4J takes e as the throwable
+        }
+        return new ByteArrayInputStream(outputStream.toByteArray());
+    }
+
+    /**
+     * Parses <code>is</code> into a {@link org.w3c.dom.Document}; a leading UTF-8/16/32 byte order
+     * mark is dropped first. Failures are logged rather than rethrown.
+     * @param is the stream to parse.
+     * @return the parsed document, or <code>null</code> if no parser could be built or parsing failed.
+     * @throws MalformedByteSequenceException kept in the signature; as an IOException subtype it is caught below.
+     */
+    public static Document inputStreamToDocument(InputStream is) throws MalformedByteSequenceException {
+        final DocumentBuilder builder;
+        try {
+            // NOTE(review): factory is not hardened against XXE; confirm inputs are trusted or restrict DTDs.
+            builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
+        } catch (ParserConfigurationException e) {
+            logger.error("Error converting InputStream to Document", e);
+            return null; // previously fell through to builder.parse() and died with a NullPointerException
+        }
+        try {
+            // This BOMInputStream constructor already excludes the BOM; an extra read() would eat the first real byte.
+            return builder.parse(new BOMInputStream(is, ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE,
+                    ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_32BE, ByteOrderMark.UTF_32LE));
+        } catch (SAXException | IOException e) {
+            logger.error("Error converting InputStream to Document", e);
+            return null;
+        }
+    }
 }

http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/core/src/test/java/org/apache/any23/extractor/yaml/YAMLExtractorTest.java
----------------------------------------------------------------------
diff --git a/core/src/test/java/org/apache/any23/extractor/yaml/YAMLExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/yaml/YAMLExtractorTest.java
index 0cf8d14..f2c85ba 100644
--- a/core/src/test/java/org/apache/any23/extractor/yaml/YAMLExtractorTest.java
+++ b/core/src/test/java/org/apache/any23/extractor/yaml/YAMLExtractorTest.java
@@ -27,7 +27,6 @@ import org.eclipse.rdf4j.model.Statement;
 import org.eclipse.rdf4j.model.vocabulary.RDF;
 import org.eclipse.rdf4j.model.vocabulary.RDFS;
 import org.eclipse.rdf4j.repository.RepositoryResult;
-import org.semarglproject.vocab.XSD;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 

http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/openie/pom.xml
----------------------------------------------------------------------
diff --git a/openie/pom.xml b/openie/pom.xml
new file mode 100644
index 0000000..799684d
--- /dev/null
+++ b/openie/pom.xml
@@ -0,0 +1,154 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <artifactId>apache-any23</artifactId>
+    <groupId>org.apache.any23</groupId>
+    <version>2.1-SNAPSHOT</version>
+    <relativePath></relativePath>
+  </parent>
+
+  <!--
+    Additional artifact and plugin repositories for this module; both entries point
+    at the same URL, share the same id and have snapshot resolution disabled.
+    Presumably this is where the org.allenai.openie artifacts declared in the
+    dependencies section are resolved from (TODO confirm).
+    NOTE(review): the URL uses plain http rather than https.
+  -->
+  <repositories>
+    <repository>
+      <snapshots>
+        <enabled>false</enabled>
+      </snapshots>
+      <id>bintray-allenai-maven</id>
+      <name>bintray</name>
+      <url>http://allenai.bintray.com/maven</url>
+    </repository>
+  </repositories>
+  <pluginRepositories>
+    <pluginRepository>
+      <snapshots>
+        <enabled>false</enabled>
+      </snapshots>
+      <id>bintray-allenai-maven</id>
+      <name>bintray-plugins</name>
+      <url>http://allenai.bintray.com/maven</url>
+    </pluginRepository>
+  </pluginRepositories>
+
+  <artifactId>apache-any23-openie</artifactId>
+
+  <name>Apache Any23 :: OpenIE</name>
+  <description>Open Information Extraction module.</description>
+
+  <dependencies>
+    <!-- Any23 core, pinned to the same version as this module. -->
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>apache-any23-core</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <!-- Test-jar of the shared test resources module; test scope only. -->
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>apache-any23-test-resources</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
+      <type>test-jar</type>
+    </dependency>
+    <!-- OpenIE library (the _2.11 suffix presumably denotes the Scala version). -->
+    <dependency>
+      <groupId>org.allenai.openie</groupId>
+      <artifactId>openie_2.11</artifactId>
+      <version>4.2.6</version>
+      <scope>compile</scope>
+    </dependency>
+    <!--
+      NOTE(review): same coordinates as the dependency above, but declared with
+      type pom; confirm whether both declarations are really needed.
+    -->
+    <dependency>
+      <groupId>org.allenai.openie</groupId>
+      <artifactId>openie_2.11</artifactId>
+      <version>4.2.6</version>
+      <scope>compile</scope>
+      <type>pom</type>
+    </dependency>
+    <!-- Needed at runtime only; not referenced at compile time by this module. -->
+    <dependency>
+      <groupId>edu.washington.cs.knowitall</groupId>
+      <artifactId>openregex</artifactId>
+      <version>1.1.1</version>
+      <scope>runtime</scope>
+    </dependency>
+    <!-- Test dependencies; no version given here, presumably managed by the parent POM. -->
+    <dependency>
+      <groupId>junit</groupId>
+      <artifactId>junit</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-log4j12</artifactId>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+
+  <build>
+    <!-- Copies LICENSE.txt and NOTICE.txt from the parent directory into META-INF. -->
+    <resources>
+      <resource>
+        <directory>${basedir}/../</directory>
+        <targetPath>META-INF</targetPath>
+        <includes>
+          <include>LICENSE.txt</include>
+          <include>NOTICE.txt</include>
+        </includes>
+      </resource>
+    </resources>
+    <!--
+      Assembly plugin settings: the "single" goal is bound to the package phase,
+      but skipAssembly is set to true in the configuration below.
+    -->
+    <pluginManagement>
+      <plugins>
+        <plugin>
+          <groupId>org.apache.maven.plugins</groupId>
+          <artifactId>maven-assembly-plugin</artifactId>
+          <version>${maven-assembly-plugin.version}</version>
+          <executions>
+            <execution>
+              <id>assembly</id>
+              <phase>package</phase>
+              <goals>
+                <goal>single</goal>
+              </goals>
+            </execution>
+          </executions>
+          <configuration>
+            <attach>true</attach>
+            <skipAssembly>true</skipAssembly>
+            <tarLongFileMode>gnu</tarLongFileMode>
+          </configuration>
+        </plugin>
+      </plugins>
+    </pluginManagement>
+  </build>
+
+  <profiles>
+    <!--
+      "release" profile: copies LICENSE.txt and NOTICE.txt from the parent directory
+      into apidocs/META-INF under the build directory.
+    -->
+    <profile>
+      <id>release</id>
+      <build>
+        <resources>
+          <resource>
+            <directory>${basedir}/../</directory>
+            <targetPath>${project.build.directory}/apidocs/META-INF</targetPath>
+            <includes>
+              <include>LICENSE.txt</include>
+              <include>NOTICE.txt</include>
+            </includes>
+          </resource>
+        </resources>
+      </build>
+    </profile>
+
+  </profiles>
+
+</project>

http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/openie/src/main/java/org/apache/any23/openie/OpenIEExtractor.java
----------------------------------------------------------------------
diff --git a/openie/src/main/java/org/apache/any23/openie/OpenIEExtractor.java b/openie/src/main/java/org/apache/any23/openie/OpenIEExtractor.java
new file mode 100644
index 0000000..b8fda29
--- /dev/null
+++ b/openie/src/main/java/org/apache/any23/openie/OpenIEExtractor.java
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.any23.openie;
+
+import java.io.IOException;
+import java.util.List;
+
+import javax.xml.transform.TransformerConfigurationException;
+import javax.xml.transform.TransformerFactoryConfigurationError;
+
+import org.apache.any23.extractor.Extractor;
+import org.apache.any23.configuration.Configuration;
+import org.apache.any23.configuration.DefaultConfiguration;
+import org.apache.any23.extractor.ExtractionContext;
+import org.apache.any23.extractor.ExtractorDescription;
+import org.apache.any23.rdf.RDFUtils;
+import org.apache.any23.util.StreamUtils;
+import org.apache.tika.Tika;
+import org.apache.tika.exception.TikaException;
+import org.eclipse.rdf4j.model.IRI;
+import org.eclipse.rdf4j.model.Resource;
+import org.eclipse.rdf4j.model.Value;
+import org.eclipse.rdf4j.model.vocabulary.RDF;
+import org.eclipse.rdf4j.model.vocabulary.RDFS;
+import org.apache.any23.extractor.ExtractionException;
+import org.apache.any23.extractor.ExtractionParameters;
+import org.apache.any23.extractor.ExtractionResult;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.Document;
+
+import edu.knowitall.openie.Argument;
+import edu.knowitall.openie.Instance;
+import edu.knowitall.openie.OpenIE;
+import edu.knowitall.tool.parse.ClearParser;
+import edu.knowitall.tool.postag.ClearPostagger;
+import edu.knowitall.tool.srl.ClearSrl;
+import edu.knowitall.tool.tokenize.ClearTokenizer;
+import scala.collection.JavaConversions;
+import scala.collection.Seq;
+
+/**
+ * An <a href="https://github.com/allenai/openie-standalone">OpenIE</a>
+ * extractor able to generate <i>RDF</i> statements from
+ * sentences representing relations in the text.
+ *
+ * <p>Only extractions whose confidence exceeds the value of the
+ * {@code any23.extraction.openie.confidence.threshold} configuration property
+ * (default {@code 0.5}) are written out.</p>
+ */
+public class OpenIEExtractor implements Extractor.TagSoupDOMExtractor {
+
+    private static final Logger LOG = LoggerFactory.getLogger(OpenIEExtractor.class);
+
+    /** Configuration property holding the confidence an extraction must exceed. */
+    private static final String CONFIDENCE_THRESHOLD_PROPERTY =
+            "any23.extraction.openie.confidence.threshold";
+
+    /** Threshold used when {@link #CONFIDENCE_THRESHOLD_PROPERTY} is not configured. */
+    private static final String DEFAULT_CONFIDENCE_THRESHOLD = "0.5";
+
+    /**
+     * Package-private constructor; instances are handed out by
+     * {@link OpenIEExtractorFactory#createExtractor()}.
+     */
+    OpenIEExtractor() {
+        // nothing to initialise
+    }
+
+    /**
+     * @see org.apache.any23.extractor.Extractor#getDescription()
+     */
+    @Override
+    public ExtractorDescription getDescription() {
+        return OpenIEExtractorFactory.getDescriptionInstance();
+    }
+
+    /**
+     * Converts the document to plain text, runs OpenIE over it and writes one triple
+     * for every second argument of each extraction whose confidence exceeds the
+     * configured threshold.
+     *
+     * <p>If the text cannot be obtained from the document the failure is logged and
+     * no triples are written.</p>
+     *
+     * @param extractionParameters extraction parameters (not consulted by this extractor)
+     * @param context supplies the IRI of the document being processed
+     * @param in DOM of the document to analyse
+     * @param out sink receiving the namespaces and the generated triples
+     * @throws IOException if reading the document content fails
+     * @throws ExtractionException declared by the extractor contract
+     */
+    @Override
+    public void run(ExtractionParameters extractionParameters,
+            ExtractionContext context, Document in, ExtractionResult out)
+                    throws IOException, ExtractionException {
+
+        final IRI documentIRI = context.getDocumentIRI();
+        out.writeNamespace(RDF.PREFIX, RDF.NAMESPACE);
+        out.writeNamespace(RDFS.PREFIX, RDFS.NAMESPACE);
+        LOG.debug("Processing: {}", documentIRI);
+
+        // Obtain the plain text before building the OpenIE pipeline, so that a
+        // document which cannot be converted does not pay for the pipeline set-up.
+        final String text;
+        try {
+            text = new Tika().parseToString(StreamUtils.documentToInputStream(in));
+        } catch (TransformerConfigurationException | TransformerFactoryConfigurationError e) {
+            LOG.error("Encountered error during OpenIE extraction.", e);
+            return;
+        } catch (TikaException e) {
+            LOG.error("Encountered error whilst parsing InputStream with Tika.", e);
+            return;
+        }
+
+        final OpenIE openIE = new OpenIE(
+                new ClearParser(
+                        new ClearPostagger(
+                                new ClearTokenizer())), new ClearSrl(), false, false);
+
+        final Seq<Instance> extractions = openIE.extract(text);
+        if (extractions == null) {
+            // Nothing to iterate over; avoid wrapping a null sequence.
+            return;
+        }
+
+        // The threshold does not depend on the instance, so resolve it once.
+        final Configuration immutableConf = DefaultConfiguration.singleton();
+        final double confidenceThreshold = Double.parseDouble(
+                immutableConf.getProperty(CONFIDENCE_THRESHOLD_PROPERTY, DEFAULT_CONFIDENCE_THRESHOLD));
+
+        List<Instance> listExtractions = JavaConversions.seqAsJavaList(extractions);
+        // for each extraction instance we can obtain a number of extraction elements
+        // instance.confidence() - a confidence value for the extraction itself
+        // instance.extr().context() - an optional representation of the context for this extraction
+        // instance.extr().arg1().text() - subject
+        // instance.extr().rel().text() - predicate
+        // instance.extr().arg2s().text() - object
+        for (Instance instance : listExtractions) {
+            if (instance.confidence() > confidenceThreshold) {
+                List<Argument> listArg2s = JavaConversions.seqAsJavaList(instance.extr().arg2s());
+                for (Argument argument : listArg2s) {
+                    Resource subject = RDFUtils.makeIRI(instance.extr().arg1().text(), documentIRI);
+                    IRI predicate = (IRI) RDFUtils.makeIRI(instance.extr().rel().text(), documentIRI);
+                    Value object = RDFUtils.toValue(argument.text());
+                    out.writeTriple(subject, predicate, object);
+                }
+            }
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/openie/src/main/java/org/apache/any23/openie/OpenIEExtractorFactory.java
----------------------------------------------------------------------
diff --git a/openie/src/main/java/org/apache/any23/openie/OpenIEExtractorFactory.java b/openie/src/main/java/org/apache/any23/openie/OpenIEExtractorFactory.java
new file mode 100644
index 0000000..4a1696a
--- /dev/null
+++ b/openie/src/main/java/org/apache/any23/openie/OpenIEExtractorFactory.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.any23.openie;
+
+import java.util.Arrays;
+
+import org.apache.any23.extractor.ExtractorDescription;
+import org.apache.any23.extractor.ExtractorFactory;
+import org.apache.any23.extractor.SimpleExtractorFactory;
+import org.apache.any23.rdf.Prefixes;
+
+/**
+ * Factory producing {@link OpenIEExtractor} instances and, through
+ * {@link #getDescriptionInstance()}, exposing a shared description of the extractor.
+ *
+ * @author lewismc
+ *
+ */
+public class OpenIEExtractorFactory extends SimpleExtractorFactory<OpenIEExtractor>
+    implements ExtractorFactory<OpenIEExtractor> {
+
+    /** Name passed to the superclass constructor for this extractor. */
+    public static final String NAME = "openie";
+
+    /** No prefixes are supplied for this extractor; {@code null} is passed to the superclass. */
+    public static final Prefixes prefixes = null;
+
+    // Shared instance returned by getDescriptionInstance(). Declared after NAME and
+    // prefixes because the constructor it invokes reads both of them.
+    private static final ExtractorDescription descriptionInstance = new OpenIEExtractorFactory();
+
+    /**
+     * Creates the factory for HTML and XHTML content, each listed with {@code q=0.1}.
+     */
+    public OpenIEExtractorFactory() {
+        // Arguments are presumably: name, prefixes, supported MIME types and the
+        // example input resource - verify against SimpleExtractorFactory.
+        super(NAME, prefixes, Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"), "example-openie.html");
+    }
+
+    /**
+     * @return a new {@link OpenIEExtractor} on every call
+     */
+    @Override
+    public OpenIEExtractor createExtractor() {
+        return new OpenIEExtractor();
+    }
+
+    /**
+     * @return the shared description instance created when this class is initialised
+     */
+    public static ExtractorDescription getDescriptionInstance() {
+        return descriptionInstance;
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/openie/src/test/java/org/apache/any23/openie/OpenIEExtractorTest.java
----------------------------------------------------------------------
diff --git a/openie/src/test/java/org/apache/any23/openie/OpenIEExtractorTest.java b/openie/src/test/java/org/apache/any23/openie/OpenIEExtractorTest.java
new file mode 100644
index 0000000..3561bdd
--- /dev/null
+++ b/openie/src/test/java/org/apache/any23/openie/OpenIEExtractorTest.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.any23.openie;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+
+import org.apache.any23.extractor.ExtractionContext;
+import org.apache.any23.extractor.ExtractionException;
+import org.apache.any23.extractor.ExtractionParameters;
+import org.apache.any23.extractor.ExtractionResult;
+import org.apache.any23.extractor.ExtractionResultImpl;
+import org.apache.any23.rdf.RDFUtils;
+import org.apache.any23.util.StreamUtils;
+import org.apache.any23.writer.RDFXMLWriter;
+import org.apache.any23.writer.TripleHandler;
+import org.apache.any23.writer.TripleHandlerException;
+import org.eclipse.rdf4j.model.IRI;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Smoke test running {@link OpenIEExtractor} over a bundled HTML document.
+ *
+ * @author lewismc
+ *
+ */
+public class OpenIEExtractorTest {
+
+    private static final Logger logger = LoggerFactory.getLogger(OpenIEExtractorTest.class);
+
+    private OpenIEExtractor extractor;
+
+    @Before
+    public void setUp() throws Exception {
+        extractor = new OpenIEExtractor();
+    }
+
+    @After
+    public void tearDown() throws Exception {
+        extractor = null;
+    }
+
+    //@Ignore("This typically results in a JVM crash... disabled for the time being.")
+    @Test
+    public void testExtractFromHTMLDocument()
+      throws IOException, ExtractionException, TripleHandlerException {
+        final IRI uri = RDFUtils.iri("http://podaac.jpl.nasa.gov/aquarius");
+        extract(uri, "/org/apache/any23/extractor/openie/example-openie.html");
+    }
+
+    /**
+     * Runs the extractor over a classpath resource, serialising the triples as
+     * RDF/XML into an in-memory buffer that is logged at debug level.
+     *
+     * @param uri IRI used as the document IRI of the extraction context
+     * @param filePath classpath location of the document to process
+     * @throws IOException if the document cannot be read or the buffer cannot be decoded
+     * @throws ExtractionException if the extractor reports a failure
+     * @throws TripleHandlerException if the writer cannot be closed
+     */
+    public void extract(IRI uri, String filePath)
+      throws IOException, ExtractionException, TripleHandlerException {
+      ByteArrayOutputStream baos = new ByteArrayOutputStream();
+      final TripleHandler tHandler = new RDFXMLWriter(baos);
+      final ExtractionContext extractionContext = new ExtractionContext("rdf-openie", uri);
+      final ExtractionResult result = new ExtractionResultImpl(extractionContext, extractor, tHandler);
+      try {
+        extractor.run(
+                ExtractionParameters.newDefault(),
+                extractionContext,
+                StreamUtils.inputStreamToDocument(this.getClass().getResourceAsStream(filePath)),
+                result
+        );
+      } finally {
+        try {
+          // Close the result before the handler it writes to.
+          result.close();
+        } finally {
+          tHandler.close();
+          // Log only after the writer is closed, so that anything it emits on
+          // close is part of the logged output; decode with an explicit charset
+          // rather than the platform default.
+          logger.debug(baos.toString("UTF-8"));
+        }
+      }
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 23ab57f..fffc7b5 100644
--- a/pom.xml
+++ b/pom.xml
@@ -204,6 +204,7 @@
     <module>encoding</module>
     <module>core</module>
     <module>cli</module>
+    <module>openie</module>
     <module>plugins/basic-crawler</module>
     <module>plugins/html-scraper</module>
     <module>plugins/office-scraper</module>