You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@any23.apache.org by jg...@apache.org on 2017/09/13 10:55:27 UTC
[4/6] any23 git commit: Merge branch 'master' into ANY23-311
Merge branch 'master' into ANY23-311
- Resolve conflict in YAMLExtractor.java
Signed-off-by: Jacek Grzebyta <gr...@gmail.com>
Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/94caa68e
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/94caa68e
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/94caa68e
Branch: refs/heads/master
Commit: 94caa68ec6f0a86281c147667e75bbd044e4f658
Parents: a56d549 c40b788
Author: Jacek Grzebyta <gr...@gmail.com>
Authored: Tue Aug 29 12:41:16 2017 +0100
Committer: Jacek Grzebyta <gr...@gmail.com>
Committed: Tue Aug 29 12:41:16 2017 +0100
----------------------------------------------------------------------
.../configuration/DefaultConfiguration.java | 23 +-
.../DefaultModifiableConfiguration.java | 4 +-
.../apache/any23/extractor/ExtractorGroup.java | 14 +-
.../java/org/apache/any23/vocab/Vocabulary.java | 26 +-
.../resources/default-configuration.properties | 4 +
cli/pom.xml | 22 +
.../org/apache/any23/cli/PluginVerifier.java | 8 +-
.../main/java/org/apache/any23/cli/Rover.java | 10 +-
.../java/org/apache/any23/cli/ToolRunner.java | 7 +-
.../org/apache/any23/cli/ToolRunnerTest.java | 15 +-
.../any23/extractor/ExtractorRegistryImpl.java | 30 +-
.../extractor/SingleDocumentExtraction.java | 6 +-
.../extractor/html/EmbeddedJSONLDExtractor.java | 4 +-
.../any23/extractor/html/GeoExtractor.java | 7 +-
.../any23/extractor/html/TagSoupParser.java | 2 -
.../any23/extractor/xpath/XPathExtractor.java | 3 +-
.../any23/extractor/yaml/YAMLExtractor.java | 18 +-
.../java/org/apache/any23/rdf/RDFUtils.java | 50 +-
.../java/org/apache/any23/util/StreamUtils.java | 69 +-
.../any23/extractor/ExtractionAPITest.java | 4 +-
.../extractor/ExtractionResultImplTest.java | 2 +-
csvutils/pom.xml | 2 +-
encoding/pom.xml | 2 +-
openie/pom.xml | 153 +++++
.../any23/extractor/openie/OpenIEExtractor.java | 130 ++++
.../openie/OpenIEExtractorFactory.java | 52 ++
.../org.apache.any23.extractor.ExtractorFactory | 1 +
.../any23/openie/OpenIEExtractorTest.java | 88 +++
plugins/basic-crawler/pom.xml | 53 +-
plugins/html-scraper/pom.xml | 19 -
plugins/integration-test/pom.xml | 16 +-
.../java/org/apache/any23/plugin/PluginIT.java | 40 +-
plugins/office-scraper/pom.xml | 19 -
pom.xml | 19 +-
service/pom.xml | 2 +-
src/site/apt/any23-plugins.apt | 16 +-
src/site/apt/configuration.apt | 8 +-
src/site/apt/dev-csv-extractor.apt | 2 +-
src/site/apt/dev-data-conversion.apt | 20 +-
src/site/apt/dev-data-extraction.apt | 20 +-
src/site/apt/dev-microformat-extractors.apt | 12 +-
src/site/apt/dev-validation-fix.apt | 12 +-
src/site/apt/dev-xpath-extractor.apt | 2 +-
src/site/apt/extractors.apt | 50 +-
src/site/apt/getting-started.apt | 2 +-
src/site/apt/plugin-basic-crawler.apt | 4 +-
src/site/apt/plugin-office-scraper.apt | 2 +-
.../any23/extractor/openie/example-openie.html | 638 +++++++++++++++++++
48 files changed, 1451 insertions(+), 261 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/any23/blob/94caa68e/core/src/main/java/org/apache/any23/extractor/yaml/YAMLExtractor.java
----------------------------------------------------------------------
diff --cc core/src/main/java/org/apache/any23/extractor/yaml/YAMLExtractor.java
index bf70b63,1e968c0..4eae6b9
--- a/core/src/main/java/org/apache/any23/extractor/yaml/YAMLExtractor.java
+++ b/core/src/main/java/org/apache/any23/extractor/yaml/YAMLExtractor.java
@@@ -75,10 -74,10 +76,10 @@@ public class YAMLExtractor implements E
// Iterate over page(s)
for (Object p : docIterate) {
- Resource pageNode = YAMLExtractor.this.makeUri("document", documentURI);
+ Resource pageNode = RDFUtils.makeIRI("document", documentIRI, true);
out.writeTriple(documentRoot, vocab.contains, pageNode);
out.writeTriple(pageNode, RDF.TYPE, vocab.document);
- buildNode(documentURI, p, out, pageNode);
- out.writeTriple(pageNode, vocab.contains, buildNode(documentIRI, p, out));
++ buildNode(documentIRI, p, out, pageNode);
}
}
@@@ -117,13 -116,12 +118,14 @@@
}
}
- private Value processMap(IRI file, Map<String, Object> node, ExtractionResult out) {
- Resource nodeURI = RDFUtils.makeIRI(file);
+ private Value processMap(IRI file, Map<String, Object> node, ExtractionResult out, Resource... parent) {
+ Resource nodeURI = Arrays.asList(parent).isEmpty() ? YAMLExtractor.this.makeUri(file) : parent[0];
+
++
for (String k : node.keySet()) {
- Resource predicate = makeUri(k, file, false);
+ Resource predicate = RDFUtils.makeIRI(k, file, true);
Value value = buildNode(file, node.get(k), out);
- out.writeTriple(nodeURI, RDF.TYPE, vocab.node);
+ out.writeTriple(nodeURI, RDF.TYPE, vocab.mapping);
out.writeTriple(nodeURI, (IRI) predicate, value);
out.writeTriple(predicate, RDF.TYPE, RDF.PREDICATE);
out.writeTriple(predicate, RDFS.LABEL, RDFUtils.literal(k));
@@@ -172,36 -158,4 +174,36 @@@
nodeId++;
return bnode;
}
+
+ private Resource makeUri(IRI docUri) {
+ return makeUri("node", docUri);
- }
++}
+
+ private Resource makeUri(String type, IRI docUri) {
+ return makeUri(type, docUri, true);
+ }
+
+ private Resource makeUri(String type, IRI docUri, boolean addId) {
+
+ // preprocess string: converts - -> _
+ // converts <space>: word1 word2 -> word1Word2
+ String newType = StringUtils.implementJavaNaming(type);
+
+ String uriString;
+ if (docUri.toString().endsWith("/")) {
+ uriString = docUri.toString() + newType;
+ } else {
+ uriString = docUri.toString() + "#" + newType;
+ }
+
+ if (addId) {
+ uriString = uriString + "_" + Integer.toString(nodeId);
+ }
+
+ Resource node = RDFUtils.uri(uriString);
+ if (addId) {
+ nodeId++;
+ }
+ return node;
+ }
}