You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jena.apache.org by rv...@apache.org on 2015/01/05 16:07:46 UTC
[38/52] [abbrv] jena git commit: Further rebranding to Elephas
http://git-wip-us.apache.org/repos/asf/jena/blob/49c4cffe/jena-elephas/jena-elephas-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/split/AbstractQuadSplitWithNodesTests.java
----------------------------------------------------------------------
diff --git a/jena-elephas/jena-elephas-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/split/AbstractQuadSplitWithNodesTests.java b/jena-elephas/jena-elephas-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/split/AbstractQuadSplitWithNodesTests.java
new file mode 100644
index 0000000..80517b2
--- /dev/null
+++ b/jena-elephas/jena-elephas-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/split/AbstractQuadSplitWithNodesTests.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.hadoop.rdf.mapreduce.split;
+
+import org.apache.jena.hadoop.rdf.mapreduce.split.AbstractNodeTupleSplitToNodesMapper;
+import org.apache.jena.hadoop.rdf.types.NodeWritable;
+import org.apache.jena.hadoop.rdf.types.QuadWritable;
+
+import com.hp.hpl.jena.datatypes.xsd.XSDDatatype;
+import com.hp.hpl.jena.graph.NodeFactory;
+import com.hp.hpl.jena.graph.Triple;
+import com.hp.hpl.jena.sparql.core.Quad;
+
+/**
+ * Abstract tests for {@link AbstractNodeTupleSplitToNodesMapper}
+ * implementations that work on Quads. Values are quads in the generated default
+ * graph with an index-derived subject and integer object; nodes are listed as
+ * graph, subject, predicate, object. NOTE(review): the base class is the
+ * WithNodes variant, so confirm the link above targets the intended mapper.
+ */
+public abstract class AbstractQuadSplitWithNodesTests extends AbstractNodeTupleSplitWithNodesTests<Quad, QuadWritable> {
+
+ @Override
+ protected QuadWritable createValue(int i) {
+ return new QuadWritable(new Quad(Quad.defaultGraphNodeGenerated, new Triple(
+ NodeFactory.createURI("http://subjects/" + i), NodeFactory.createURI("http://predicate"),
+ NodeFactory.createLiteral(Integer.toString(i), XSDDatatype.XSDinteger))));
+ }
+
+ @Override
+ protected NodeWritable[] getNodes(QuadWritable tuple) {
+ Quad q = tuple.get();
+ return new NodeWritable[] { new NodeWritable(q.getGraph()), new NodeWritable(q.getSubject()),
+ new NodeWritable(q.getPredicate()), new NodeWritable(q.getObject()) };
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/jena/blob/49c4cffe/jena-elephas/jena-elephas-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/split/AbstractTripleSplitToNodesTests.java
----------------------------------------------------------------------
diff --git a/jena-elephas/jena-elephas-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/split/AbstractTripleSplitToNodesTests.java b/jena-elephas/jena-elephas-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/split/AbstractTripleSplitToNodesTests.java
new file mode 100644
index 0000000..7e497ab
--- /dev/null
+++ b/jena-elephas/jena-elephas-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/split/AbstractTripleSplitToNodesTests.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.hadoop.rdf.mapreduce.split;
+
+import org.apache.jena.hadoop.rdf.mapreduce.split.AbstractNodeTupleSplitToNodesMapper;
+import org.apache.jena.hadoop.rdf.types.NodeWritable;
+import org.apache.jena.hadoop.rdf.types.TripleWritable;
+
+import com.hp.hpl.jena.datatypes.xsd.XSDDatatype;
+import com.hp.hpl.jena.graph.NodeFactory;
+import com.hp.hpl.jena.graph.Triple;
+
+/**
+ * Abstract tests for {@link AbstractNodeTupleSplitToNodesMapper}
+ * implementations that work on Triples. Each test value is a triple whose
+ * subject URI and integer literal object are derived from the index, sharing
+ * one fixed predicate; its nodes are listed in subject, predicate, object
+ * order.
+ */
+public abstract class AbstractTripleSplitToNodesTests extends AbstractNodeTupleSplitToNodesTests<Triple, TripleWritable> {
+
+ @Override
+ protected TripleWritable createValue(int i) {
+ return new TripleWritable(
+ new Triple(NodeFactory.createURI("http://subjects/" + i), NodeFactory.createURI("http://predicate"),
+ NodeFactory.createLiteral(Integer.toString(i), XSDDatatype.XSDinteger)));
+ }
+
+ @Override
+ protected NodeWritable[] getNodes(TripleWritable tuple) {
+ Triple t = tuple.get();
+ return new NodeWritable[] { new NodeWritable(t.getSubject()), new NodeWritable(t.getPredicate()),
+ new NodeWritable(t.getObject()) };
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/jena/blob/49c4cffe/jena-elephas/jena-elephas-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/split/AbstractTripleSplitWithNodesTests.java
----------------------------------------------------------------------
diff --git a/jena-elephas/jena-elephas-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/split/AbstractTripleSplitWithNodesTests.java b/jena-elephas/jena-elephas-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/split/AbstractTripleSplitWithNodesTests.java
new file mode 100644
index 0000000..babcad1
--- /dev/null
+++ b/jena-elephas/jena-elephas-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/split/AbstractTripleSplitWithNodesTests.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.hadoop.rdf.mapreduce.split;
+
+import org.apache.jena.hadoop.rdf.mapreduce.split.AbstractNodeTupleSplitToNodesMapper;
+import org.apache.jena.hadoop.rdf.types.NodeWritable;
+import org.apache.jena.hadoop.rdf.types.TripleWritable;
+
+import com.hp.hpl.jena.datatypes.xsd.XSDDatatype;
+import com.hp.hpl.jena.graph.NodeFactory;
+import com.hp.hpl.jena.graph.Triple;
+
+/**
+ * Abstract tests for {@link AbstractNodeTupleSplitToNodesMapper}
+ * implementations that work on Triples. Values are triples with an
+ * index-derived subject and integer object; nodes are listed as subject,
+ * predicate, object. NOTE(review): the base class is the WithNodes variant,
+ * so confirm the link above targets the intended mapper.
+ */
+public abstract class AbstractTripleSplitWithNodesTests extends AbstractNodeTupleSplitWithNodesTests<Triple, TripleWritable> {
+
+ @Override
+ protected TripleWritable createValue(int i) {
+ return new TripleWritable(
+ new Triple(NodeFactory.createURI("http://subjects/" + i), NodeFactory.createURI("http://predicate"),
+ NodeFactory.createLiteral(Integer.toString(i), XSDDatatype.XSDinteger)));
+ }
+
+ @Override
+ protected NodeWritable[] getNodes(TripleWritable tuple) {
+ Triple t = tuple.get();
+ return new NodeWritable[] { new NodeWritable(t.getSubject()), new NodeWritable(t.getPredicate()),
+ new NodeWritable(t.getObject()) };
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/jena/blob/49c4cffe/jena-elephas/jena-elephas-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/split/QuadSplitToNodesMapperTest.java
----------------------------------------------------------------------
diff --git a/jena-elephas/jena-elephas-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/split/QuadSplitToNodesMapperTest.java b/jena-elephas/jena-elephas-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/split/QuadSplitToNodesMapperTest.java
new file mode 100644
index 0000000..61058c6
--- /dev/null
+++ b/jena-elephas/jena-elephas-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/split/QuadSplitToNodesMapperTest.java
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.hadoop.rdf.mapreduce.split;
+
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.jena.hadoop.rdf.mapreduce.split.QuadSplitToNodesMapper;
+import org.apache.jena.hadoop.rdf.types.NodeWritable;
+import org.apache.jena.hadoop.rdf.types.QuadWritable;
+
+
+/**
+ * Tests for the {@link QuadSplitToNodesMapper}
+ * <p>
+ * Supplies the mapper under test, which takes {@code QuadWritable} values and
+ * emits {@code NodeWritable} values under {@code LongWritable} keys.
+ */
+public class QuadSplitToNodesMapperTest extends AbstractQuadSplitToNodesTests {
+
+ @Override
+ protected Mapper<LongWritable, QuadWritable, LongWritable, NodeWritable> getInstance() {
+ return new QuadSplitToNodesMapper<LongWritable>();
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/jena/blob/49c4cffe/jena-elephas/jena-elephas-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/split/QuadSplitWithNodesMapperTest.java
----------------------------------------------------------------------
diff --git a/jena-elephas/jena-elephas-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/split/QuadSplitWithNodesMapperTest.java b/jena-elephas/jena-elephas-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/split/QuadSplitWithNodesMapperTest.java
new file mode 100644
index 0000000..a171ffb
--- /dev/null
+++ b/jena-elephas/jena-elephas-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/split/QuadSplitWithNodesMapperTest.java
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.hadoop.rdf.mapreduce.split;
+
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.jena.hadoop.rdf.mapreduce.split.QuadSplitWithNodesMapper;
+import org.apache.jena.hadoop.rdf.types.NodeWritable;
+import org.apache.jena.hadoop.rdf.types.QuadWritable;
+
+
+/**
+ * Tests for the {@link QuadSplitWithNodesMapper}
+ * <p>
+ * Supplies the mapper under test, which takes {@code QuadWritable} values and
+ * emits {@code NodeWritable} values under {@code QuadWritable} keys.
+ */
+public class QuadSplitWithNodesMapperTest extends AbstractQuadSplitWithNodesTests {
+
+ @Override
+ protected Mapper<LongWritable, QuadWritable, QuadWritable, NodeWritable> getInstance() {
+ return new QuadSplitWithNodesMapper<LongWritable>();
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/jena/blob/49c4cffe/jena-elephas/jena-elephas-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/split/TripleSplitToNodesMapperTest.java
----------------------------------------------------------------------
diff --git a/jena-elephas/jena-elephas-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/split/TripleSplitToNodesMapperTest.java b/jena-elephas/jena-elephas-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/split/TripleSplitToNodesMapperTest.java
new file mode 100644
index 0000000..d91efca
--- /dev/null
+++ b/jena-elephas/jena-elephas-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/split/TripleSplitToNodesMapperTest.java
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.hadoop.rdf.mapreduce.split;
+
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.jena.hadoop.rdf.mapreduce.split.TripleSplitToNodesMapper;
+import org.apache.jena.hadoop.rdf.types.NodeWritable;
+import org.apache.jena.hadoop.rdf.types.TripleWritable;
+
+
+/**
+ * Tests for the {@link TripleSplitToNodesMapper}
+ * <p>
+ * Supplies the mapper under test, which takes {@code TripleWritable} values and
+ * emits {@code NodeWritable} values under {@code LongWritable} keys.
+ */
+public class TripleSplitToNodesMapperTest extends AbstractTripleSplitToNodesTests {
+
+ @Override
+ protected Mapper<LongWritable, TripleWritable, LongWritable, NodeWritable> getInstance() {
+ return new TripleSplitToNodesMapper<LongWritable>();
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/jena/blob/49c4cffe/jena-elephas/jena-elephas-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/split/TripleSplitWithNodesMapperTest.java
----------------------------------------------------------------------
diff --git a/jena-elephas/jena-elephas-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/split/TripleSplitWithNodesMapperTest.java b/jena-elephas/jena-elephas-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/split/TripleSplitWithNodesMapperTest.java
new file mode 100644
index 0000000..3b71f40
--- /dev/null
+++ b/jena-elephas/jena-elephas-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/split/TripleSplitWithNodesMapperTest.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.hadoop.rdf.mapreduce.split;
+
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.jena.hadoop.rdf.mapreduce.split.TripleSplitToNodesMapper;
+import org.apache.jena.hadoop.rdf.mapreduce.split.TripleSplitWithNodesMapper;
+import org.apache.jena.hadoop.rdf.types.NodeWritable;
+import org.apache.jena.hadoop.rdf.types.TripleWritable;
+
+
+/**
+ * Tests for the {@link TripleSplitWithNodesMapper}
+ * <p>
+ * Supplies the mapper under test, which takes {@code TripleWritable} values and
+ * emits {@code NodeWritable} values under {@code TripleWritable} keys.
+ */
+public class TripleSplitWithNodesMapperTest extends AbstractTripleSplitWithNodesTests {
+
+ @Override
+ protected Mapper<LongWritable, TripleWritable, TripleWritable, NodeWritable> getInstance() {
+ return new TripleSplitWithNodesMapper<LongWritable>();
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/jena/blob/49c4cffe/jena-elephas/jena-elephas-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/transform/QuadsToTriplesMapperTest.java
----------------------------------------------------------------------
diff --git a/jena-elephas/jena-elephas-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/transform/QuadsToTriplesMapperTest.java b/jena-elephas/jena-elephas-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/transform/QuadsToTriplesMapperTest.java
new file mode 100644
index 0000000..51b29cb
--- /dev/null
+++ b/jena-elephas/jena-elephas-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/transform/QuadsToTriplesMapperTest.java
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.hadoop.rdf.mapreduce.transform;
+
+import java.io.IOException;
+
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mrunit.mapreduce.MapDriver;
+import org.apache.hadoop.mrunit.types.Pair;
+import org.apache.jena.hadoop.rdf.mapreduce.AbstractMapperTests;
+import org.apache.jena.hadoop.rdf.mapreduce.transform.QuadsToTriplesMapper;
+import org.apache.jena.hadoop.rdf.types.QuadWritable;
+import org.apache.jena.hadoop.rdf.types.TripleWritable;
+import org.junit.Test;
+
+import com.hp.hpl.jena.datatypes.xsd.XSDDatatype;
+import com.hp.hpl.jena.graph.NodeFactory;
+import com.hp.hpl.jena.graph.Triple;
+import com.hp.hpl.jena.sparql.core.Quad;
+
+/**
+ * Tests for the {@link QuadsToTriplesMapper}
+ * <p>
+ * Each test feeds quads in the generated default graph to the mapper and expects
+ * the triple of every quad back under the same key.
+ */
+public class QuadsToTriplesMapperTest extends AbstractMapperTests<LongWritable, QuadWritable, LongWritable, TripleWritable> {
+
+ @Override
+ protected Mapper<LongWritable, QuadWritable, LongWritable, TripleWritable> getInstance() {
+ return new QuadsToTriplesMapper<LongWritable>();
+ }
+ /** Queues {@code num} generated quads as input, each with its triple as the expected output. */
+ protected void generateData(MapDriver<LongWritable, QuadWritable, LongWritable, TripleWritable> driver, int num) {
+ for (int i = 0; i < num; i++) {
+ Triple t = new Triple(NodeFactory.createURI("http://subjects/" + i), NodeFactory.createURI("http://predicate"),
+ NodeFactory.createLiteral(Integer.toString(i), XSDDatatype.XSDinteger));
+ Quad q = new Quad(Quad.defaultGraphNodeGenerated, t);
+ driver.addInput(new LongWritable(i), new QuadWritable(q));
+ driver.addOutput(new LongWritable(i), new TripleWritable(t));
+ }
+ }
+
+ /**
+ * Tests quads to triples conversion of a single quad
+ *
+ * @throws IOException if the test driver reports an I/O error
+ */
+ @Test
+ public void quads_to_triples_mapper_01() throws IOException {
+ MapDriver<LongWritable, QuadWritable, LongWritable, TripleWritable> driver = this.getMapDriver();
+
+ Triple t = new Triple(NodeFactory.createURI("http://s"), NodeFactory.createURI("http://p"),
+ NodeFactory.createLiteral("test"));
+ Quad q = new Quad(Quad.defaultGraphNodeGenerated, t);
+ driver.withInput(new Pair<LongWritable, QuadWritable>(new LongWritable(1), new QuadWritable(q))).withOutput(
+ new Pair<LongWritable, TripleWritable>(new LongWritable(1), new TripleWritable(t)));
+ driver.runTest();
+ }
+
+ /**
+ * Tests quads to triples conversion of 100 generated quads
+ *
+ * @throws IOException if the test driver reports an I/O error
+ */
+ @Test
+ public void quads_to_triples_mapper_02() throws IOException {
+ MapDriver<LongWritable, QuadWritable, LongWritable, TripleWritable> driver = this.getMapDriver();
+ this.generateData(driver, 100);
+ driver.runTest();
+ }
+
+ /**
+ * Tests quads to triples conversion of 1,000 generated quads
+ *
+ * @throws IOException if the test driver reports an I/O error
+ */
+ @Test
+ public void quads_to_triples_mapper_03() throws IOException {
+ MapDriver<LongWritable, QuadWritable, LongWritable, TripleWritable> driver = this.getMapDriver();
+ this.generateData(driver, 1000);
+ driver.runTest();
+ }
+
+ /**
+ * Tests quads to triples conversion of 10,000 generated quads
+ *
+ * @throws IOException if the test driver reports an I/O error
+ */
+ @Test
+ public void quads_to_triples_mapper_04() throws IOException {
+ MapDriver<LongWritable, QuadWritable, LongWritable, TripleWritable> driver = this.getMapDriver();
+ this.generateData(driver, 10000);
+ driver.runTest();
+ }
+}
http://git-wip-us.apache.org/repos/asf/jena/blob/49c4cffe/jena-elephas/jena-elephas-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/transform/TriplesToQuadsBySubjectMapperTest.java
----------------------------------------------------------------------
diff --git a/jena-elephas/jena-elephas-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/transform/TriplesToQuadsBySubjectMapperTest.java b/jena-elephas/jena-elephas-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/transform/TriplesToQuadsBySubjectMapperTest.java
new file mode 100644
index 0000000..bdf39f5
--- /dev/null
+++ b/jena-elephas/jena-elephas-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/transform/TriplesToQuadsBySubjectMapperTest.java
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.hadoop.rdf.mapreduce.transform;
+
+import java.io.IOException;
+
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mrunit.mapreduce.MapDriver;
+import org.apache.hadoop.mrunit.types.Pair;
+import org.apache.jena.hadoop.rdf.mapreduce.AbstractMapperTests;
+import org.apache.jena.hadoop.rdf.mapreduce.transform.TriplesToQuadsBySubjectMapper;
+import org.apache.jena.hadoop.rdf.types.QuadWritable;
+import org.apache.jena.hadoop.rdf.types.TripleWritable;
+import org.junit.Test;
+
+import com.hp.hpl.jena.datatypes.xsd.XSDDatatype;
+import com.hp.hpl.jena.graph.NodeFactory;
+import com.hp.hpl.jena.graph.Triple;
+import com.hp.hpl.jena.sparql.core.Quad;
+
+/**
+ * Tests for the {@link TriplesToQuadsBySubjectMapper}
+ * <p>
+ * Each test feeds triples to the mapper and expects, under the same key, a quad
+ * that wraps the triple and uses the triple's subject as its graph node.
+ */
+public class TriplesToQuadsBySubjectMapperTest extends AbstractMapperTests<LongWritable, TripleWritable, LongWritable, QuadWritable> {
+
+ @Override
+ protected Mapper<LongWritable, TripleWritable, LongWritable, QuadWritable> getInstance() {
+ return new TriplesToQuadsBySubjectMapper<LongWritable>();
+ }
+ /** Queues {@code num} generated triples as input, each with a subject-graph quad as the expected output. */
+ protected void generateData(MapDriver<LongWritable, TripleWritable, LongWritable, QuadWritable> driver, int num) {
+ for (int i = 0; i < num; i++) {
+ Triple t = new Triple(NodeFactory.createURI("http://subjects/" + i), NodeFactory.createURI("http://predicate"),
+ NodeFactory.createLiteral(Integer.toString(i), XSDDatatype.XSDinteger));
+ Quad q = new Quad(t.getSubject(), t);
+ driver.addInput(new LongWritable(i), new TripleWritable(t));
+ driver.addOutput(new LongWritable(i), new QuadWritable(q));
+ }
+ }
+
+ /**
+ * Tests triples to quads conversion of a single triple
+ *
+ * @throws IOException if the test driver reports an I/O error
+ */
+ @Test
+ public void triples_to_quads_mapper_01() throws IOException {
+ MapDriver<LongWritable, TripleWritable, LongWritable, QuadWritable> driver = this.getMapDriver();
+
+ Triple t = new Triple(NodeFactory.createURI("http://s"), NodeFactory.createURI("http://p"),
+ NodeFactory.createLiteral("test"));
+ Quad q = new Quad(t.getSubject(), t);
+ driver.withInput(new Pair<LongWritable, TripleWritable>(new LongWritable(1), new TripleWritable(t))).withOutput(
+ new Pair<LongWritable, QuadWritable>(new LongWritable(1), new QuadWritable(q)));
+ driver.runTest();
+ }
+
+ /**
+ * Tests triples to quads conversion of 100 generated triples
+ *
+ * @throws IOException if the test driver reports an I/O error
+ */
+ @Test
+ public void triples_to_quads_mapper_02() throws IOException {
+ MapDriver<LongWritable, TripleWritable, LongWritable, QuadWritable> driver = this.getMapDriver();
+ this.generateData(driver, 100);
+ driver.runTest();
+ }
+
+ /**
+ * Tests triples to quads conversion of 1,000 generated triples
+ *
+ * @throws IOException if the test driver reports an I/O error
+ */
+ @Test
+ public void triples_to_quads_mapper_03() throws IOException {
+ MapDriver<LongWritable, TripleWritable, LongWritable, QuadWritable> driver = this.getMapDriver();
+ this.generateData(driver, 1000);
+ driver.runTest();
+ }
+
+ /**
+ * Tests triples to quads conversion of 10,000 generated triples
+ *
+ * @throws IOException if the test driver reports an I/O error
+ */
+ @Test
+ public void triples_to_quads_mapper_04() throws IOException {
+ MapDriver<LongWritable, TripleWritable, LongWritable, QuadWritable> driver = this.getMapDriver();
+ this.generateData(driver, 10000);
+ driver.runTest();
+ }
+}
http://git-wip-us.apache.org/repos/asf/jena/blob/49c4cffe/jena-elephas/jena-elephas-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/transform/TriplesToQuadsConstantGraphMapperTest.java
----------------------------------------------------------------------
diff --git a/jena-elephas/jena-elephas-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/transform/TriplesToQuadsConstantGraphMapperTest.java b/jena-elephas/jena-elephas-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/transform/TriplesToQuadsConstantGraphMapperTest.java
new file mode 100644
index 0000000..b82f74b
--- /dev/null
+++ b/jena-elephas/jena-elephas-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/transform/TriplesToQuadsConstantGraphMapperTest.java
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.hadoop.rdf.mapreduce.transform;
+
+import java.io.IOException;
+
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mrunit.mapreduce.MapDriver;
+import org.apache.hadoop.mrunit.types.Pair;
+import org.apache.jena.hadoop.rdf.mapreduce.AbstractMapperTests;
+import org.apache.jena.hadoop.rdf.mapreduce.transform.TriplesToQuadsConstantGraphMapper;
+import org.apache.jena.hadoop.rdf.types.QuadWritable;
+import org.apache.jena.hadoop.rdf.types.TripleWritable;
+import org.junit.Test;
+
+import com.hp.hpl.jena.datatypes.xsd.XSDDatatype;
+import com.hp.hpl.jena.graph.NodeFactory;
+import com.hp.hpl.jena.graph.Triple;
+import com.hp.hpl.jena.sparql.core.Quad;
+
+/**
+ * Tests for the {@link TriplesToQuadsConstantGraphMapper}
+ * <p>
+ * Each test feeds triples to the mapper and expects, under the same key, a quad
+ * that wraps the triple in the generated default graph.
+ */
+public class TriplesToQuadsConstantGraphMapperTest extends AbstractMapperTests<LongWritable, TripleWritable, LongWritable, QuadWritable> {
+
+ @Override
+ protected Mapper<LongWritable, TripleWritable, LongWritable, QuadWritable> getInstance() {
+ return new TriplesToQuadsConstantGraphMapper<LongWritable>();
+ }
+ /** Queues {@code num} generated triples as input, each with a default-graph quad as the expected output. */
+ protected void generateData(MapDriver<LongWritable, TripleWritable, LongWritable, QuadWritable> driver, int num) {
+ for (int i = 0; i < num; i++) {
+ Triple t = new Triple(NodeFactory.createURI("http://subjects/" + i), NodeFactory.createURI("http://predicate"),
+ NodeFactory.createLiteral(Integer.toString(i), XSDDatatype.XSDinteger));
+ Quad q = new Quad(Quad.defaultGraphNodeGenerated, t);
+ driver.addInput(new LongWritable(i), new TripleWritable(t));
+ driver.addOutput(new LongWritable(i), new QuadWritable(q));
+ }
+ }
+
+ /**
+ * Tests triples to quads conversion of a single triple
+ *
+ * @throws IOException if the test driver reports an I/O error
+ */
+ @Test
+ public void triples_to_quads_mapper_01() throws IOException {
+ MapDriver<LongWritable, TripleWritable, LongWritable, QuadWritable> driver = this.getMapDriver();
+
+ Triple t = new Triple(NodeFactory.createURI("http://s"), NodeFactory.createURI("http://p"),
+ NodeFactory.createLiteral("test"));
+ Quad q = new Quad(Quad.defaultGraphNodeGenerated, t);
+ driver.withInput(new Pair<LongWritable, TripleWritable>(new LongWritable(1), new TripleWritable(t))).withOutput(
+ new Pair<LongWritable, QuadWritable>(new LongWritable(1), new QuadWritable(q)));
+ driver.runTest();
+ }
+
+ /**
+ * Tests triples to quads conversion of 100 generated triples
+ *
+ * @throws IOException if the test driver reports an I/O error
+ */
+ @Test
+ public void triples_to_quads_mapper_02() throws IOException {
+ MapDriver<LongWritable, TripleWritable, LongWritable, QuadWritable> driver = this.getMapDriver();
+ this.generateData(driver, 100);
+ driver.runTest();
+ }
+
+ /**
+ * Tests triples to quads conversion of 1,000 generated triples
+ *
+ * @throws IOException if the test driver reports an I/O error
+ */
+ @Test
+ public void triples_to_quads_mapper_03() throws IOException {
+ MapDriver<LongWritable, TripleWritable, LongWritable, QuadWritable> driver = this.getMapDriver();
+ this.generateData(driver, 1000);
+ driver.runTest();
+ }
+
+ /**
+ * Tests triples to quads conversion of 10,000 generated triples
+ *
+ * @throws IOException if the test driver reports an I/O error
+ */
+ @Test
+ public void triples_to_quads_mapper_04() throws IOException {
+ MapDriver<LongWritable, TripleWritable, LongWritable, QuadWritable> driver = this.getMapDriver();
+ this.generateData(driver, 10000);
+ driver.runTest();
+ }
+}
http://git-wip-us.apache.org/repos/asf/jena/blob/49c4cffe/jena-elephas/jena-elephas-stats/hadoop-job.xml
----------------------------------------------------------------------
diff --git a/jena-elephas/jena-elephas-stats/hadoop-job.xml b/jena-elephas/jena-elephas-stats/hadoop-job.xml
new file mode 100644
index 0000000..de72645
--- /dev/null
+++ b/jena-elephas/jena-elephas-stats/hadoop-job.xml
@@ -0,0 +1,46 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+ <!-- Assembly descriptor that produces a single JAR containing this module's
+ own classes at the root and its runtime dependencies as JARs under lib/ -->
+ <assembly>
+ <id>hadoop-job</id>
+ <formats>
+ <format>jar</format>
+ </formats>
+ <includeBaseDirectory>false</includeBaseDirectory>
+ <dependencySets>
+ <!-- Runtime scoped dependencies are copied as-is (not unpacked) into lib/,
+ the project's own artifact is excluded here because it is handled below -->
+ <dependencySet>
+ <unpack>false</unpack>
+ <scope>runtime</scope>
+ <outputDirectory>lib</outputDirectory>
+ <excludes>
+ <exclude>${groupId}:${artifactId}</exclude>
+ </excludes>
+ </dependencySet>
+ <!-- The project's own artifact is unpacked so that its contents are placed
+ directly into the assembled JAR -->
+ <dependencySet>
+ <unpack>true</unpack>
+ <includes>
+ <include>${groupId}:${artifactId}</include>
+ </includes>
+ </dependencySet>
+ </dependencySets>
+ <fileSets>
+ <!-- NOTE(review): this copies the compiled test classes into the root of
+ the assembled JAR - confirm that shipping test classes is intended -->
+ <fileSet>
+ <directory>${basedir}/target/test-classes</directory>
+ <outputDirectory>/</outputDirectory>
+ </fileSet>
+ </fileSets>
+ </assembly>
http://git-wip-us.apache.org/repos/asf/jena/blob/49c4cffe/jena-elephas/jena-elephas-stats/pom.xml
----------------------------------------------------------------------
diff --git a/jena-elephas/jena-elephas-stats/pom.xml b/jena-elephas/jena-elephas-stats/pom.xml
new file mode 100644
index 0000000..526d060
--- /dev/null
+++ b/jena-elephas/jena-elephas-stats/pom.xml
@@ -0,0 +1,103 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+ <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+     xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+     <modelVersion>4.0.0</modelVersion>
+     <parent>
+         <groupId>org.apache.jena</groupId>
+         <artifactId>jena-elephas</artifactId>
+         <version>0.9.0-SNAPSHOT</version>
+     </parent>
+     <artifactId>jena-elephas-stats</artifactId>
+     <name>Apache Jena - Elephas - Statistics Demo App</name>
+     <description>A demo application that can be run on Hadoop to produce a statistical analysis on arbitrary RDF inputs</description>
+
+     <dependencies>
+         <!-- Internal Project Dependencies -->
+         <dependency>
+             <groupId>org.apache.jena</groupId>
+             <artifactId>jena-elephas-io</artifactId>
+             <version>${project.version}</version>
+         </dependency>
+         <dependency>
+             <groupId>org.apache.jena</groupId>
+             <artifactId>jena-elephas-mapreduce</artifactId>
+             <version>${project.version}</version>
+         </dependency>
+
+         <!-- CLI related Dependencies -->
+         <dependency>
+             <groupId>io.airlift</groupId>
+             <artifactId>airline</artifactId>
+             <version>0.6</version>
+         </dependency>
+
+         <!-- Hadoop Dependencies -->
+         <!-- Note these will be provided on the Hadoop cluster hence the provided
+             scope -->
+         <dependency>
+             <groupId>org.apache.hadoop</groupId>
+             <artifactId>hadoop-common</artifactId>
+             <scope>provided</scope>
+         </dependency>
+         <dependency>
+             <groupId>org.apache.hadoop</groupId>
+             <artifactId>hadoop-mapreduce-client-common</artifactId>
+             <scope>provided</scope>
+         </dependency>
+
+         <!-- Test Dependencies -->
+         <!-- Test JAR of the mapreduce module, this must use the rebranded
+             jena-elephas-mapreduce artifactId to match the module dependency
+             declared above -->
+         <dependency>
+             <groupId>org.apache.jena</groupId>
+             <artifactId>jena-elephas-mapreduce</artifactId>
+             <version>${project.version}</version>
+             <classifier>tests</classifier>
+             <scope>test</scope>
+         </dependency>
+         <dependency>
+             <groupId>org.apache.mrunit</groupId>
+             <artifactId>mrunit</artifactId>
+             <scope>test</scope>
+             <classifier>hadoop2</classifier>
+         </dependency>
+     </dependencies>
+
+     <build>
+         <plugins>
+             <!-- Assembly plugin is used to produce the runnable Hadoop JAR with all
+                 dependencies contained therein -->
+             <plugin>
+                 <artifactId>maven-assembly-plugin</artifactId>
+                 <configuration>
+                     <descriptors>
+                         <descriptor>hadoop-job.xml</descriptor>
+                     </descriptors>
+                 </configuration>
+                 <executions>
+                     <execution>
+                         <id>make-assembly</id>
+                         <phase>package</phase>
+                         <goals>
+                             <goal>single</goal>
+                         </goals>
+                     </execution>
+                 </executions>
+             </plugin>
+         </plugins>
+     </build>
+ </project>
http://git-wip-us.apache.org/repos/asf/jena/blob/49c4cffe/jena-elephas/jena-elephas-stats/src/main/java/org/apache/jena/hadoop/rdf/stats/RdfStats.java
----------------------------------------------------------------------
diff --git a/jena-elephas/jena-elephas-stats/src/main/java/org/apache/jena/hadoop/rdf/stats/RdfStats.java b/jena-elephas/jena-elephas-stats/src/main/java/org/apache/jena/hadoop/rdf/stats/RdfStats.java
new file mode 100644
index 0000000..5f870ee
--- /dev/null
+++ b/jena-elephas/jena-elephas-stats/src/main/java/org/apache/jena/hadoop/rdf/stats/RdfStats.java
@@ -0,0 +1,405 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.hadoop.rdf.stats;
+
+import io.airlift.command.Arguments;
+import io.airlift.command.Command;
+import io.airlift.command.Help;
+import io.airlift.command.HelpOption;
+import io.airlift.command.Option;
+import io.airlift.command.OptionType;
+import io.airlift.command.ParseArgumentsMissingException;
+import io.airlift.command.ParseArgumentsUnexpectedException;
+import io.airlift.command.ParseException;
+import io.airlift.command.ParseOptionMissingException;
+import io.airlift.command.ParseOptionMissingValueException;
+import io.airlift.command.SingleCommand;
+import io.airlift.command.model.CommandMetadata;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.TimeUnit;
+
+import javax.inject.Inject;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.jena.hadoop.rdf.stats.jobs.JobFactory;
+
+
+/**
+ * Entry point for the Hadoop job, handles launching all the relevant Hadoop
+ * jobs
+ */
+@Command(name = "bin/hadoop jar PATH_TO_JAR com.yarcdata.urika.hadoop.rdf.stats.RdfStats", description = "A command which computes statistics on RDF data using Hadoop")
+public class RdfStats implements Tool {
+
+ static final String ANSI_RED = "\u001B[31m";
+ static final String ANSI_RESET = "\u001B[0m";
+
+ private static final String DATA_TYPE_TRIPLES = "triples", DATA_TYPE_QUADS = "quads", DATA_TYPE_MIXED = "mixed";
+
+ /**
+ * Help option
+ */
+ @Inject
+ public HelpOption helpOption;
+
+ /**
+ * Gets/Sets whether all available statistics will be calculated
+ */
+ @Option(name = { "-a", "--all" }, description = "Requests that all available statistics be calculated", type = OptionType.COMMAND)
+ public boolean all = false;
+
+ /**
+ * Gets/Sets whether node usage counts will be calculated
+ */
+ @Option(name = { "-n", "--node-count" }, description = "Requests that node usage counts be calculated", type = OptionType.COMMAND)
+ public boolean nodeCount = false;
+
+ /**
+ * Gets/Sets whether characteristic sets will be calculated
+ */
+ @Option(name = { "-c", "--characteristic-sets" }, description = "Requests that characteristic sets be calculated", type = OptionType.COMMAND)
+ public boolean characteristicSets = false;
+
+ /**
+ * Gets/Sets whether type counts will be calculated
+ */
+ @Option(name = { "-t", "--type-counts" }, description = "Requests that rdf:type usage counts be calculated", type = OptionType.COMMAND)
+ public boolean typeCount = false;
+
+ /**
+ * Gets/Sets whether data type counts will be calculated
+ */
+ @Option(name = { "-d", "--data-types" }, description = "Requests that literal data type usage counts be calculated", type = OptionType.COMMAND)
+ public boolean dataTypeCount = false;
+
+ /**
+ * Gets/Sets whether namespace counts will be calculated
+ */
+ @Option(name = { "--namespaces" }, description = "Requests that namespace usage counts be calculated", type = OptionType.COMMAND)
+ public boolean namespaceCount = false;
+
+ /**
+ * Gets/Sets the input data type used
+ */
+ @Option(name = { "--input-type" }, allowedValues = { DATA_TYPE_MIXED, DATA_TYPE_QUADS, DATA_TYPE_TRIPLES }, description = "Specifies whether the input data is a mixture of quads and triples, just quads or just triples. Using the most specific data type will yield the most accurrate statistics")
+ public String inputType = DATA_TYPE_MIXED;
+
+ /**
+ * Gets/Sets the output path
+ */
+ @Option(name = { "-o", "--output" }, title = "OutputPath", description = "Sets the output path", arity = 1, required = true)
+ public String outputPath = null;
+
+ /**
+ * Gets/Sets the input path(s)
+ */
+ @Arguments(description = "Sets the input path(s)", title = "InputPath", required = true)
+ public List<String> inputPaths = new ArrayList<String>();
+
+ private Configuration config;
+
+ /**
+ * Entry point method
+ *
+ * @param args
+ * Arguments
+ * @throws Exception
+ */
+ public static void main(String[] args) throws Exception {
+ try {
+ // Run and exit with result code if no errors bubble up
+ // Note that the exit code may still be a error code
+ int res = ToolRunner.run(new Configuration(true), new RdfStats(), args);
+ System.exit(res);
+ } catch (Exception e) {
+ System.err.println(ANSI_RED + e.getMessage());
+ e.printStackTrace(System.err);
+ } finally {
+ System.err.print(ANSI_RESET);
+ }
+ // If any errors bubble up exit with non-zero code
+ System.exit(1);
+ }
+
+ private static void showUsage() {
+ CommandMetadata metadata = SingleCommand.singleCommand(RdfStats.class).getCommandMetadata();
+ StringBuilder builder = new StringBuilder();
+ Help.help(metadata, builder);
+ System.err.print(ANSI_RESET);
+ System.err.println(builder.toString());
+ System.exit(1);
+ }
+
+ @Override
+ public void setConf(Configuration conf) {
+ this.config = conf;
+ }
+
+ @Override
+ public Configuration getConf() {
+ return this.config;
+ }
+
+ @Override
+ public int run(String[] args) throws Exception {
+ try {
+ // Parse custom arguments
+ RdfStats cmd = SingleCommand.singleCommand(RdfStats.class).parse(args);
+
+ // Copy Hadoop configuration across
+ cmd.setConf(this.getConf());
+
+ // Show help if requested and exit with success
+ if (cmd.helpOption.showHelpIfRequested()) {
+ return 0;
+ }
+
+ // Run the command and exit with success
+ cmd.run();
+ return 0;
+
+ } catch (ParseOptionMissingException e) {
+ System.err.println(ANSI_RED + e.getMessage());
+ System.err.println();
+ showUsage();
+ } catch (ParseOptionMissingValueException e) {
+ System.err.println(ANSI_RED + e.getMessage());
+ System.err.println();
+ showUsage();
+ } catch (ParseArgumentsMissingException e) {
+ System.err.println(ANSI_RED + e.getMessage());
+ System.err.println();
+ showUsage();
+ } catch (ParseArgumentsUnexpectedException e) {
+ System.err.println(ANSI_RED + e.getMessage());
+ System.err.println();
+ showUsage();
+ // TODO Re-enable as and when we upgrade Airline
+ // } catch (ParseOptionIllegalValueException e) {
+ // System.err.println(ANSI_RED + e.getMessage());
+ // System.err.println();
+ // showUsage();
+ } catch (ParseException e) {
+ System.err.println(ANSI_RED + e.getMessage());
+ System.err.println();
+ showUsage();
+ } catch (UnsupportedOperationException e) {
+ System.err.println(ANSI_RED + e.getMessage());
+ } catch (Throwable e) {
+ System.err.println(ANSI_RED + e.getMessage());
+ e.printStackTrace(System.err);
+ } finally {
+ System.err.print(ANSI_RESET);
+ }
+ return 1;
+ }
+
+ private void run() throws Throwable {
+ if (!this.outputPath.endsWith("/")) {
+ this.outputPath += "/";
+ }
+
+ // If all statistics requested turn on all statistics
+ if (this.all) {
+ this.nodeCount = true;
+ this.characteristicSets = true;
+ this.typeCount = true;
+ this.dataTypeCount = true;
+ this.namespaceCount = true;
+ }
+
+ // How many statistics were requested?
+ int statsRequested = 0;
+ if (this.nodeCount)
+ statsRequested++;
+ if (this.characteristicSets)
+ statsRequested++;
+ if (this.typeCount)
+ statsRequested++;
+ if (this.dataTypeCount)
+ statsRequested++;
+ if (this.namespaceCount)
+ statsRequested++;
+
+ // Error if no statistics requested
+ if (statsRequested == 0) {
+ System.err
+ .println("You did not request any statistics to be calculated, please use one/more of the relevant options to select the statistics to be computed");
+ return;
+ }
+ int statsComputed = 1;
+
+ // Compute statistics
+ if (this.nodeCount) {
+ Job job = this.selectNodeCountJob();
+ statsComputed = this.computeStatistic(job, statsComputed, statsRequested);
+ }
+ if (this.typeCount) {
+ Job[] jobs = this.selectTypeCountJobs();
+ statsComputed = this.computeStatistic(jobs, false, false, statsComputed, statsRequested);
+ }
+ if (this.dataTypeCount) {
+ Job job = this.selectDataTypeCountJob();
+ statsComputed = this.computeStatistic(job, statsComputed, statsRequested);
+ }
+ if (this.namespaceCount) {
+ Job job = this.selectNamespaceCountJob();
+ statsComputed = this.computeStatistic(job, statsComputed, statsRequested);
+ }
+ if (this.characteristicSets) {
+ Job[] jobs = this.selectCharacteristicSetJobs();
+ statsComputed = this.computeStatistic(jobs, false, false, statsComputed, statsRequested);
+ }
+ }
+
+ private int computeStatistic(Job job, int statsComputed, int statsRequested) throws Throwable {
+ System.out.println(String.format("Computing Statistic %d of %d requested", statsComputed, statsRequested));
+ this.runJob(job);
+ System.out.println(String.format("Computed Statistic %d of %d requested", statsComputed, statsRequested));
+ System.out.println();
+ return ++statsComputed;
+ }
+
+ private int computeStatistic(Job[] jobs, boolean continueOnFailure, boolean continueOnError, int statsComputed,
+ int statsRequested) {
+ System.out.println(String.format("Computing Statistic %d of %d requested", statsComputed, statsRequested));
+ this.runJobSequence(jobs, continueOnFailure, continueOnError);
+ System.out.println(String.format("Computed Statistic %d of %d requested", statsComputed, statsRequested));
+ System.out.println();
+ return ++statsComputed;
+ }
+
+ private boolean runJob(Job job) throws Throwable {
+ System.out.println("Submitting Job " + job.getJobName());
+ long start = System.nanoTime();
+ try {
+ job.submit();
+ if (job.monitorAndPrintJob()) {
+ System.out.println("Job " + job.getJobName() + " succeeded");
+ return true;
+ } else {
+ System.out.println("Job " + job.getJobName() + " failed");
+ return false;
+ }
+ } catch (Throwable e) {
+ System.out.println("Unexpected failure in Job " + job.getJobName());
+ throw e;
+ } finally {
+ long end = System.nanoTime();
+ System.out.println("Job " + job.getJobName() + " finished after "
+ + String.format("%,d milliseconds", TimeUnit.NANOSECONDS.toMillis(end - start)));
+ System.out.println();
+ }
+ }
+
+ private void runJobSequence(Job[] jobs, boolean continueOnFailure, boolean continueOnError) {
+ for (int i = 0; i < jobs.length; i++) {
+ Job job = jobs[i];
+ try {
+ boolean success = this.runJob(job);
+ if (!success && !continueOnFailure)
+ throw new IllegalStateException("Unable to complete job sequence because Job " + job.getJobName() + " failed");
+ } catch (IllegalStateException e) {
+ throw e;
+ } catch (Throwable e) {
+ if (!continueOnError)
+ throw new IllegalStateException("Unable to complete job sequence because job " + job.getJobName()
+ + " errorred", e);
+ }
+ }
+ }
+
+ private Job selectNodeCountJob() throws IOException {
+ String realOutputPath = outputPath + "node-counts/";
+ String[] inputs = new String[this.inputPaths.size()];
+ this.inputPaths.toArray(inputs);
+
+ if (DATA_TYPE_QUADS.equals(this.inputType)) {
+ return JobFactory.getQuadNodeCountJob(this.config, inputs, realOutputPath);
+ } else if (DATA_TYPE_TRIPLES.equals(this.inputType)) {
+ return JobFactory.getTripleNodeCountJob(this.config, inputs, realOutputPath);
+ } else {
+ return JobFactory.getNodeCountJob(this.config, inputs, realOutputPath);
+ }
+ }
+
+ private Job selectDataTypeCountJob() throws IOException {
+ String realOutputPath = outputPath + "data-type-counts/";
+ String[] inputs = new String[this.inputPaths.size()];
+ this.inputPaths.toArray(inputs);
+
+ if (DATA_TYPE_QUADS.equals(this.inputType)) {
+ return JobFactory.getQuadDataTypeCountJob(this.config, inputs, realOutputPath);
+ } else if (DATA_TYPE_TRIPLES.equals(this.inputType)) {
+ return JobFactory.getTripleDataTypeCountJob(this.config, inputs, realOutputPath);
+ } else {
+ return JobFactory.getDataTypeCountJob(this.config, inputs, realOutputPath);
+ }
+ }
+
+ private Job selectNamespaceCountJob() throws IOException {
+ String realOutputPath = outputPath + "namespace-counts/";
+ String[] inputs = new String[this.inputPaths.size()];
+ this.inputPaths.toArray(inputs);
+
+ if (DATA_TYPE_QUADS.equals(this.inputType)) {
+ return JobFactory.getQuadNamespaceCountJob(this.config, inputs, realOutputPath);
+ } else if (DATA_TYPE_TRIPLES.equals(this.inputType)) {
+ return JobFactory.getTripleNamespaceCountJob(this.config, inputs, realOutputPath);
+ } else {
+ return JobFactory.getNamespaceCountJob(this.config, inputs, realOutputPath);
+ }
+ }
+
+ private Job[] selectCharacteristicSetJobs() throws IOException {
+ String intermediateOutputPath = outputPath + "characteristics/intermediate/";
+ String finalOutputPath = outputPath + "characteristics/final/";
+ String[] inputs = new String[this.inputPaths.size()];
+ this.inputPaths.toArray(inputs);
+
+ if (DATA_TYPE_QUADS.equals(this.inputType)) {
+ return JobFactory.getQuadCharacteristicSetJobs(this.config, inputs, intermediateOutputPath, finalOutputPath);
+ } else if (DATA_TYPE_TRIPLES.equals(this.inputType)) {
+ return JobFactory.getTripleCharacteristicSetJobs(this.config, inputs, intermediateOutputPath, finalOutputPath);
+ } else {
+ return JobFactory.getCharacteristicSetJobs(this.config, inputs, intermediateOutputPath, finalOutputPath);
+ }
+ }
+
+ private Job[] selectTypeCountJobs() throws IOException {
+ String intermediateOutputPath = outputPath + "type-declarations/";
+ String finalOutputPath = outputPath + "type-counts/";
+ String[] inputs = new String[this.inputPaths.size()];
+ this.inputPaths.toArray(inputs);
+
+ if (DATA_TYPE_QUADS.equals(this.inputType)) {
+ return JobFactory.getQuadTypeCountJobs(this.config, inputs, intermediateOutputPath, finalOutputPath);
+ } else if (DATA_TYPE_TRIPLES.equals(this.inputType)) {
+ return JobFactory.getTripleTypeCountJobs(this.config, inputs, intermediateOutputPath, finalOutputPath);
+ } else {
+ return JobFactory.getTypeCountJobs(this.config, inputs, intermediateOutputPath, finalOutputPath);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/jena/blob/49c4cffe/jena-elephas/jena-elephas-stats/src/main/java/org/apache/jena/hadoop/rdf/stats/jobs/JobFactory.java
----------------------------------------------------------------------
diff --git a/jena-elephas/jena-elephas-stats/src/main/java/org/apache/jena/hadoop/rdf/stats/jobs/JobFactory.java b/jena-elephas/jena-elephas-stats/src/main/java/org/apache/jena/hadoop/rdf/stats/jobs/JobFactory.java
new file mode 100644
index 0000000..55bb8af
--- /dev/null
+++ b/jena-elephas/jena-elephas-stats/src/main/java/org/apache/jena/hadoop/rdf/stats/jobs/JobFactory.java
@@ -0,0 +1,757 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.hadoop.rdf.stats.jobs;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.SequenceFile.CompressionType;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.compress.BZip2Codec;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.jena.hadoop.rdf.io.input.QuadsInputFormat;
+import org.apache.jena.hadoop.rdf.io.input.TriplesInputFormat;
+import org.apache.jena.hadoop.rdf.io.input.TriplesOrQuadsInputFormat;
+import org.apache.jena.hadoop.rdf.io.input.nquads.NQuadsInputFormat;
+import org.apache.jena.hadoop.rdf.io.input.ntriples.NTriplesInputFormat;
+import org.apache.jena.hadoop.rdf.io.output.nquads.NQuadsOutputFormat;
+import org.apache.jena.hadoop.rdf.io.output.ntriples.NTriplesNodeOutputFormat;
+import org.apache.jena.hadoop.rdf.io.output.ntriples.NTriplesOutputFormat;
+import org.apache.jena.hadoop.rdf.mapreduce.KeyMapper;
+import org.apache.jena.hadoop.rdf.mapreduce.RdfMapReduceConstants;
+import org.apache.jena.hadoop.rdf.mapreduce.TextCountReducer;
+import org.apache.jena.hadoop.rdf.mapreduce.characteristics.CharacteristicSetReducer;
+import org.apache.jena.hadoop.rdf.mapreduce.characteristics.QuadCharacteristicSetGeneratingReducer;
+import org.apache.jena.hadoop.rdf.mapreduce.characteristics.TripleCharacteristicSetGeneratingReducer;
+import org.apache.jena.hadoop.rdf.mapreduce.count.NodeCountReducer;
+import org.apache.jena.hadoop.rdf.mapreduce.count.QuadNodeCountMapper;
+import org.apache.jena.hadoop.rdf.mapreduce.count.TripleNodeCountMapper;
+import org.apache.jena.hadoop.rdf.mapreduce.count.datatypes.QuadDataTypeCountMapper;
+import org.apache.jena.hadoop.rdf.mapreduce.count.datatypes.TripleDataTypeCountMapper;
+import org.apache.jena.hadoop.rdf.mapreduce.count.namespaces.QuadNamespaceCountMapper;
+import org.apache.jena.hadoop.rdf.mapreduce.count.namespaces.TripleNamespaceCountMapper;
+import org.apache.jena.hadoop.rdf.mapreduce.count.positional.QuadObjectCountMapper;
+import org.apache.jena.hadoop.rdf.mapreduce.count.positional.TripleObjectCountMapper;
+import org.apache.jena.hadoop.rdf.mapreduce.filter.positional.QuadFilterByPredicateMapper;
+import org.apache.jena.hadoop.rdf.mapreduce.filter.positional.TripleFilterByPredicateUriMapper;
+import org.apache.jena.hadoop.rdf.mapreduce.group.QuadGroupBySubjectMapper;
+import org.apache.jena.hadoop.rdf.mapreduce.group.TripleGroupBySubjectMapper;
+import org.apache.jena.hadoop.rdf.types.CharacteristicSetWritable;
+import org.apache.jena.hadoop.rdf.types.NodeWritable;
+import org.apache.jena.hadoop.rdf.types.QuadWritable;
+import org.apache.jena.hadoop.rdf.types.TripleWritable;
+
+import com.hp.hpl.jena.vocabulary.RDF;
+
+/**
+ * Factory that can produce {@link Job} instances for computing various RDF
+ * statistics
+ *
+ *
+ *
+ */
+public class JobFactory {
+
+ /**
+ * Private constructor prevents instantiation, jobs are obtained through the
+ * static factory methods of this class
+ */
+ private JobFactory() {
+ }
+
+ /**
+  * Creates a job which computes node usage counts over RDF triple inputs
+  * <p>
+  * The job is only configured here, it is not submitted.
+  * </p>
+  *
+  * @param config
+  *            Configuration
+  * @param inputPaths
+  *            Input paths
+  * @param outputPath
+  *            Output path
+  * @return Job
+  * @throws IOException
+  */
+ public static Job getTripleNodeCountJob(Configuration config, String[] inputPaths, String outputPath) throws IOException {
+     Job nodeCountJob = Job.getInstance(config);
+     nodeCountJob.setJobName("RDF Triples Node Usage Count");
+     nodeCountJob.setJarByClass(JobFactory.class);
+
+     // Triples are read from the inputs and the counted nodes are written
+     // out using the NTriples node output format
+     nodeCountJob.setInputFormatClass(TriplesInputFormat.class);
+     nodeCountJob.setOutputFormatClass(NTriplesNodeOutputFormat.class);
+     String joinedInputs = StringUtils.arrayToString(inputPaths);
+     FileInputFormat.setInputPaths(nodeCountJob, joinedInputs);
+     Path output = new Path(outputPath);
+     FileOutputFormat.setOutputPath(nodeCountJob, output);
+
+     // The mapper emits nodes keyed against long values which the reducer
+     // then processes
+     nodeCountJob.setReducerClass(NodeCountReducer.class);
+     nodeCountJob.setMapperClass(TripleNodeCountMapper.class);
+     nodeCountJob.setMapOutputValueClass(LongWritable.class);
+     nodeCountJob.setMapOutputKeyClass(NodeWritable.class);
+
+     return nodeCountJob;
+ }
+
+ /**
+  * Creates a job which computes node usage counts over RDF quad inputs
+  * <p>
+  * The job is only configured here, it is not submitted.
+  * </p>
+  *
+  * @param config
+  *            Configuration
+  * @param inputPaths
+  *            Input paths
+  * @param outputPath
+  *            Output path
+  * @return Job
+  * @throws IOException
+  */
+ public static Job getQuadNodeCountJob(Configuration config, String[] inputPaths, String outputPath) throws IOException {
+     Job nodeCountJob = Job.getInstance(config);
+     nodeCountJob.setJobName("RDF Quads Node Usage Count");
+     nodeCountJob.setJarByClass(JobFactory.class);
+
+     // Quads are read from the inputs and the counted nodes are written out
+     // using the NTriples node output format
+     nodeCountJob.setInputFormatClass(QuadsInputFormat.class);
+     nodeCountJob.setOutputFormatClass(NTriplesNodeOutputFormat.class);
+     String joinedInputs = StringUtils.arrayToString(inputPaths);
+     FileInputFormat.setInputPaths(nodeCountJob, joinedInputs);
+     Path output = new Path(outputPath);
+     FileOutputFormat.setOutputPath(nodeCountJob, output);
+
+     // The mapper emits nodes keyed against long values which the reducer
+     // then processes
+     nodeCountJob.setReducerClass(NodeCountReducer.class);
+     nodeCountJob.setMapperClass(QuadNodeCountMapper.class);
+     nodeCountJob.setMapOutputValueClass(LongWritable.class);
+     nodeCountJob.setMapOutputKeyClass(NodeWritable.class);
+
+     return nodeCountJob;
+ }
+
+ /**
+  * Creates a job which computes node usage counts over inputs that may be
+  * RDF triples and/or quads
+  * <p>
+  * The job is only configured here, it is not submitted.
+  * </p>
+  *
+  * @param config
+  *            Configuration
+  * @param inputPaths
+  *            Input paths
+  * @param outputPath
+  *            Output path
+  * @return Job
+  * @throws IOException
+  */
+ public static Job getNodeCountJob(Configuration config, String[] inputPaths, String outputPath) throws IOException {
+     Job nodeCountJob = Job.getInstance(config);
+     nodeCountJob.setJobName("RDF Node Usage Count");
+     nodeCountJob.setJarByClass(JobFactory.class);
+
+     // The triples or quads input format is paired with the quad based
+     // mapper, counted nodes are written out using the NTriples node output
+     // format
+     nodeCountJob.setInputFormatClass(TriplesOrQuadsInputFormat.class);
+     nodeCountJob.setOutputFormatClass(NTriplesNodeOutputFormat.class);
+     String joinedInputs = StringUtils.arrayToString(inputPaths);
+     FileInputFormat.setInputPaths(nodeCountJob, joinedInputs);
+     Path output = new Path(outputPath);
+     FileOutputFormat.setOutputPath(nodeCountJob, output);
+
+     // The mapper emits nodes keyed against long values which the reducer
+     // then processes
+     nodeCountJob.setReducerClass(NodeCountReducer.class);
+     nodeCountJob.setMapperClass(QuadNodeCountMapper.class);
+     nodeCountJob.setMapOutputValueClass(LongWritable.class);
+     nodeCountJob.setMapOutputKeyClass(NodeWritable.class);
+
+     return nodeCountJob;
+ }
+
+ /**
+  * Creates the pair of jobs used to compute characteristic sets for RDF
+  * triples
+  * <p>
+  * The first job groups triples by subject and writes characteristic sets to
+  * the intermediate output path as a block compressed sequence file, the
+  * second job reads that intermediate output and writes the final output as
+  * text. The jobs are only configured here, they are not submitted.
+  * </p>
+  *
+  * @param config
+  *            Configuration
+  * @param inputPaths
+  *            Input paths
+  * @param intermediateOutputPath
+  *            Intermediate output path
+  * @param outputPath
+  *            Final output path
+  * @return Sequence of jobs, generation job first followed by reduction job
+  * @throws IOException
+  */
+ public static Job[] getTripleCharacteristicSetJobs(Configuration config, String[] inputPaths, String intermediateOutputPath,
+         String outputPath) throws IOException {
+     // First job generates the characteristic sets
+     Job generation = Job.getInstance(config);
+     generation.setJobName("RDF Triples Characteristic Set (Generation)");
+     generation.setJarByClass(JobFactory.class);
+
+     generation.setMapperClass(TripleGroupBySubjectMapper.class);
+     generation.setMapOutputKeyClass(NodeWritable.class);
+     generation.setMapOutputValueClass(TripleWritable.class);
+     generation.setReducerClass(TripleCharacteristicSetGeneratingReducer.class);
+     generation.setOutputKeyClass(CharacteristicSetWritable.class);
+     generation.setOutputValueClass(NullWritable.class);
+
+     generation.setInputFormatClass(TriplesInputFormat.class);
+     generation.setOutputFormatClass(SequenceFileOutputFormat.class);
+     FileInputFormat.setInputPaths(generation, StringUtils.arrayToString(inputPaths));
+     FileOutputFormat.setOutputPath(generation, new Path(intermediateOutputPath));
+
+     // Intermediate output is BZip2 compressed at the block level
+     SequenceFileOutputFormat.setCompressOutput(generation, true);
+     FileOutputFormat.setOutputCompressorClass(generation, BZip2Codec.class);
+     SequenceFileOutputFormat.setOutputCompressionType(generation, CompressionType.BLOCK);
+
+     // Second job consumes the intermediate output of the first
+     Job reduction = Job.getInstance(config);
+     reduction.setJobName("RDF Triples Characteristic Set (Reduction)");
+     reduction.setJarByClass(JobFactory.class);
+
+     reduction.setMapperClass(KeyMapper.class);
+     reduction.setMapOutputKeyClass(CharacteristicSetWritable.class);
+     reduction.setMapOutputValueClass(CharacteristicSetWritable.class);
+     reduction.setReducerClass(CharacteristicSetReducer.class);
+     reduction.setOutputKeyClass(CharacteristicSetWritable.class);
+     reduction.setOutputValueClass(CharacteristicSetWritable.class);
+
+     reduction.setInputFormatClass(SequenceFileInputFormat.class);
+     reduction.setOutputFormatClass(TextOutputFormat.class);
+     FileInputFormat.setInputPaths(reduction, intermediateOutputPath);
+     FileOutputFormat.setOutputPath(reduction, new Path(outputPath));
+
+     return new Job[] { generation, reduction };
+ }
+
+ /**
+  * Creates the pair of jobs used to compute characteristic sets for RDF
+  * quads
+  * <p>
+  * The first job groups quads by subject and writes characteristic sets to
+  * the intermediate output path as a block compressed sequence file, the
+  * second job reads that intermediate output and writes the final output as
+  * text. The jobs are only configured here, they are not submitted.
+  * </p>
+  *
+  * @param config
+  *            Configuration
+  * @param inputPaths
+  *            Input paths
+  * @param intermediateOutputPath
+  *            Intermediate output path
+  * @param outputPath
+  *            Final output path
+  * @return Sequence of jobs, generation job first followed by reduction job
+  * @throws IOException
+  */
+ public static Job[] getQuadCharacteristicSetJobs(Configuration config, String[] inputPaths, String intermediateOutputPath,
+         String outputPath) throws IOException {
+     // First job generates the characteristic sets
+     Job generation = Job.getInstance(config);
+     generation.setJobName("RDF Quads Characteristic Set (Generation)");
+     generation.setJarByClass(JobFactory.class);
+
+     generation.setMapperClass(QuadGroupBySubjectMapper.class);
+     generation.setMapOutputKeyClass(NodeWritable.class);
+     generation.setMapOutputValueClass(QuadWritable.class);
+     generation.setReducerClass(QuadCharacteristicSetGeneratingReducer.class);
+     generation.setOutputKeyClass(CharacteristicSetWritable.class);
+     generation.setOutputValueClass(NullWritable.class);
+
+     generation.setInputFormatClass(QuadsInputFormat.class);
+     generation.setOutputFormatClass(SequenceFileOutputFormat.class);
+     FileInputFormat.setInputPaths(generation, StringUtils.arrayToString(inputPaths));
+     FileOutputFormat.setOutputPath(generation, new Path(intermediateOutputPath));
+
+     // Intermediate output is BZip2 compressed at the block level
+     SequenceFileOutputFormat.setCompressOutput(generation, true);
+     FileOutputFormat.setOutputCompressorClass(generation, BZip2Codec.class);
+     SequenceFileOutputFormat.setOutputCompressionType(generation, CompressionType.BLOCK);
+
+     // Second job consumes the intermediate output of the first
+     Job reduction = Job.getInstance(config);
+     reduction.setJobName("RDF Quads Characteristic Set (Reduction)");
+     reduction.setJarByClass(JobFactory.class);
+
+     reduction.setMapperClass(KeyMapper.class);
+     reduction.setMapOutputKeyClass(CharacteristicSetWritable.class);
+     reduction.setMapOutputValueClass(CharacteristicSetWritable.class);
+     reduction.setReducerClass(CharacteristicSetReducer.class);
+     reduction.setOutputKeyClass(CharacteristicSetWritable.class);
+     reduction.setOutputValueClass(CharacteristicSetWritable.class);
+
+     reduction.setInputFormatClass(SequenceFileInputFormat.class);
+     reduction.setOutputFormatClass(TextOutputFormat.class);
+     FileInputFormat.setInputPaths(reduction, intermediateOutputPath);
+     FileOutputFormat.setOutputPath(reduction, new Path(outputPath));
+
+     return new Job[] { generation, reduction };
+ }
+
+ /**
+  * Builds the two-stage job pipeline that computes characteristic sets over
+  * inputs that may contain RDF triples and/or quads.
+  * <p>
+  * Stage one reads the inputs with {@code TriplesOrQuadsInputFormat}, runs
+  * {@code QuadGroupBySubjectMapper} and
+  * {@code QuadCharacteristicSetGeneratingReducer}, and writes a BZip2
+  * block-compressed sequence file to the intermediate path. Stage two reads
+  * that sequence file, runs {@code KeyMapper} and
+  * {@code CharacteristicSetReducer}, and writes text to the final path.
+  *
+  * @param config
+  *            Configuration each job is created from
+  * @param inputPaths
+  *            Input paths for the first stage
+  * @param intermediateOutputPath
+  *            Path the first stage writes to and the second stage reads from
+  * @param outputPath
+  *            Final output path
+  * @return The two jobs, generation first and reduction second
+  * @throws IOException
+  *             If a job cannot be created or configured
+  */
+ public static Job[] getCharacteristicSetJobs(Configuration config, String[] inputPaths, String intermediateOutputPath,
+         String outputPath) throws IOException {
+     // Stage 1: generate characteristic sets from the raw input
+     Job generation = Job.getInstance(config);
+     generation.setJobName("RDF Characteristic Set (Generation)");
+     generation.setJarByClass(JobFactory.class);
+
+     // Mapper side
+     generation.setMapperClass(QuadGroupBySubjectMapper.class);
+     generation.setMapOutputKeyClass(NodeWritable.class);
+     generation.setMapOutputValueClass(QuadWritable.class);
+
+     // Reducer side
+     generation.setReducerClass(QuadCharacteristicSetGeneratingReducer.class);
+     generation.setOutputKeyClass(CharacteristicSetWritable.class);
+     generation.setOutputValueClass(NullWritable.class);
+
+     // Triples or quads in, compressed sequence file out
+     generation.setInputFormatClass(TriplesOrQuadsInputFormat.class);
+     FileInputFormat.setInputPaths(generation, StringUtils.arrayToString(inputPaths));
+     generation.setOutputFormatClass(SequenceFileOutputFormat.class);
+     FileOutputFormat.setOutputPath(generation, new Path(intermediateOutputPath));
+     SequenceFileOutputFormat.setCompressOutput(generation, true);
+     FileOutputFormat.setOutputCompressorClass(generation, BZip2Codec.class);
+     SequenceFileOutputFormat.setOutputCompressionType(generation, CompressionType.BLOCK);
+
+     // Stage 2: reduce the intermediate characteristic sets
+     Job reduction = Job.getInstance(config);
+     reduction.setJobName("RDF Characteristic Set (Reduction)");
+     reduction.setJarByClass(JobFactory.class);
+
+     // Mapper side
+     reduction.setMapperClass(KeyMapper.class);
+     reduction.setMapOutputKeyClass(CharacteristicSetWritable.class);
+     reduction.setMapOutputValueClass(CharacteristicSetWritable.class);
+
+     // Reducer side
+     reduction.setReducerClass(CharacteristicSetReducer.class);
+     reduction.setOutputKeyClass(CharacteristicSetWritable.class);
+     reduction.setOutputValueClass(CharacteristicSetWritable.class);
+
+     // Stage 1 sequence file in, text out
+     reduction.setInputFormatClass(SequenceFileInputFormat.class);
+     FileInputFormat.setInputPaths(reduction, intermediateOutputPath);
+     reduction.setOutputFormatClass(TextOutputFormat.class);
+     FileOutputFormat.setOutputPath(reduction, new Path(outputPath));
+
+     return new Job[] { generation, reduction };
+ }
+
+ /**
+  * Builds the two-stage job pipeline that computes type counts on RDF triple
+  * inputs.
+  * <p>
+  * Stage one sets {@code RDF.type} as the filter predicate URI, runs
+  * {@code TripleFilterByPredicateUriMapper} over the inputs and writes the
+  * result as NTriples to the intermediate path. Stage two reads that output,
+  * runs {@code TripleObjectCountMapper} and {@code NodeCountReducer}, and
+  * writes the counts to the final path.
+  *
+  * @param config
+  *            Configuration each job is created from
+  * @param inputPaths
+  *            Input paths for the first stage
+  * @param intermediateOutputPath
+  *            Path for intermediate output which will be all the type
+  *            declaration triples present in the inputs
+  * @param outputPath
+  *            Final output path
+  * @return The two jobs, extraction first and counting second
+  * @throws IOException
+  *             If a job cannot be created or configured
+  */
+ public static Job[] getTripleTypeCountJobs(Configuration config, String[] inputPaths, String intermediateOutputPath,
+         String outputPath) throws IOException {
+     // Stage 1: extract the type declaration triples
+     Job extraction = Job.getInstance(config);
+     extraction.setJobName("RDF Type Triples Extraction");
+     extraction.setJarByClass(JobFactory.class);
+
+     // Mapper filtering on the rdf:type predicate
+     extraction.getConfiguration().setStrings(RdfMapReduceConstants.FILTER_PREDICATE_URIS, RDF.type.getURI());
+     extraction.setMapperClass(TripleFilterByPredicateUriMapper.class);
+     extraction.setMapOutputKeyClass(LongWritable.class);
+     extraction.setMapOutputValueClass(TripleWritable.class);
+
+     // Triples in, NTriples out
+     extraction.setInputFormatClass(TriplesInputFormat.class);
+     FileInputFormat.setInputPaths(extraction, StringUtils.arrayToString(inputPaths));
+     extraction.setOutputFormatClass(NTriplesOutputFormat.class);
+     FileOutputFormat.setOutputPath(extraction, new Path(intermediateOutputPath));
+
+     // Stage 2: count usage of each object node
+     Job counting = Job.getInstance(config);
+     counting.setJobName("RDF Triples Type Usage Count");
+     counting.setJarByClass(JobFactory.class);
+
+     // Mapper and reducer
+     counting.setMapperClass(TripleObjectCountMapper.class);
+     counting.setMapOutputKeyClass(NodeWritable.class);
+     counting.setMapOutputValueClass(LongWritable.class);
+     counting.setReducerClass(NodeCountReducer.class);
+
+     // Stage 1 NTriples in, node counts out
+     counting.setInputFormatClass(NTriplesInputFormat.class);
+     // TODO Would be better if this was intelligently configured
+     NLineInputFormat.setNumLinesPerSplit(counting, 10000);
+     FileInputFormat.setInputPaths(counting, intermediateOutputPath);
+     counting.setOutputFormatClass(NTriplesNodeOutputFormat.class);
+     FileOutputFormat.setOutputPath(counting, new Path(outputPath));
+
+     return new Job[] { extraction, counting };
+ }
+
+ /**
+  * Builds the two-stage job pipeline that computes type counts on RDF quad
+  * inputs.
+  * <p>
+  * Stage one sets {@code RDF.type} as the filter predicate URI, runs
+  * {@code QuadFilterByPredicateMapper} over the inputs and writes the result
+  * as NQuads to the intermediate path. Stage two reads that output, runs
+  * {@code QuadObjectCountMapper} and {@code NodeCountReducer}, and writes the
+  * counts to the final path.
+  *
+  * @param config
+  *            Configuration each job is created from
+  * @param inputPaths
+  *            Input paths for the first stage
+  * @param intermediateOutputPath
+  *            Path for intermediate output which will be all the type
+  *            declaration quads present in the inputs
+  * @param outputPath
+  *            Final output path
+  * @return The two jobs, extraction first and counting second
+  * @throws IOException
+  *             If a job cannot be created or configured
+  */
+ public static Job[] getQuadTypeCountJobs(Configuration config, String[] inputPaths, String intermediateOutputPath,
+         String outputPath) throws IOException {
+     // Stage 1: extract the type declaration quads
+     Job extraction = Job.getInstance(config);
+     extraction.setJobName("RDF Type Quads Extraction");
+     extraction.setJarByClass(JobFactory.class);
+
+     // Mapper filtering on the rdf:type predicate
+     extraction.getConfiguration().setStrings(RdfMapReduceConstants.FILTER_PREDICATE_URIS, RDF.type.getURI());
+     extraction.setMapperClass(QuadFilterByPredicateMapper.class);
+     extraction.setMapOutputKeyClass(LongWritable.class);
+     extraction.setMapOutputValueClass(QuadWritable.class);
+
+     // Quads in, NQuads out
+     extraction.setInputFormatClass(QuadsInputFormat.class);
+     FileInputFormat.setInputPaths(extraction, StringUtils.arrayToString(inputPaths));
+     extraction.setOutputFormatClass(NQuadsOutputFormat.class);
+     FileOutputFormat.setOutputPath(extraction, new Path(intermediateOutputPath));
+
+     // Stage 2: count usage of each object node
+     Job counting = Job.getInstance(config);
+     counting.setJobName("RDF Quads Type Usage Count");
+     counting.setJarByClass(JobFactory.class);
+
+     // Mapper and reducer
+     counting.setMapperClass(QuadObjectCountMapper.class);
+     counting.setMapOutputKeyClass(NodeWritable.class);
+     counting.setMapOutputValueClass(LongWritable.class);
+     counting.setReducerClass(NodeCountReducer.class);
+
+     // Stage 1 NQuads in, node counts out
+     counting.setInputFormatClass(NQuadsInputFormat.class);
+     // TODO Would be better if this was intelligently configured
+     NLineInputFormat.setNumLinesPerSplit(counting, 10000);
+     FileInputFormat.setInputPaths(counting, intermediateOutputPath);
+     counting.setOutputFormatClass(NTriplesNodeOutputFormat.class);
+     FileOutputFormat.setOutputPath(counting, new Path(outputPath));
+
+     return new Job[] { extraction, counting };
+ }
+
+ /**
+  * Builds the two-stage job pipeline that computes type counts on inputs
+  * that may contain RDF triples and/or quads.
+  * <p>
+  * Stage one reads the inputs with {@code TriplesOrQuadsInputFormat}, sets
+  * {@code RDF.type} as the filter predicate URI, runs
+  * {@code QuadFilterByPredicateMapper} and writes the result as NQuads to the
+  * intermediate path. Stage two reads that output, runs
+  * {@code QuadObjectCountMapper} and {@code NodeCountReducer}, and writes the
+  * counts to the final path.
+  *
+  * @param config
+  *            Configuration each job is created from
+  * @param inputPaths
+  *            Input paths for the first stage
+  * @param intermediateOutputPath
+  *            Path for intermediate output which will be all the type
+  *            declaration quads present in the inputs
+  * @param outputPath
+  *            Final output path
+  * @return The two jobs, extraction first and counting second
+  * @throws IOException
+  *             If a job cannot be created or configured
+  */
+ public static Job[] getTypeCountJobs(Configuration config, String[] inputPaths, String intermediateOutputPath,
+         String outputPath) throws IOException {
+     // Stage 1: extract the type declarations
+     Job extraction = Job.getInstance(config);
+     extraction.setJobName("RDF Type Extraction");
+     extraction.setJarByClass(JobFactory.class);
+
+     // Mapper filtering on the rdf:type predicate
+     extraction.getConfiguration().setStrings(RdfMapReduceConstants.FILTER_PREDICATE_URIS, RDF.type.getURI());
+     extraction.setMapperClass(QuadFilterByPredicateMapper.class);
+     extraction.setMapOutputKeyClass(LongWritable.class);
+     extraction.setMapOutputValueClass(QuadWritable.class);
+
+     // Triples or quads in, NQuads out
+     extraction.setInputFormatClass(TriplesOrQuadsInputFormat.class);
+     FileInputFormat.setInputPaths(extraction, StringUtils.arrayToString(inputPaths));
+     extraction.setOutputFormatClass(NQuadsOutputFormat.class);
+     FileOutputFormat.setOutputPath(extraction, new Path(intermediateOutputPath));
+
+     // Stage 2: count usage of each object node
+     Job counting = Job.getInstance(config);
+     counting.setJobName("RDF Type Usage Count");
+     counting.setJarByClass(JobFactory.class);
+
+     // Mapper and reducer
+     counting.setMapperClass(QuadObjectCountMapper.class);
+     counting.setMapOutputKeyClass(NodeWritable.class);
+     counting.setMapOutputValueClass(LongWritable.class);
+     counting.setReducerClass(NodeCountReducer.class);
+
+     // Stage 1 NQuads in, node counts out
+     counting.setInputFormatClass(NQuadsInputFormat.class);
+     // TODO Would be better if this was intelligently configured
+     NLineInputFormat.setNumLinesPerSplit(counting, 10000);
+     FileInputFormat.setInputPaths(counting, intermediateOutputPath);
+     counting.setOutputFormatClass(NTriplesNodeOutputFormat.class);
+     FileOutputFormat.setOutputPath(counting, new Path(outputPath));
+
+     return new Job[] { extraction, counting };
+ }
+
+ /**
+  * Creates a job that counts literal data type usage in RDF triple inputs.
+  * <p>
+  * The job reads with {@code TriplesInputFormat}, maps with
+  * {@code TripleDataTypeCountMapper} to {@code NodeWritable} keys and
+  * {@code LongWritable} values, reduces with {@code NodeCountReducer} and
+  * writes with {@code NTriplesNodeOutputFormat}.
+  *
+  * @param config
+  *            Configuration the job is created from
+  * @param inputPaths
+  *            Input paths
+  * @param outputPath
+  *            Output path
+  * @return The configured job
+  * @throws IOException
+  *             If the job cannot be created or configured
+  */
+ public static Job getTripleDataTypeCountJob(Configuration config, String[] inputPaths, String outputPath) throws IOException {
+     Job dataTypeCount = Job.getInstance(config);
+     dataTypeCount.setJobName("RDF Triples Literal Data Type Usage Count");
+     dataTypeCount.setJarByClass(JobFactory.class);
+
+     // How the counts are computed
+     dataTypeCount.setMapperClass(TripleDataTypeCountMapper.class);
+     dataTypeCount.setReducerClass(NodeCountReducer.class);
+     dataTypeCount.setMapOutputKeyClass(NodeWritable.class);
+     dataTypeCount.setMapOutputValueClass(LongWritable.class);
+
+     // Where the triples come from
+     dataTypeCount.setInputFormatClass(TriplesInputFormat.class);
+     FileInputFormat.setInputPaths(dataTypeCount, StringUtils.arrayToString(inputPaths));
+
+     // Where the counts go
+     dataTypeCount.setOutputFormatClass(NTriplesNodeOutputFormat.class);
+     FileOutputFormat.setOutputPath(dataTypeCount, new Path(outputPath));
+
+     return dataTypeCount;
+ }
+
+ /**
+  * Creates a job that counts literal data type usage in RDF quad inputs.
+  * <p>
+  * The job reads with {@code QuadsInputFormat}, maps with
+  * {@code QuadDataTypeCountMapper} to {@code NodeWritable} keys and
+  * {@code LongWritable} values, reduces with {@code NodeCountReducer} and
+  * writes with {@code NTriplesNodeOutputFormat}.
+  *
+  * @param config
+  *            Configuration the job is created from
+  * @param inputPaths
+  *            Input paths
+  * @param outputPath
+  *            Output path
+  * @return The configured job
+  * @throws IOException
+  *             If the job cannot be created or configured
+  */
+ public static Job getQuadDataTypeCountJob(Configuration config, String[] inputPaths, String outputPath) throws IOException {
+     Job dataTypeCount = Job.getInstance(config);
+     dataTypeCount.setJobName("RDF Quads Literal Data Type Usage Count");
+     dataTypeCount.setJarByClass(JobFactory.class);
+
+     // How the counts are computed
+     dataTypeCount.setMapperClass(QuadDataTypeCountMapper.class);
+     dataTypeCount.setReducerClass(NodeCountReducer.class);
+     dataTypeCount.setMapOutputKeyClass(NodeWritable.class);
+     dataTypeCount.setMapOutputValueClass(LongWritable.class);
+
+     // Where the quads come from
+     dataTypeCount.setInputFormatClass(QuadsInputFormat.class);
+     FileInputFormat.setInputPaths(dataTypeCount, StringUtils.arrayToString(inputPaths));
+
+     // Where the counts go
+     dataTypeCount.setOutputFormatClass(NTriplesNodeOutputFormat.class);
+     FileOutputFormat.setOutputPath(dataTypeCount, new Path(outputPath));
+
+     return dataTypeCount;
+ }
+
+ /**
+  * Creates a job that counts literal data type usage in inputs that may
+  * contain RDF triples and/or quads.
+  * <p>
+  * The job reads with {@code TriplesOrQuadsInputFormat}, maps with
+  * {@code QuadDataTypeCountMapper} to {@code NodeWritable} keys and
+  * {@code LongWritable} values, reduces with {@code NodeCountReducer} and
+  * writes with {@code NTriplesNodeOutputFormat}.
+  *
+  * @param config
+  *            Configuration the job is created from
+  * @param inputPaths
+  *            Input paths
+  * @param outputPath
+  *            Output path
+  * @return The configured job
+  * @throws IOException
+  *             If the job cannot be created or configured
+  */
+ public static Job getDataTypeCountJob(Configuration config, String[] inputPaths, String outputPath) throws IOException {
+     Job dataTypeCount = Job.getInstance(config);
+     dataTypeCount.setJobName("RDF Literal Data Type Usage Count");
+     dataTypeCount.setJarByClass(JobFactory.class);
+
+     // How the counts are computed
+     dataTypeCount.setMapperClass(QuadDataTypeCountMapper.class);
+     dataTypeCount.setReducerClass(NodeCountReducer.class);
+     dataTypeCount.setMapOutputKeyClass(NodeWritable.class);
+     dataTypeCount.setMapOutputValueClass(LongWritable.class);
+
+     // Where the triples and/or quads come from
+     dataTypeCount.setInputFormatClass(TriplesOrQuadsInputFormat.class);
+     FileInputFormat.setInputPaths(dataTypeCount, StringUtils.arrayToString(inputPaths));
+
+     // Where the counts go
+     dataTypeCount.setOutputFormatClass(NTriplesNodeOutputFormat.class);
+     FileOutputFormat.setOutputPath(dataTypeCount, new Path(outputPath));
+
+     return dataTypeCount;
+ }
+
+ /**
+  * Creates a job that counts namespace usage in RDF triple inputs.
+  * <p>
+  * The job reads with {@code TriplesInputFormat}, maps with
+  * {@code TripleNamespaceCountMapper} to {@code Text} keys and
+  * {@code LongWritable} values, reduces with {@code TextCountReducer} and
+  * writes with {@code TextOutputFormat}.
+  *
+  * @param config
+  *            Configuration the job is created from
+  * @param inputPaths
+  *            Input paths
+  * @param outputPath
+  *            Output path
+  * @return The configured job
+  * @throws IOException
+  *             If the job cannot be created or configured
+  */
+ public static Job getTripleNamespaceCountJob(Configuration config, String[] inputPaths, String outputPath) throws IOException {
+     Job namespaceCount = Job.getInstance(config);
+     namespaceCount.setJobName("RDF Triples Namespace Usage Count");
+     namespaceCount.setJarByClass(JobFactory.class);
+
+     // How the counts are computed
+     namespaceCount.setMapperClass(TripleNamespaceCountMapper.class);
+     namespaceCount.setReducerClass(TextCountReducer.class);
+     namespaceCount.setMapOutputKeyClass(Text.class);
+     namespaceCount.setMapOutputValueClass(LongWritable.class);
+
+     // Where the triples come from
+     namespaceCount.setInputFormatClass(TriplesInputFormat.class);
+     FileInputFormat.setInputPaths(namespaceCount, StringUtils.arrayToString(inputPaths));
+
+     // Where the counts go
+     namespaceCount.setOutputFormatClass(TextOutputFormat.class);
+     FileOutputFormat.setOutputPath(namespaceCount, new Path(outputPath));
+
+     return namespaceCount;
+ }
+
+ /**
+  * Creates a job that counts namespace usage in RDF quad inputs.
+  * <p>
+  * The job reads with {@code QuadsInputFormat}, maps with
+  * {@code QuadNamespaceCountMapper} to {@code Text} keys and
+  * {@code LongWritable} values, reduces with {@code TextCountReducer} and
+  * writes with {@code TextOutputFormat}.
+  *
+  * @param config
+  *            Configuration the job is created from
+  * @param inputPaths
+  *            Input paths
+  * @param outputPath
+  *            Output path
+  * @return The configured job
+  * @throws IOException
+  *             If the job cannot be created or configured
+  */
+ public static Job getQuadNamespaceCountJob(Configuration config, String[] inputPaths, String outputPath) throws IOException {
+     Job namespaceCount = Job.getInstance(config);
+     namespaceCount.setJobName("RDF Quads Namespace Usage Count");
+     namespaceCount.setJarByClass(JobFactory.class);
+
+     // How the counts are computed
+     namespaceCount.setMapperClass(QuadNamespaceCountMapper.class);
+     namespaceCount.setReducerClass(TextCountReducer.class);
+     namespaceCount.setMapOutputKeyClass(Text.class);
+     namespaceCount.setMapOutputValueClass(LongWritable.class);
+
+     // Where the quads come from
+     namespaceCount.setInputFormatClass(QuadsInputFormat.class);
+     FileInputFormat.setInputPaths(namespaceCount, StringUtils.arrayToString(inputPaths));
+
+     // Where the counts go
+     namespaceCount.setOutputFormatClass(TextOutputFormat.class);
+     FileOutputFormat.setOutputPath(namespaceCount, new Path(outputPath));
+
+     return namespaceCount;
+ }
+
+ /**
+  * Creates a job that counts namespace usage in inputs that may contain RDF
+  * triples and/or quads.
+  * <p>
+  * The job reads with {@code TriplesOrQuadsInputFormat}, maps with
+  * {@code QuadNamespaceCountMapper} to {@code Text} keys and
+  * {@code LongWritable} values, reduces with {@code TextCountReducer} and
+  * writes with {@code TextOutputFormat}.
+  *
+  * @param config
+  *            Configuration the job is created from
+  * @param inputPaths
+  *            Input paths
+  * @param outputPath
+  *            Output path
+  * @return The configured job
+  * @throws IOException
+  *             If the job cannot be created or configured
+  */
+ public static Job getNamespaceCountJob(Configuration config, String[] inputPaths, String outputPath) throws IOException {
+     Job namespaceCount = Job.getInstance(config);
+     namespaceCount.setJobName("RDF Namespace Usage Count");
+     namespaceCount.setJarByClass(JobFactory.class);
+
+     // How the counts are computed
+     namespaceCount.setMapperClass(QuadNamespaceCountMapper.class);
+     namespaceCount.setReducerClass(TextCountReducer.class);
+     namespaceCount.setMapOutputKeyClass(Text.class);
+     namespaceCount.setMapOutputValueClass(LongWritable.class);
+
+     // Where the triples and/or quads come from
+     namespaceCount.setInputFormatClass(TriplesOrQuadsInputFormat.class);
+     FileInputFormat.setInputPaths(namespaceCount, StringUtils.arrayToString(inputPaths));
+
+     // Where the counts go
+     namespaceCount.setOutputFormatClass(TextOutputFormat.class);
+     FileOutputFormat.setOutputPath(namespaceCount, new Path(outputPath));
+
+     return namespaceCount;
+ }
+}