You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jena.apache.org by rv...@apache.org on 2015/01/27 18:28:05 UTC
[12/59] [abbrv] jena git commit: Rebrand to Jena Elephas per
community vote
http://git-wip-us.apache.org/repos/asf/jena/blob/a6c0fefc/jena-hadoop-rdf/hadoop-rdf-io/src/test/java/org/apache/jena/hadoop/rdf/io/output/trix/TriXOutputTest.java
----------------------------------------------------------------------
diff --git a/jena-hadoop-rdf/hadoop-rdf-io/src/test/java/org/apache/jena/hadoop/rdf/io/output/trix/TriXOutputTest.java b/jena-hadoop-rdf/hadoop-rdf-io/src/test/java/org/apache/jena/hadoop/rdf/io/output/trix/TriXOutputTest.java
deleted file mode 100644
index 9b6e307..0000000
--- a/jena-hadoop-rdf/hadoop-rdf-io/src/test/java/org/apache/jena/hadoop/rdf/io/output/trix/TriXOutputTest.java
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.jena.hadoop.rdf.io.output.trix;
-
-import org.apache.hadoop.io.NullWritable;
-import org.apache.hadoop.mapreduce.OutputFormat;
-import org.apache.jena.hadoop.rdf.io.output.AbstractQuadOutputFormatTests;
-import org.apache.jena.hadoop.rdf.types.QuadWritable;
-import org.apache.jena.riot.Lang;
-
-/**
- * Tests for TriX output format
- */
-public class TriXOutputTest extends AbstractQuadOutputFormatTests {
-
- @Override
- protected String getFileExtension() {
- return ".trix";
- }
-
- @Override
- protected Lang getRdfLanguage() {
- return Lang.TRIX;
- }
-
- @Override
- protected OutputFormat<NullWritable, QuadWritable> getOutputFormat() {
- return new TriXOutputFormat<NullWritable>();
- }
-
-}
http://git-wip-us.apache.org/repos/asf/jena/blob/a6c0fefc/jena-hadoop-rdf/hadoop-rdf-io/src/test/java/org/apache/jena/hadoop/rdf/io/output/turtle/BatchedTurtleOutputTest.java
----------------------------------------------------------------------
diff --git a/jena-hadoop-rdf/hadoop-rdf-io/src/test/java/org/apache/jena/hadoop/rdf/io/output/turtle/BatchedTurtleOutputTest.java b/jena-hadoop-rdf/hadoop-rdf-io/src/test/java/org/apache/jena/hadoop/rdf/io/output/turtle/BatchedTurtleOutputTest.java
deleted file mode 100644
index a6c4d70..0000000
--- a/jena-hadoop-rdf/hadoop-rdf-io/src/test/java/org/apache/jena/hadoop/rdf/io/output/turtle/BatchedTurtleOutputTest.java
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.jena.hadoop.rdf.io.output.turtle;
-
-import java.util.Arrays;
-import java.util.Collection;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.NullWritable;
-import org.apache.hadoop.mapreduce.OutputFormat;
-import org.apache.jena.hadoop.rdf.io.RdfIOConstants;
-import org.apache.jena.hadoop.rdf.io.output.AbstractTripleOutputFormatTests;
-import org.apache.jena.hadoop.rdf.types.TripleWritable;
-import org.apache.jena.riot.Lang;
-import org.junit.runner.RunWith;
-import org.junit.runners.Parameterized;
-import org.junit.runners.Parameterized.Parameters;
-
-
-/**
- * Tests for Turtle output
- *
- *
- *
- */
-@RunWith(Parameterized.class)
-public class BatchedTurtleOutputTest extends AbstractTripleOutputFormatTests {
-
- static long $bs1 = RdfIOConstants.DEFAULT_OUTPUT_BATCH_SIZE;
- static long $bs2 = 1000;
- static long $bs3 = 100;
- static long $bs4 = 1;
-
- /**
- * @return Test parameters
- */
- @Parameters
- public static Collection<Object[]> data() {
- return Arrays.asList(new Object[][] { { $bs1 }, { $bs2 }, { $bs3 }, { $bs4 } });
- }
-
- private final long batchSize;
-
- /**
- * Creates new tests
- *
- * @param batchSize
- * Batch size
- */
- public BatchedTurtleOutputTest(long batchSize) {
- this.batchSize = batchSize;
- }
-
- @Override
- protected String getFileExtension() {
- return ".ttl";
- }
-
- @Override
- protected Lang getRdfLanguage() {
- return Lang.TURTLE;
- }
-
- @Override
- protected Configuration prepareConfiguration() {
- Configuration config = super.prepareConfiguration();
- config.setLong(RdfIOConstants.OUTPUT_BATCH_SIZE, this.batchSize);
- return config;
- }
-
- @Override
- protected OutputFormat<NullWritable, TripleWritable> getOutputFormat() {
- return new BatchedTurtleOutputFormat<NullWritable>();
- }
-
-}
http://git-wip-us.apache.org/repos/asf/jena/blob/a6c0fefc/jena-hadoop-rdf/hadoop-rdf-io/src/test/java/org/apache/jena/hadoop/rdf/io/output/turtle/StreamedTurtleOutputTest.java
----------------------------------------------------------------------
diff --git a/jena-hadoop-rdf/hadoop-rdf-io/src/test/java/org/apache/jena/hadoop/rdf/io/output/turtle/StreamedTurtleOutputTest.java b/jena-hadoop-rdf/hadoop-rdf-io/src/test/java/org/apache/jena/hadoop/rdf/io/output/turtle/StreamedTurtleOutputTest.java
deleted file mode 100644
index d8843d3..0000000
--- a/jena-hadoop-rdf/hadoop-rdf-io/src/test/java/org/apache/jena/hadoop/rdf/io/output/turtle/StreamedTurtleOutputTest.java
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.jena.hadoop.rdf.io.output.turtle;
-
-import java.util.Arrays;
-import java.util.Collection;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.NullWritable;
-import org.apache.hadoop.mapreduce.OutputFormat;
-import org.apache.jena.hadoop.rdf.io.RdfIOConstants;
-import org.apache.jena.hadoop.rdf.io.output.AbstractTripleOutputFormatTests;
-import org.apache.jena.hadoop.rdf.types.TripleWritable;
-import org.apache.jena.riot.Lang;
-import org.junit.runner.RunWith;
-import org.junit.runners.Parameterized;
-import org.junit.runners.Parameterized.Parameters;
-
-
-/**
- * Tests for Turtle output
- *
- *
- *
- */
-@RunWith(Parameterized.class)
-public class StreamedTurtleOutputTest extends AbstractTripleOutputFormatTests {
-
- static long $bs1 = RdfIOConstants.DEFAULT_OUTPUT_BATCH_SIZE;
- static long $bs2 = 1000;
- static long $bs3 = 100;
- static long $bs4 = 1;
-
- /**
- * @return Test parameters
- */
- @Parameters
- public static Collection<Object[]> data() {
- return Arrays.asList(new Object[][] { { $bs1 }, { $bs2 }, { $bs3 }, { $bs4 } });
- }
-
- private final long batchSize;
-
- /**
- * Creates new tests
- *
- * @param batchSize
- * Batch size
- */
- public StreamedTurtleOutputTest(long batchSize) {
- this.batchSize = batchSize;
- }
-
- @Override
- protected String getFileExtension() {
- return ".ttl";
- }
-
- @Override
- protected Lang getRdfLanguage() {
- return Lang.TURTLE;
- }
-
- @Override
- protected Configuration prepareConfiguration() {
- Configuration config = super.prepareConfiguration();
- config.setLong(RdfIOConstants.OUTPUT_BATCH_SIZE, this.batchSize);
- return config;
- }
-
- @Override
- protected OutputFormat<NullWritable, TripleWritable> getOutputFormat() {
- return new TurtleOutputFormat<NullWritable>();
- }
-
-}
http://git-wip-us.apache.org/repos/asf/jena/blob/a6c0fefc/jena-hadoop-rdf/hadoop-rdf-io/src/test/java/org/apache/jena/hadoop/rdf/io/output/turtle/TurtleBlankNodeOutputTests.java
----------------------------------------------------------------------
diff --git a/jena-hadoop-rdf/hadoop-rdf-io/src/test/java/org/apache/jena/hadoop/rdf/io/output/turtle/TurtleBlankNodeOutputTests.java b/jena-hadoop-rdf/hadoop-rdf-io/src/test/java/org/apache/jena/hadoop/rdf/io/output/turtle/TurtleBlankNodeOutputTests.java
deleted file mode 100644
index 8dcae4e..0000000
--- a/jena-hadoop-rdf/hadoop-rdf-io/src/test/java/org/apache/jena/hadoop/rdf/io/output/turtle/TurtleBlankNodeOutputTests.java
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.jena.hadoop.rdf.io.output.turtle;
-
-import java.io.File;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Set;
-
-import org.apache.hadoop.io.NullWritable;
-import org.apache.hadoop.mapreduce.OutputFormat;
-import org.apache.jena.hadoop.rdf.io.RdfIOConstants;
-import org.apache.jena.hadoop.rdf.types.TripleWritable;
-import org.apache.jena.riot.RDFDataMgr;
-import org.junit.Assert;
-import org.junit.runner.RunWith;
-import org.junit.runners.Parameterized;
-import org.junit.runners.Parameterized.Parameters;
-
-import com.hp.hpl.jena.datatypes.xsd.XSDDatatype;
-import com.hp.hpl.jena.graph.Node;
-import com.hp.hpl.jena.graph.NodeFactory;
-import com.hp.hpl.jena.graph.Triple;
-import com.hp.hpl.jena.rdf.model.Model;
-import com.hp.hpl.jena.rdf.model.ResIterator;
-import com.hp.hpl.jena.rdf.model.Resource;
-
-/**
- * Tests for Turtle output with blank nodes
- *
- *
- *
- */
-@RunWith(Parameterized.class)
-public class TurtleBlankNodeOutputTests extends StreamedTurtleOutputTest {
-
- static long $bs1 = RdfIOConstants.DEFAULT_OUTPUT_BATCH_SIZE;
- static long $bs2 = 1000;
- static long $bs3 = 100;
- static long $bs4 = 1;
-
- /**
- * @return Test parameters
- */
- @Parameters
- public static Collection<Object[]> data() {
- return Arrays.asList(new Object[][] { { $bs1 }, { $bs2 }, { $bs3 },
- { $bs4 } });
- }
-
- /**
- * Creates new tests
- *
- * @param batchSize
- * Batch size
- */
- public TurtleBlankNodeOutputTests(long batchSize) {
- super(batchSize);
- }
-
- @Override
- protected Iterator<TripleWritable> generateTuples(int num) {
- List<TripleWritable> ts = new ArrayList<TripleWritable>();
- Node subject = NodeFactory.createAnon();
- for (int i = 0; i < num; i++) {
- Triple t = new Triple(subject,
- NodeFactory.createURI("http://example.org/predicate"),
- NodeFactory.createLiteral(Integer.toString(i),
- XSDDatatype.XSDinteger));
- ts.add(new TripleWritable(t));
- }
- return ts.iterator();
- }
-
- @Override
- protected void checkTuples(File f, long expected) {
- super.checkTuples(f, expected);
-
- Model m = RDFDataMgr.loadModel("file://" + f.getAbsolutePath(),
- this.getRdfLanguage());
- ResIterator iter = m.listSubjects();
- Set<Node> subjects = new HashSet<Node>();
- while (iter.hasNext()) {
- Resource res = iter.next();
- Assert.assertTrue(res.isAnon());
- subjects.add(res.asNode());
- }
- // Should only be one subject unless the data was empty in which case
- // there will be zero subjects
- Assert.assertEquals(expected == 0 ? 0 : 1, subjects.size());
- }
-
- @Override
- protected OutputFormat<NullWritable, TripleWritable> getOutputFormat() {
- return new TurtleOutputFormat<NullWritable>();
- }
-
-}
http://git-wip-us.apache.org/repos/asf/jena/blob/a6c0fefc/jena-hadoop-rdf/hadoop-rdf-io/src/test/java/org/apache/jena/hadoop/rdf/io/registry/TestHadoopRdfIORegistry.java
----------------------------------------------------------------------
diff --git a/jena-hadoop-rdf/hadoop-rdf-io/src/test/java/org/apache/jena/hadoop/rdf/io/registry/TestHadoopRdfIORegistry.java b/jena-hadoop-rdf/hadoop-rdf-io/src/test/java/org/apache/jena/hadoop/rdf/io/registry/TestHadoopRdfIORegistry.java
deleted file mode 100644
index 2eae232..0000000
--- a/jena-hadoop-rdf/hadoop-rdf-io/src/test/java/org/apache/jena/hadoop/rdf/io/registry/TestHadoopRdfIORegistry.java
+++ /dev/null
@@ -1,186 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.jena.hadoop.rdf.io.registry;
-
-import java.io.IOException;
-import java.io.StringWriter;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.NullWritable;
-import org.apache.hadoop.mapreduce.RecordReader;
-import org.apache.hadoop.mapreduce.RecordWriter;
-import org.apache.jena.hadoop.rdf.types.QuadWritable;
-import org.apache.jena.hadoop.rdf.types.TripleWritable;
-import org.apache.jena.riot.Lang;
-import org.apache.jena.riot.RDFLanguages;
-import org.junit.Assert;
-import org.junit.Test;
-
-/**
- * Tests for the {@link HadoopRdfIORegistry}
- */
-public class TestHadoopRdfIORegistry {
-
- private void testLang(Lang lang, boolean triples, boolean quads, boolean writesSupported) {
- Assert.assertEquals(triples, HadoopRdfIORegistry.hasTriplesReader(lang));
- Assert.assertEquals(quads, HadoopRdfIORegistry.hasQuadReader(lang));
-
- // Some formats may be asymmetric
- if (writesSupported) {
- Assert.assertEquals(triples, HadoopRdfIORegistry.hasTriplesWriter(lang));
- Assert.assertEquals(quads, HadoopRdfIORegistry.hasQuadWriter(lang));
- } else {
- Assert.assertFalse(HadoopRdfIORegistry.hasTriplesWriter(lang));
- Assert.assertFalse(HadoopRdfIORegistry.hasQuadWriter(lang));
- }
-
- if (triples) {
- // Check that triples are supported
- RecordReader<LongWritable, TripleWritable> tripleReader;
- try {
- tripleReader = HadoopRdfIORegistry.createTripleReader(lang);
- Assert.assertNotNull(tripleReader);
- } catch (IOException e) {
- Assert.fail("Registry indicates that " + lang.getName()
- + " can read triples but fails to produce a triple reader when asked: " + e.getMessage());
- }
-
- if (writesSupported) {
- RecordWriter<NullWritable, TripleWritable> tripleWriter;
- try {
- tripleWriter = HadoopRdfIORegistry.createTripleWriter(lang, new StringWriter(), new Configuration(
- false));
- Assert.assertNotNull(tripleWriter);
- } catch (IOException e) {
- Assert.fail("Registry indicates that " + lang.getName()
- + " can write triples but fails to produce a triple writer when asked: " + e.getMessage());
- }
- }
- } else {
- // Check that triples are not supported
- try {
- HadoopRdfIORegistry.createTripleReader(lang);
- Assert.fail("Registry indicates that " + lang.getName()
- + " cannot read triples but produced a triple reader when asked (error was expected)");
- } catch (IOException e) {
- // This is expected
- }
- try {
- HadoopRdfIORegistry.createTripleWriter(lang, new StringWriter(), new Configuration(false));
- Assert.fail("Registry indicates that " + lang.getName()
- + " cannot write triples but produced a triple write when asked (error was expected)");
- } catch (IOException e) {
- // This is expected
- }
- }
-
- if (quads) {
- // Check that quads are supported
- RecordReader<LongWritable, QuadWritable> quadReader;
- try {
- quadReader = HadoopRdfIORegistry.createQuadReader(lang);
- Assert.assertNotNull(quadReader);
- } catch (IOException e) {
- Assert.fail("Registry indicates that " + lang.getName()
- + " can read quads but fails to produce a quad reader when asked: " + e.getMessage());
- }
-
- if (writesSupported) {
- RecordWriter<NullWritable, QuadWritable> quadWriter;
- try {
- quadWriter = HadoopRdfIORegistry.createQuadWriter(lang, new StringWriter(),
- new Configuration(false));
- Assert.assertNotNull(quadWriter);
- } catch (IOException e) {
- Assert.fail("Registry indicates that " + lang.getName()
- + " can write quads but fails to produce a triple writer when asked: " + e.getMessage());
- }
- }
- } else {
- try {
- HadoopRdfIORegistry.createQuadReader(lang);
- Assert.fail("Registry indicates that " + lang.getName()
- + " cannot read quads but produced a quad reader when asked (error was expected)");
- } catch (IOException e) {
- // This is expected
- }
- try {
- HadoopRdfIORegistry.createQuadWriter(lang, new StringWriter(), new Configuration(false));
- Assert.fail("Registry indicates that " + lang.getName()
- + " cannot write quads but produced a quad writer when asked (error was expected)");
- } catch (IOException e) {
- // This is expected
- }
- }
- }
-
- @Test
- public void json_ld_registered() {
- testLang(Lang.JSONLD, true, true, true);
- }
-
- @Test
- public void nquads_registered() {
- testLang(Lang.NQUADS, false, true, true);
- testLang(Lang.NQ, false, true, true);
- }
-
- @Test
- public void ntriples_registered() {
- testLang(Lang.NTRIPLES, true, false, true);
- testLang(Lang.NT, true, false, true);
- }
-
- @Test
- public void rdf_json_registered() {
- testLang(Lang.RDFJSON, true, false, true);
- }
-
- @Test
- public void rdf_xml_registered() {
- testLang(Lang.RDFXML, true, false, true);
- }
-
- @Test
- public void rdf_thrift_registered() {
- testLang(RDFLanguages.THRIFT, true, true, true);
- }
-
- @Test
- public void trig_registered() {
- testLang(Lang.TRIG, false, true, true);
- }
-
- @Test
- public void trix_registered() {
- testLang(Lang.TRIX, false, true, true);
- }
-
- @Test
- public void turtle_registered() {
- testLang(Lang.TURTLE, true, false, true);
- testLang(Lang.TTL, true, false, true);
- testLang(Lang.N3, true, false, true);
- }
-
- @Test
- public void unregistered() {
- testLang(Lang.RDFNULL, false, false, true);
- }
-}
http://git-wip-us.apache.org/repos/asf/jena/blob/a6c0fefc/jena-hadoop-rdf/hadoop-rdf-stats/hadoop-job.xml
----------------------------------------------------------------------
diff --git a/jena-hadoop-rdf/hadoop-rdf-stats/hadoop-job.xml b/jena-hadoop-rdf/hadoop-rdf-stats/hadoop-job.xml
deleted file mode 100644
index de72645..0000000
--- a/jena-hadoop-rdf/hadoop-rdf-stats/hadoop-job.xml
+++ /dev/null
@@ -1,46 +0,0 @@
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<assembly>
- <id>hadoop-job</id>
- <formats>
- <format>jar</format>
- </formats>
- <includeBaseDirectory>false</includeBaseDirectory>
- <dependencySets>
- <dependencySet>
- <unpack>false</unpack>
- <scope>runtime</scope>
- <outputDirectory>lib</outputDirectory>
- <excludes>
- <exclude>${groupId}:${artifactId}</exclude>
- </excludes>
- </dependencySet>
- <dependencySet>
- <unpack>true</unpack>
- <includes>
- <include>${groupId}:${artifactId}</include>
- </includes>
- </dependencySet>
- </dependencySets>
- <fileSets>
- <fileSet>
- <directory>${basedir}/target/test-classes</directory>
- <outputDirectory>/</outputDirectory>
- </fileSet>
- </fileSets>
-</assembly>
http://git-wip-us.apache.org/repos/asf/jena/blob/a6c0fefc/jena-hadoop-rdf/hadoop-rdf-stats/pom.xml
----------------------------------------------------------------------
diff --git a/jena-hadoop-rdf/hadoop-rdf-stats/pom.xml b/jena-hadoop-rdf/hadoop-rdf-stats/pom.xml
deleted file mode 100644
index bf69fa6..0000000
--- a/jena-hadoop-rdf/hadoop-rdf-stats/pom.xml
+++ /dev/null
@@ -1,103 +0,0 @@
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
- <parent>
- <groupId>org.apache.jena</groupId>
- <artifactId>jena-hadoop-rdf</artifactId>
- <version>0.9.0-SNAPSHOT</version>
- </parent>
- <artifactId>jena-hadoop-rdf-stats</artifactId>
- <name>Apache Jena - RDF Tools for Hadoop - Statistics Demo App</name>
- <description>A demo application that can be run on Hadoop to produce a statistical analysis on arbitrary RDF inputs</description>
-
- <dependencies>
- <!-- Internal Project Dependencies -->
- <dependency>
- <groupId>org.apache.jena</groupId>
- <artifactId>jena-hadoop-rdf-io</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.jena</groupId>
- <artifactId>jena-hadoop-rdf-mapreduce</artifactId>
- <version>${project.version}</version>
- </dependency>
-
- <!-- CLI related Dependencies -->
- <dependency>
- <groupId>io.airlift</groupId>
- <artifactId>airline</artifactId>
- <version>0.6</version>
- </dependency>
-
- <!-- Hadoop Dependencies -->
- <!-- Note these will be provided on the Hadoop cluster hence the provided
- scope -->
- <dependency>
- <groupId>org.apache.hadoop</groupId>
- <artifactId>hadoop-common</artifactId>
- <scope>provided</scope>
- </dependency>
- <dependency>
- <groupId>org.apache.hadoop</groupId>
- <artifactId>hadoop-mapreduce-client-common</artifactId>
- <scope>provided</scope>
- </dependency>
-
- <!-- Test Dependencies -->
- <dependency>
- <groupId>org.apache.jena</groupId>
- <artifactId>jena-hadoop-rdf-mapreduce</artifactId>
- <version>${project.version}</version>
- <classifier>tests</classifier>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>org.apache.mrunit</groupId>
- <artifactId>mrunit</artifactId>
- <scope>test</scope>
- <classifier>hadoop2</classifier>
- </dependency>
- </dependencies>
-
- <build>
- <plugins>
- <!-- Assembly plugin is used to produce the runnable Hadoop JAR with all
- dependencies contained therein -->
- <plugin>
- <artifactId>maven-assembly-plugin</artifactId>
- <configuration>
- <descriptors>
- <descriptor>hadoop-job.xml</descriptor>
- </descriptors>
- </configuration>
- <executions>
- <execution>
- <id>make-assembly</id>
- <phase>package</phase>
- <goals>
- <goal>single</goal>
- </goals>
- </execution>
- </executions>
- </plugin>
- </plugins>
- </build>
-</project>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/jena/blob/a6c0fefc/jena-hadoop-rdf/hadoop-rdf-stats/src/main/java/org/apache/jena/hadoop/rdf/stats/RdfStats.java
----------------------------------------------------------------------
diff --git a/jena-hadoop-rdf/hadoop-rdf-stats/src/main/java/org/apache/jena/hadoop/rdf/stats/RdfStats.java b/jena-hadoop-rdf/hadoop-rdf-stats/src/main/java/org/apache/jena/hadoop/rdf/stats/RdfStats.java
deleted file mode 100644
index 5f870ee..0000000
--- a/jena-hadoop-rdf/hadoop-rdf-stats/src/main/java/org/apache/jena/hadoop/rdf/stats/RdfStats.java
+++ /dev/null
@@ -1,405 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.jena.hadoop.rdf.stats;
-
-import io.airlift.command.Arguments;
-import io.airlift.command.Command;
-import io.airlift.command.Help;
-import io.airlift.command.HelpOption;
-import io.airlift.command.Option;
-import io.airlift.command.OptionType;
-import io.airlift.command.ParseArgumentsMissingException;
-import io.airlift.command.ParseArgumentsUnexpectedException;
-import io.airlift.command.ParseException;
-import io.airlift.command.ParseOptionMissingException;
-import io.airlift.command.ParseOptionMissingValueException;
-import io.airlift.command.SingleCommand;
-import io.airlift.command.model.CommandMetadata;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.concurrent.TimeUnit;
-
-import javax.inject.Inject;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.util.Tool;
-import org.apache.hadoop.util.ToolRunner;
-import org.apache.jena.hadoop.rdf.stats.jobs.JobFactory;
-
-
-/**
- * Entry point for the Hadoop job, handles launching all the relevant Hadoop
- * jobs
- */
-@Command(name = "bin/hadoop jar PATH_TO_JAR com.yarcdata.urika.hadoop.rdf.stats.RdfStats", description = "A command which computes statistics on RDF data using Hadoop")
-public class RdfStats implements Tool {
-
- static final String ANSI_RED = "\u001B[31m";
- static final String ANSI_RESET = "\u001B[0m";
-
- private static final String DATA_TYPE_TRIPLES = "triples", DATA_TYPE_QUADS = "quads", DATA_TYPE_MIXED = "mixed";
-
- /**
- * Help option
- */
- @Inject
- public HelpOption helpOption;
-
- /**
- * Gets/Sets whether all available statistics will be calculated
- */
- @Option(name = { "-a", "--all" }, description = "Requests that all available statistics be calculated", type = OptionType.COMMAND)
- public boolean all = false;
-
- /**
- * Gets/Sets whether node usage counts will be calculated
- */
- @Option(name = { "-n", "--node-count" }, description = "Requests that node usage counts be calculated", type = OptionType.COMMAND)
- public boolean nodeCount = false;
-
- /**
- * Gets/Sets whether characteristic sets will be calculated
- */
- @Option(name = { "-c", "--characteristic-sets" }, description = "Requests that characteristic sets be calculated", type = OptionType.COMMAND)
- public boolean characteristicSets = false;
-
- /**
- * Gets/Sets whether type counts will be calculated
- */
- @Option(name = { "-t", "--type-counts" }, description = "Requests that rdf:type usage counts be calculated", type = OptionType.COMMAND)
- public boolean typeCount = false;
-
- /**
- * Gets/Sets whether data type counts will be calculated
- */
- @Option(name = { "-d", "--data-types" }, description = "Requests that literal data type usage counts be calculated", type = OptionType.COMMAND)
- public boolean dataTypeCount = false;
-
- /**
- * Gets/Sets whether namespace counts will be calculated
- */
- @Option(name = { "--namespaces" }, description = "Requests that namespace usage counts be calculated", type = OptionType.COMMAND)
- public boolean namespaceCount = false;
-
- /**
- * Gets/Sets the input data type used
- */
- @Option(name = { "--input-type" }, allowedValues = { DATA_TYPE_MIXED, DATA_TYPE_QUADS, DATA_TYPE_TRIPLES }, description = "Specifies whether the input data is a mixture of quads and triples, just quads or just triples. Using the most specific data type will yield the most accurrate statistics")
- public String inputType = DATA_TYPE_MIXED;
-
- /**
- * Gets/Sets the output path
- */
- @Option(name = { "-o", "--output" }, title = "OutputPath", description = "Sets the output path", arity = 1, required = true)
- public String outputPath = null;
-
- /**
- * Gets/Sets the input path(s)
- */
- @Arguments(description = "Sets the input path(s)", title = "InputPath", required = true)
- public List<String> inputPaths = new ArrayList<String>();
-
- private Configuration config;
-
- /**
- * Entry point method
- *
- * @param args
- * Arguments
- * @throws Exception
- */
- public static void main(String[] args) throws Exception {
- try {
- // Run and exit with result code if no errors bubble up
- // Note that the exit code may still be a error code
- int res = ToolRunner.run(new Configuration(true), new RdfStats(), args);
- System.exit(res);
- } catch (Exception e) {
- System.err.println(ANSI_RED + e.getMessage());
- e.printStackTrace(System.err);
- } finally {
- System.err.print(ANSI_RESET);
- }
- // If any errors bubble up exit with non-zero code
- System.exit(1);
- }
-
- private static void showUsage() {
- CommandMetadata metadata = SingleCommand.singleCommand(RdfStats.class).getCommandMetadata();
- StringBuilder builder = new StringBuilder();
- Help.help(metadata, builder);
- System.err.print(ANSI_RESET);
- System.err.println(builder.toString());
- System.exit(1);
- }
-
- @Override
- public void setConf(Configuration conf) {
- this.config = conf;
- }
-
- @Override
- public Configuration getConf() {
- return this.config;
- }
-
- @Override
- public int run(String[] args) throws Exception {
- try {
- // Parse custom arguments
- RdfStats cmd = SingleCommand.singleCommand(RdfStats.class).parse(args);
-
- // Copy Hadoop configuration across
- cmd.setConf(this.getConf());
-
- // Show help if requested and exit with success
- if (cmd.helpOption.showHelpIfRequested()) {
- return 0;
- }
-
- // Run the command and exit with success
- cmd.run();
- return 0;
-
- } catch (ParseOptionMissingException e) {
- System.err.println(ANSI_RED + e.getMessage());
- System.err.println();
- showUsage();
- } catch (ParseOptionMissingValueException e) {
- System.err.println(ANSI_RED + e.getMessage());
- System.err.println();
- showUsage();
- } catch (ParseArgumentsMissingException e) {
- System.err.println(ANSI_RED + e.getMessage());
- System.err.println();
- showUsage();
- } catch (ParseArgumentsUnexpectedException e) {
- System.err.println(ANSI_RED + e.getMessage());
- System.err.println();
- showUsage();
- // TODO Re-enable as and when we upgrade Airline
- // } catch (ParseOptionIllegalValueException e) {
- // System.err.println(ANSI_RED + e.getMessage());
- // System.err.println();
- // showUsage();
- } catch (ParseException e) {
- System.err.println(ANSI_RED + e.getMessage());
- System.err.println();
- showUsage();
- } catch (UnsupportedOperationException e) {
- System.err.println(ANSI_RED + e.getMessage());
- } catch (Throwable e) {
- System.err.println(ANSI_RED + e.getMessage());
- e.printStackTrace(System.err);
- } finally {
- System.err.print(ANSI_RESET);
- }
- return 1;
- }
-
- private void run() throws Throwable {
- if (!this.outputPath.endsWith("/")) {
- this.outputPath += "/";
- }
-
- // If all statistics requested turn on all statistics
- if (this.all) {
- this.nodeCount = true;
- this.characteristicSets = true;
- this.typeCount = true;
- this.dataTypeCount = true;
- this.namespaceCount = true;
- }
-
- // How many statistics were requested?
- int statsRequested = 0;
- if (this.nodeCount)
- statsRequested++;
- if (this.characteristicSets)
- statsRequested++;
- if (this.typeCount)
- statsRequested++;
- if (this.dataTypeCount)
- statsRequested++;
- if (this.namespaceCount)
- statsRequested++;
-
- // Error if no statistics requested
- if (statsRequested == 0) {
- System.err
- .println("You did not request any statistics to be calculated, please use one/more of the relevant options to select the statistics to be computed");
- return;
- }
- int statsComputed = 1;
-
- // Compute statistics
- if (this.nodeCount) {
- Job job = this.selectNodeCountJob();
- statsComputed = this.computeStatistic(job, statsComputed, statsRequested);
- }
- if (this.typeCount) {
- Job[] jobs = this.selectTypeCountJobs();
- statsComputed = this.computeStatistic(jobs, false, false, statsComputed, statsRequested);
- }
- if (this.dataTypeCount) {
- Job job = this.selectDataTypeCountJob();
- statsComputed = this.computeStatistic(job, statsComputed, statsRequested);
- }
- if (this.namespaceCount) {
- Job job = this.selectNamespaceCountJob();
- statsComputed = this.computeStatistic(job, statsComputed, statsRequested);
- }
- if (this.characteristicSets) {
- Job[] jobs = this.selectCharacteristicSetJobs();
- statsComputed = this.computeStatistic(jobs, false, false, statsComputed, statsRequested);
- }
- }
-
- private int computeStatistic(Job job, int statsComputed, int statsRequested) throws Throwable {
- System.out.println(String.format("Computing Statistic %d of %d requested", statsComputed, statsRequested));
- this.runJob(job);
- System.out.println(String.format("Computed Statistic %d of %d requested", statsComputed, statsRequested));
- System.out.println();
- return ++statsComputed;
- }
-
- private int computeStatistic(Job[] jobs, boolean continueOnFailure, boolean continueOnError, int statsComputed,
- int statsRequested) {
- System.out.println(String.format("Computing Statistic %d of %d requested", statsComputed, statsRequested));
- this.runJobSequence(jobs, continueOnFailure, continueOnError);
- System.out.println(String.format("Computed Statistic %d of %d requested", statsComputed, statsRequested));
- System.out.println();
- return ++statsComputed;
- }
-
- private boolean runJob(Job job) throws Throwable {
- System.out.println("Submitting Job " + job.getJobName());
- long start = System.nanoTime();
- try {
- job.submit();
- if (job.monitorAndPrintJob()) {
- System.out.println("Job " + job.getJobName() + " succeeded");
- return true;
- } else {
- System.out.println("Job " + job.getJobName() + " failed");
- return false;
- }
- } catch (Throwable e) {
- System.out.println("Unexpected failure in Job " + job.getJobName());
- throw e;
- } finally {
- long end = System.nanoTime();
- System.out.println("Job " + job.getJobName() + " finished after "
- + String.format("%,d milliseconds", TimeUnit.NANOSECONDS.toMillis(end - start)));
- System.out.println();
- }
- }
-
- private void runJobSequence(Job[] jobs, boolean continueOnFailure, boolean continueOnError) {
- for (int i = 0; i < jobs.length; i++) {
- Job job = jobs[i];
- try {
- boolean success = this.runJob(job);
- if (!success && !continueOnFailure)
- throw new IllegalStateException("Unable to complete job sequence because Job " + job.getJobName() + " failed");
- } catch (IllegalStateException e) {
- throw e;
- } catch (Throwable e) {
- if (!continueOnError)
- throw new IllegalStateException("Unable to complete job sequence because job " + job.getJobName()
- + " errorred", e);
- }
- }
- }
-
- private Job selectNodeCountJob() throws IOException {
- String realOutputPath = outputPath + "node-counts/";
- String[] inputs = new String[this.inputPaths.size()];
- this.inputPaths.toArray(inputs);
-
- if (DATA_TYPE_QUADS.equals(this.inputType)) {
- return JobFactory.getQuadNodeCountJob(this.config, inputs, realOutputPath);
- } else if (DATA_TYPE_TRIPLES.equals(this.inputType)) {
- return JobFactory.getTripleNodeCountJob(this.config, inputs, realOutputPath);
- } else {
- return JobFactory.getNodeCountJob(this.config, inputs, realOutputPath);
- }
- }
-
- private Job selectDataTypeCountJob() throws IOException {
- String realOutputPath = outputPath + "data-type-counts/";
- String[] inputs = new String[this.inputPaths.size()];
- this.inputPaths.toArray(inputs);
-
- if (DATA_TYPE_QUADS.equals(this.inputType)) {
- return JobFactory.getQuadDataTypeCountJob(this.config, inputs, realOutputPath);
- } else if (DATA_TYPE_TRIPLES.equals(this.inputType)) {
- return JobFactory.getTripleDataTypeCountJob(this.config, inputs, realOutputPath);
- } else {
- return JobFactory.getDataTypeCountJob(this.config, inputs, realOutputPath);
- }
- }
-
- private Job selectNamespaceCountJob() throws IOException {
- String realOutputPath = outputPath + "namespace-counts/";
- String[] inputs = new String[this.inputPaths.size()];
- this.inputPaths.toArray(inputs);
-
- if (DATA_TYPE_QUADS.equals(this.inputType)) {
- return JobFactory.getQuadNamespaceCountJob(this.config, inputs, realOutputPath);
- } else if (DATA_TYPE_TRIPLES.equals(this.inputType)) {
- return JobFactory.getTripleNamespaceCountJob(this.config, inputs, realOutputPath);
- } else {
- return JobFactory.getNamespaceCountJob(this.config, inputs, realOutputPath);
- }
- }
-
- private Job[] selectCharacteristicSetJobs() throws IOException {
- String intermediateOutputPath = outputPath + "characteristics/intermediate/";
- String finalOutputPath = outputPath + "characteristics/final/";
- String[] inputs = new String[this.inputPaths.size()];
- this.inputPaths.toArray(inputs);
-
- if (DATA_TYPE_QUADS.equals(this.inputType)) {
- return JobFactory.getQuadCharacteristicSetJobs(this.config, inputs, intermediateOutputPath, finalOutputPath);
- } else if (DATA_TYPE_TRIPLES.equals(this.inputType)) {
- return JobFactory.getTripleCharacteristicSetJobs(this.config, inputs, intermediateOutputPath, finalOutputPath);
- } else {
- return JobFactory.getCharacteristicSetJobs(this.config, inputs, intermediateOutputPath, finalOutputPath);
- }
- }
-
- private Job[] selectTypeCountJobs() throws IOException {
- String intermediateOutputPath = outputPath + "type-declarations/";
- String finalOutputPath = outputPath + "type-counts/";
- String[] inputs = new String[this.inputPaths.size()];
- this.inputPaths.toArray(inputs);
-
- if (DATA_TYPE_QUADS.equals(this.inputType)) {
- return JobFactory.getQuadTypeCountJobs(this.config, inputs, intermediateOutputPath, finalOutputPath);
- } else if (DATA_TYPE_TRIPLES.equals(this.inputType)) {
- return JobFactory.getTripleTypeCountJobs(this.config, inputs, intermediateOutputPath, finalOutputPath);
- } else {
- return JobFactory.getTypeCountJobs(this.config, inputs, intermediateOutputPath, finalOutputPath);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/jena/blob/a6c0fefc/jena-hadoop-rdf/hadoop-rdf-stats/src/main/java/org/apache/jena/hadoop/rdf/stats/jobs/JobFactory.java
----------------------------------------------------------------------
diff --git a/jena-hadoop-rdf/hadoop-rdf-stats/src/main/java/org/apache/jena/hadoop/rdf/stats/jobs/JobFactory.java b/jena-hadoop-rdf/hadoop-rdf-stats/src/main/java/org/apache/jena/hadoop/rdf/stats/jobs/JobFactory.java
deleted file mode 100644
index 55bb8af..0000000
--- a/jena-hadoop-rdf/hadoop-rdf-stats/src/main/java/org/apache/jena/hadoop/rdf/stats/jobs/JobFactory.java
+++ /dev/null
@@ -1,757 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.jena.hadoop.rdf.stats.jobs;
-
-import java.io.IOException;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.NullWritable;
-import org.apache.hadoop.io.SequenceFile.CompressionType;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.compress.BZip2Codec;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
-import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;
-import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
-import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
-import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
-import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.jena.hadoop.rdf.io.input.QuadsInputFormat;
-import org.apache.jena.hadoop.rdf.io.input.TriplesInputFormat;
-import org.apache.jena.hadoop.rdf.io.input.TriplesOrQuadsInputFormat;
-import org.apache.jena.hadoop.rdf.io.input.nquads.NQuadsInputFormat;
-import org.apache.jena.hadoop.rdf.io.input.ntriples.NTriplesInputFormat;
-import org.apache.jena.hadoop.rdf.io.output.nquads.NQuadsOutputFormat;
-import org.apache.jena.hadoop.rdf.io.output.ntriples.NTriplesNodeOutputFormat;
-import org.apache.jena.hadoop.rdf.io.output.ntriples.NTriplesOutputFormat;
-import org.apache.jena.hadoop.rdf.mapreduce.KeyMapper;
-import org.apache.jena.hadoop.rdf.mapreduce.RdfMapReduceConstants;
-import org.apache.jena.hadoop.rdf.mapreduce.TextCountReducer;
-import org.apache.jena.hadoop.rdf.mapreduce.characteristics.CharacteristicSetReducer;
-import org.apache.jena.hadoop.rdf.mapreduce.characteristics.QuadCharacteristicSetGeneratingReducer;
-import org.apache.jena.hadoop.rdf.mapreduce.characteristics.TripleCharacteristicSetGeneratingReducer;
-import org.apache.jena.hadoop.rdf.mapreduce.count.NodeCountReducer;
-import org.apache.jena.hadoop.rdf.mapreduce.count.QuadNodeCountMapper;
-import org.apache.jena.hadoop.rdf.mapreduce.count.TripleNodeCountMapper;
-import org.apache.jena.hadoop.rdf.mapreduce.count.datatypes.QuadDataTypeCountMapper;
-import org.apache.jena.hadoop.rdf.mapreduce.count.datatypes.TripleDataTypeCountMapper;
-import org.apache.jena.hadoop.rdf.mapreduce.count.namespaces.QuadNamespaceCountMapper;
-import org.apache.jena.hadoop.rdf.mapreduce.count.namespaces.TripleNamespaceCountMapper;
-import org.apache.jena.hadoop.rdf.mapreduce.count.positional.QuadObjectCountMapper;
-import org.apache.jena.hadoop.rdf.mapreduce.count.positional.TripleObjectCountMapper;
-import org.apache.jena.hadoop.rdf.mapreduce.filter.positional.QuadFilterByPredicateMapper;
-import org.apache.jena.hadoop.rdf.mapreduce.filter.positional.TripleFilterByPredicateUriMapper;
-import org.apache.jena.hadoop.rdf.mapreduce.group.QuadGroupBySubjectMapper;
-import org.apache.jena.hadoop.rdf.mapreduce.group.TripleGroupBySubjectMapper;
-import org.apache.jena.hadoop.rdf.types.CharacteristicSetWritable;
-import org.apache.jena.hadoop.rdf.types.NodeWritable;
-import org.apache.jena.hadoop.rdf.types.QuadWritable;
-import org.apache.jena.hadoop.rdf.types.TripleWritable;
-
-import com.hp.hpl.jena.vocabulary.RDF;
-
-/**
- * Factory that can produce {@link Job} instances for computing various RDF
- * statistics
- *
- *
- *
- */
-public class JobFactory {
-
- /**
- * Private constructor prevents instantiation
- */
- private JobFactory() {
- }
-
- /**
- * Gets a job for computing node counts on RDF triple inputs
- *
- * @param config
- * Configuration
- * @param inputPaths
- * Input paths
- * @param outputPath
- * Output path
- * @return Job
- * @throws IOException
- */
- public static Job getTripleNodeCountJob(Configuration config, String[] inputPaths, String outputPath) throws IOException {
- Job job = Job.getInstance(config);
- job.setJarByClass(JobFactory.class);
- job.setJobName("RDF Triples Node Usage Count");
-
- // Map/Reduce classes
- job.setMapperClass(TripleNodeCountMapper.class);
- job.setMapOutputKeyClass(NodeWritable.class);
- job.setMapOutputValueClass(LongWritable.class);
- job.setReducerClass(NodeCountReducer.class);
-
- // Input and Output
- job.setInputFormatClass(TriplesInputFormat.class);
- job.setOutputFormatClass(NTriplesNodeOutputFormat.class);
- FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
- FileOutputFormat.setOutputPath(job, new Path(outputPath));
-
- return job;
- }
-
- /**
- * Gets a job for computing node counts on RDF quad inputs
- *
- * @param config
- * Configuration
- * @param inputPaths
- * Input paths
- * @param outputPath
- * Output path
- * @return Job
- * @throws IOException
- */
- public static Job getQuadNodeCountJob(Configuration config, String[] inputPaths, String outputPath) throws IOException {
- Job job = Job.getInstance(config);
- job.setJarByClass(JobFactory.class);
- job.setJobName("RDF Quads Node Usage Count");
-
- // Map/Reduce classes
- job.setMapperClass(QuadNodeCountMapper.class);
- job.setMapOutputKeyClass(NodeWritable.class);
- job.setMapOutputValueClass(LongWritable.class);
- job.setReducerClass(NodeCountReducer.class);
-
- // Input and Output
- job.setInputFormatClass(QuadsInputFormat.class);
- job.setOutputFormatClass(NTriplesNodeOutputFormat.class);
- FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
- FileOutputFormat.setOutputPath(job, new Path(outputPath));
-
- return job;
- }
-
- /**
- * Gets a job for computing node counts on RDF triple and/or quad inputs
- *
- * @param config
- * Configuration
- * @param inputPaths
- * Input paths
- * @param outputPath
- * Output path
- * @return Job
- * @throws IOException
- */
- public static Job getNodeCountJob(Configuration config, String[] inputPaths, String outputPath) throws IOException {
- Job job = Job.getInstance(config);
- job.setJarByClass(JobFactory.class);
- job.setJobName("RDF Node Usage Count");
-
- // Map/Reduce classes
- job.setMapperClass(QuadNodeCountMapper.class);
- job.setMapOutputKeyClass(NodeWritable.class);
- job.setMapOutputValueClass(LongWritable.class);
- job.setReducerClass(NodeCountReducer.class);
-
- // Input and Output
- job.setInputFormatClass(TriplesOrQuadsInputFormat.class);
- job.setOutputFormatClass(NTriplesNodeOutputFormat.class);
- FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
- FileOutputFormat.setOutputPath(job, new Path(outputPath));
-
- return job;
- }
-
- /**
- * Gets a sequence of jobs that can be used to compute characteristic sets
- * for RDF triples
- *
- * @param config
- * Configuration
- * @param inputPaths
- * Input paths
- * @param intermediateOutputPath
- * Intermediate output path
- * @param outputPath
- * Final output path
- * @return Sequence of jobs
- * @throws IOException
- */
- public static Job[] getTripleCharacteristicSetJobs(Configuration config, String[] inputPaths, String intermediateOutputPath,
- String outputPath) throws IOException {
- Job[] jobs = new Job[2];
-
- Job job = Job.getInstance(config);
- job.setJarByClass(JobFactory.class);
- job.setJobName("RDF Triples Characteristic Set (Generation)");
-
- // Map/Reduce classes
- job.setMapperClass(TripleGroupBySubjectMapper.class);
- job.setMapOutputKeyClass(NodeWritable.class);
- job.setMapOutputValueClass(TripleWritable.class);
- job.setReducerClass(TripleCharacteristicSetGeneratingReducer.class);
- job.setOutputKeyClass(CharacteristicSetWritable.class);
- job.setOutputValueClass(NullWritable.class);
-
- // Input and Output
- job.setInputFormatClass(TriplesInputFormat.class);
- job.setOutputFormatClass(SequenceFileOutputFormat.class);
- FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
- FileOutputFormat.setOutputPath(job, new Path(intermediateOutputPath));
- SequenceFileOutputFormat.setCompressOutput(job, true);
- FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
- SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);
-
- jobs[0] = job;
-
- job = Job.getInstance(config);
- job.setJarByClass(JobFactory.class);
- job.setJobName("RDF Triples Characteristic Set (Reduction)");
-
- // Map/Reduce classes
- job.setMapperClass(KeyMapper.class);
- job.setMapOutputKeyClass(CharacteristicSetWritable.class);
- job.setMapOutputValueClass(CharacteristicSetWritable.class);
- job.setReducerClass(CharacteristicSetReducer.class);
- job.setOutputKeyClass(CharacteristicSetWritable.class);
- job.setOutputValueClass(CharacteristicSetWritable.class);
-
- // Input and Output
- job.setInputFormatClass(SequenceFileInputFormat.class);
- job.setOutputFormatClass(TextOutputFormat.class);
- FileInputFormat.setInputPaths(job, intermediateOutputPath);
- FileOutputFormat.setOutputPath(job, new Path(outputPath));
-
- jobs[1] = job;
- return jobs;
- }
-
- /**
- * Gets a sequence of jobs that can be used to compute characteristic sets
- * for RDF quads
- *
- * @param config
- * Configuration
- * @param inputPaths
- * Input paths
- * @param intermediateOutputPath
- * Intermediate output path
- * @param outputPath
- * Final output path
- * @return Sequence of jobs
- * @throws IOException
- */
- public static Job[] getQuadCharacteristicSetJobs(Configuration config, String[] inputPaths, String intermediateOutputPath,
- String outputPath) throws IOException {
- Job[] jobs = new Job[2];
-
- Job job = Job.getInstance(config);
- job.setJarByClass(JobFactory.class);
- job.setJobName("RDF Quads Characteristic Set (Generation)");
-
- // Map/Reduce classes
- job.setMapperClass(QuadGroupBySubjectMapper.class);
- job.setMapOutputKeyClass(NodeWritable.class);
- job.setMapOutputValueClass(QuadWritable.class);
- job.setReducerClass(QuadCharacteristicSetGeneratingReducer.class);
- job.setOutputKeyClass(CharacteristicSetWritable.class);
- job.setOutputValueClass(NullWritable.class);
-
- // Input and Output
- job.setInputFormatClass(QuadsInputFormat.class);
- job.setOutputFormatClass(SequenceFileOutputFormat.class);
- FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
- FileOutputFormat.setOutputPath(job, new Path(intermediateOutputPath));
- SequenceFileOutputFormat.setCompressOutput(job, true);
- FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
- SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);
-
- jobs[0] = job;
-
- job = Job.getInstance(config);
- job.setJarByClass(JobFactory.class);
- job.setJobName("RDF Quads Characteristic Set (Reduction)");
-
- // Map/Reduce classes
- job.setMapperClass(KeyMapper.class);
- job.setMapOutputKeyClass(CharacteristicSetWritable.class);
- job.setMapOutputValueClass(CharacteristicSetWritable.class);
- job.setReducerClass(CharacteristicSetReducer.class);
- job.setOutputKeyClass(CharacteristicSetWritable.class);
- job.setOutputValueClass(CharacteristicSetWritable.class);
-
- // Input and Output
- job.setInputFormatClass(SequenceFileInputFormat.class);
- job.setOutputFormatClass(TextOutputFormat.class);
- FileInputFormat.setInputPaths(job, intermediateOutputPath);
- FileOutputFormat.setOutputPath(job, new Path(outputPath));
-
- jobs[1] = job;
- return jobs;
- }
-
- /**
- * Gets a sequence of jobs that can be used to compute characteristic sets
- * for RDF triple and/or quad inputs
- *
- * @param config
- * Configuration
- * @param inputPaths
- * Input paths
- * @param intermediateOutputPath
- * Intermediate output path
- * @param outputPath
- * Final output path
- * @return Sequence of jobs
- * @throws IOException
- */
- public static Job[] getCharacteristicSetJobs(Configuration config, String[] inputPaths, String intermediateOutputPath,
- String outputPath) throws IOException {
- Job[] jobs = new Job[2];
-
- Job job = Job.getInstance(config);
- job.setJarByClass(JobFactory.class);
- job.setJobName("RDF Characteristic Set (Generation)");
-
- // Map/Reduce classes
- job.setMapperClass(QuadGroupBySubjectMapper.class);
- job.setMapOutputKeyClass(NodeWritable.class);
- job.setMapOutputValueClass(QuadWritable.class);
- job.setReducerClass(QuadCharacteristicSetGeneratingReducer.class);
- job.setOutputKeyClass(CharacteristicSetWritable.class);
- job.setOutputValueClass(NullWritable.class);
-
- // Input and Output
- job.setInputFormatClass(TriplesOrQuadsInputFormat.class);
- job.setOutputFormatClass(SequenceFileOutputFormat.class);
- FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
- FileOutputFormat.setOutputPath(job, new Path(intermediateOutputPath));
- SequenceFileOutputFormat.setCompressOutput(job, true);
- FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
- SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);
-
- jobs[0] = job;
-
- job = Job.getInstance(config);
- job.setJarByClass(JobFactory.class);
- job.setJobName("RDF Characteristic Set (Reduction)");
-
- // Map/Reduce classes
- job.setMapperClass(KeyMapper.class);
- job.setMapOutputKeyClass(CharacteristicSetWritable.class);
- job.setMapOutputValueClass(CharacteristicSetWritable.class);
- job.setReducerClass(CharacteristicSetReducer.class);
- job.setOutputKeyClass(CharacteristicSetWritable.class);
- job.setOutputValueClass(CharacteristicSetWritable.class);
-
- // Input and Output
- job.setInputFormatClass(SequenceFileInputFormat.class);
- job.setOutputFormatClass(TextOutputFormat.class);
- FileInputFormat.setInputPaths(job, intermediateOutputPath);
- FileOutputFormat.setOutputPath(job, new Path(outputPath));
-
- jobs[1] = job;
- return jobs;
- }
-
- /**
- * Gets a job for computing type counts on RDF triple inputs
- *
- * @param config
- * Configuration
- * @param inputPaths
- * Input paths
- * @param intermediateOutputPath
- * Path for intermediate output which will be all the type
- * declaration triples present in the inputs
- * @param outputPath
- * Output path
- * @return Job
- * @throws IOException
- */
- public static Job[] getTripleTypeCountJobs(Configuration config, String[] inputPaths, String intermediateOutputPath,
- String outputPath) throws IOException {
- Job[] jobs = new Job[2];
-
- Job job = Job.getInstance(config);
- job.setJarByClass(JobFactory.class);
- job.setJobName("RDF Type Triples Extraction");
-
- // Map/Reduce classes
- job.getConfiguration().setStrings(RdfMapReduceConstants.FILTER_PREDICATE_URIS, RDF.type.getURI());
- job.setMapperClass(TripleFilterByPredicateUriMapper.class);
- job.setMapOutputKeyClass(LongWritable.class);
- job.setMapOutputValueClass(TripleWritable.class);
-
- // Input and Output Format
- job.setInputFormatClass(TriplesInputFormat.class);
- job.setOutputFormatClass(NTriplesOutputFormat.class);
- FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
- FileOutputFormat.setOutputPath(job, new Path(intermediateOutputPath));
-
- jobs[0] = job;
-
- // Object Node Usage count job
- job = Job.getInstance(config);
- job.setJarByClass(JobFactory.class);
- job.setJobName("RDF Triples Type Usage Count");
-
- // Map/Reduce classes
- job.setMapperClass(TripleObjectCountMapper.class);
- job.setMapOutputKeyClass(NodeWritable.class);
- job.setMapOutputValueClass(LongWritable.class);
- job.setReducerClass(NodeCountReducer.class);
-
- // Input and Output
- job.setInputFormatClass(NTriplesInputFormat.class);
- NLineInputFormat.setNumLinesPerSplit(job, 10000); // TODO Would be
- // better if this was
- // intelligently
- // configured
- job.setOutputFormatClass(NTriplesNodeOutputFormat.class);
- FileInputFormat.setInputPaths(job, intermediateOutputPath);
- FileOutputFormat.setOutputPath(job, new Path(outputPath));
-
- jobs[1] = job;
-
- return jobs;
- }
-
- /**
- * Gets a job for computing type counts on RDF quad inputs
- *
- * @param config
- * Configuration
- * @param inputPaths
- * Input paths
- * @param intermediateOutputPath
- * Path for intermediate output which will be all the type
- * declaration quads present in the inputs
- * @param outputPath
- * Output path
- * @return Job
- * @throws IOException
- */
- public static Job[] getQuadTypeCountJobs(Configuration config, String[] inputPaths, String intermediateOutputPath,
- String outputPath) throws IOException {
- Job[] jobs = new Job[2];
-
- Job job = Job.getInstance(config);
- job.setJarByClass(JobFactory.class);
- job.setJobName("RDF Type Quads Extraction");
-
- // Map/Reduce classes
- job.getConfiguration().setStrings(RdfMapReduceConstants.FILTER_PREDICATE_URIS, RDF.type.getURI());
- job.setMapperClass(QuadFilterByPredicateMapper.class);
- job.setMapOutputKeyClass(LongWritable.class);
- job.setMapOutputValueClass(QuadWritable.class);
-
- // Input and Output Format
- job.setInputFormatClass(QuadsInputFormat.class);
- job.setOutputFormatClass(NQuadsOutputFormat.class);
- FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
- FileOutputFormat.setOutputPath(job, new Path(intermediateOutputPath));
-
- jobs[0] = job;
-
- // Object Node Usage count job
- job = Job.getInstance(config);
- job.setJarByClass(JobFactory.class);
- job.setJobName("RDF Quads Type Usage Count");
-
- // Map/Reduce classes
- job.setMapperClass(QuadObjectCountMapper.class);
- job.setMapOutputKeyClass(NodeWritable.class);
- job.setMapOutputValueClass(LongWritable.class);
- job.setReducerClass(NodeCountReducer.class);
-
- // Input and Output
- job.setInputFormatClass(NQuadsInputFormat.class);
- NLineInputFormat.setNumLinesPerSplit(job, 10000); // TODO Would be
- // better if this was
- // intelligently
- // configured
- job.setOutputFormatClass(NTriplesNodeOutputFormat.class);
- FileInputFormat.setInputPaths(job, intermediateOutputPath);
- FileOutputFormat.setOutputPath(job, new Path(outputPath));
-
- jobs[1] = job;
-
- return jobs;
- }
-
- /**
- * Gets a job for computing type counts on RDF triple and/or quad inputs
- *
- * @param config
- * Configuration
- * @param inputPaths
- * Input paths
- * @param intermediateOutputPath
- * Path for intermediate output which will be all the type
- * declaration quads present in the inputs
- * @param outputPath
- * Output path
- * @return Jobs
- * @throws IOException
- */
- public static Job[] getTypeCountJobs(Configuration config, String[] inputPaths, String intermediateOutputPath,
- String outputPath) throws IOException {
- Job[] jobs = new Job[2];
-
- Job job = Job.getInstance(config);
- job.setJarByClass(JobFactory.class);
- job.setJobName("RDF Type Extraction");
-
- // Map/Reduce classes
- job.getConfiguration().setStrings(RdfMapReduceConstants.FILTER_PREDICATE_URIS, RDF.type.getURI());
- job.setMapperClass(QuadFilterByPredicateMapper.class);
- job.setMapOutputKeyClass(LongWritable.class);
- job.setMapOutputValueClass(QuadWritable.class);
-
- // Input and Output Format
- job.setInputFormatClass(TriplesOrQuadsInputFormat.class);
- job.setOutputFormatClass(NQuadsOutputFormat.class);
- FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
- FileOutputFormat.setOutputPath(job, new Path(intermediateOutputPath));
-
- jobs[0] = job;
-
- // Object Node Usage count job
- job = Job.getInstance(config);
- job.setJarByClass(JobFactory.class);
- job.setJobName("RDF Type Usage Count");
-
- // Map/Reduce classes
- job.setMapperClass(QuadObjectCountMapper.class);
- job.setMapOutputKeyClass(NodeWritable.class);
- job.setMapOutputValueClass(LongWritable.class);
- job.setReducerClass(NodeCountReducer.class);
-
- // Input and Output
- job.setInputFormatClass(NQuadsInputFormat.class);
- NLineInputFormat.setNumLinesPerSplit(job, 10000); // TODO Would be
- // better if this was
- // intelligently
- // configured
- job.setOutputFormatClass(NTriplesNodeOutputFormat.class);
- FileInputFormat.setInputPaths(job, intermediateOutputPath);
- FileOutputFormat.setOutputPath(job, new Path(outputPath));
-
- jobs[1] = job;
-
- return jobs;
- }
-
- /**
- * Gets a job for computing literal data type counts on RDF triple inputs
- *
- * @param config
- * Configuration
- * @param inputPaths
- * Input paths
- * @param outputPath
- * Output path
- * @return Job
- * @throws IOException
- */
- public static Job getTripleDataTypeCountJob(Configuration config, String[] inputPaths, String outputPath) throws IOException {
- Job job = Job.getInstance(config);
- job.setJarByClass(JobFactory.class);
- job.setJobName("RDF Triples Literal Data Type Usage Count");
-
- // Map/Reduce classes
- job.setMapperClass(TripleDataTypeCountMapper.class);
- job.setMapOutputKeyClass(NodeWritable.class);
- job.setMapOutputValueClass(LongWritable.class);
- job.setReducerClass(NodeCountReducer.class);
-
- // Input and Output
- job.setInputFormatClass(TriplesInputFormat.class);
- job.setOutputFormatClass(NTriplesNodeOutputFormat.class);
- FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
- FileOutputFormat.setOutputPath(job, new Path(outputPath));
-
- return job;
- }
-
- /**
- * Gets a job for computing literal data type counts on RDF quad inputs
- *
- * @param config
- * Configuration
- * @param inputPaths
- * Input paths
- * @param outputPath
- * Output path
- * @return Job
- * @throws IOException
- */
- public static Job getQuadDataTypeCountJob(Configuration config, String[] inputPaths, String outputPath) throws IOException {
- Job job = Job.getInstance(config);
- job.setJarByClass(JobFactory.class);
- job.setJobName("RDF Quads Literal Data Type Usage Count");
-
- // Map/Reduce classes
- job.setMapperClass(QuadDataTypeCountMapper.class);
- job.setMapOutputKeyClass(NodeWritable.class);
- job.setMapOutputValueClass(LongWritable.class);
- job.setReducerClass(NodeCountReducer.class);
-
- // Input and Output
- job.setInputFormatClass(QuadsInputFormat.class);
- job.setOutputFormatClass(NTriplesNodeOutputFormat.class);
- FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
- FileOutputFormat.setOutputPath(job, new Path(outputPath));
-
- return job;
- }
-
- /**
- * Gets a job for computing literal data type counts on RDF triple and/or
- * quad inputs
- *
- * @param config
- * Configuration
- * @param inputPaths
- * Input paths
- * @param outputPath
- * Output path
- * @return Job
- * @throws IOException
- */
- public static Job getDataTypeCountJob(Configuration config, String[] inputPaths, String outputPath) throws IOException {
- Job job = Job.getInstance(config);
- job.setJarByClass(JobFactory.class);
- job.setJobName("RDF Literal Data Type Usage Count");
-
- // Map/Reduce classes
- job.setMapperClass(QuadDataTypeCountMapper.class);
- job.setMapOutputKeyClass(NodeWritable.class);
- job.setMapOutputValueClass(LongWritable.class);
- job.setReducerClass(NodeCountReducer.class);
-
- // Input and Output
- job.setInputFormatClass(TriplesOrQuadsInputFormat.class);
- job.setOutputFormatClass(NTriplesNodeOutputFormat.class);
- FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
- FileOutputFormat.setOutputPath(job, new Path(outputPath));
-
- return job;
- }
-
- /**
- * Gets a job for computing namespace usage counts on RDF triple inputs
- *
- * @param config
- * Configuration
- * @param inputPaths
- * Input paths
- * @param outputPath
- * Output path
- * @return Job
- * @throws IOException
- */
- public static Job getTripleNamespaceCountJob(Configuration config, String[] inputPaths, String outputPath) throws IOException {
- Job job = Job.getInstance(config);
- job.setJarByClass(JobFactory.class);
- job.setJobName("RDF Triples Namespace Usage Count");
-
- // Map/Reduce classes
- job.setMapperClass(TripleNamespaceCountMapper.class);
- job.setMapOutputKeyClass(Text.class);
- job.setMapOutputValueClass(LongWritable.class);
- job.setReducerClass(TextCountReducer.class);
-
- // Input and Output
- job.setInputFormatClass(TriplesInputFormat.class);
- job.setOutputFormatClass(TextOutputFormat.class);
- FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
- FileOutputFormat.setOutputPath(job, new Path(outputPath));
-
- return job;
- }
-
- /**
- * Gets a job for computing namespace usage counts on RDF quad inputs
- *
- * @param config
- * Configuration
- * @param inputPaths
- * Input paths
- * @param outputPath
- * Output path
- * @return Job
- * @throws IOException
- */
- public static Job getQuadNamespaceCountJob(Configuration config, String[] inputPaths, String outputPath) throws IOException {
- Job job = Job.getInstance(config);
- job.setJarByClass(JobFactory.class);
- job.setJobName("RDF Quads Namespace Usage Count");
-
- // Map/Reduce classes
- job.setMapperClass(QuadNamespaceCountMapper.class);
- job.setMapOutputKeyClass(Text.class);
- job.setMapOutputValueClass(LongWritable.class);
- job.setReducerClass(TextCountReducer.class);
-
- // Input and Output
- job.setInputFormatClass(QuadsInputFormat.class);
- job.setOutputFormatClass(TextOutputFormat.class);
- FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
- FileOutputFormat.setOutputPath(job, new Path(outputPath));
-
- return job;
- }
-
- /**
- * Gets a job for computing namespace usage counts on RDF triple and/or
- * quad inputs
- *
- * @param config
- * Configuration
- * @param inputPaths
- * Input paths
- * @param outputPath
- * Output path
- * @return Job
- * @throws IOException
- */
- public static Job getNamespaceCountJob(Configuration config, String[] inputPaths, String outputPath) throws IOException {
- Job job = Job.getInstance(config);
- job.setJarByClass(JobFactory.class);
- job.setJobName("RDF Namespace Usage Count");
-
- // Map/Reduce classes
- job.setMapperClass(QuadNamespaceCountMapper.class);
- job.setMapOutputKeyClass(Text.class);
- job.setMapOutputValueClass(LongWritable.class);
- job.setReducerClass(TextCountReducer.class);
-
- // Input and Output
- job.setInputFormatClass(TriplesOrQuadsInputFormat.class);
- job.setOutputFormatClass(TextOutputFormat.class);
- FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
- FileOutputFormat.setOutputPath(job, new Path(outputPath));
-
- return job;
- }
-}
http://git-wip-us.apache.org/repos/asf/jena/blob/a6c0fefc/jena-hadoop-rdf/jena-elephas-common/pom.xml
----------------------------------------------------------------------
diff --git a/jena-hadoop-rdf/jena-elephas-common/pom.xml b/jena-hadoop-rdf/jena-elephas-common/pom.xml
new file mode 100644
index 0000000..7dd68a0
--- /dev/null
+++ b/jena-hadoop-rdf/jena-elephas-common/pom.xml
@@ -0,0 +1,54 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+ <parent>
+ <groupId>org.apache.jena</groupId>
+ <artifactId>jena-elephas</artifactId>
+ <version>0.9.0-SNAPSHOT</version>
+ </parent>
+ <artifactId>jena-elephas-common</artifactId>
+ <name>Apache Jena - Elephas - Common API</name>
+ <description>Common code for RDF on Hadoop such as writable types for RDF primitives</description>
+
+ <!-- Note that versions are managed by parent POMs -->
+ <dependencies>
+ <!-- Hadoop Dependencies -->
+ <!-- Note these will be provided on the Hadoop cluster hence the provided
+ scope -->
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-common</artifactId>
+ <scope>provided</scope>
+ </dependency>
+
+ <!-- Jena dependencies -->
+ <dependency>
+ <groupId>org.apache.jena</groupId>
+ <artifactId>jena-arq</artifactId>
+ </dependency>
+
+ <!-- Test Dependencies -->
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+</project>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/jena/blob/a6c0fefc/jena-hadoop-rdf/jena-elephas-common/src/main/java/org/apache/jena/hadoop/rdf/types/AbstractNodeTupleWritable.java
----------------------------------------------------------------------
diff --git a/jena-hadoop-rdf/jena-elephas-common/src/main/java/org/apache/jena/hadoop/rdf/types/AbstractNodeTupleWritable.java b/jena-hadoop-rdf/jena-elephas-common/src/main/java/org/apache/jena/hadoop/rdf/types/AbstractNodeTupleWritable.java
new file mode 100644
index 0000000..f0acc09
--- /dev/null
+++ b/jena-hadoop-rdf/jena-elephas-common/src/main/java/org/apache/jena/hadoop/rdf/types/AbstractNodeTupleWritable.java
@@ -0,0 +1,193 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.hadoop.rdf.types;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.io.WritableUtils;
+import com.hp.hpl.jena.graph.Node;
+import com.hp.hpl.jena.sparql.util.NodeUtils;
+
+/**
+ * An abstract general purpose writable where the actual class represented is
+ * composed of a number of {@link Node} instances
+ * <p>
+ * The binary encoding of this base implementation is just a variable integer
+ * indicating the number of nodes present followed by the binary encodings of
+ * the {@link NodeWritable} instances. Derived implementations may wish to
+ * override the {@link #readFields(DataInput)} and {@link #write(DataOutput)}
+ * methods in order to use more specialised encodings.
+ * </p>
+ *
+ * @param <T>
+ * Tuple type
+ */
+public abstract class AbstractNodeTupleWritable<T> implements WritableComparable<AbstractNodeTupleWritable<T>> {
+
+ private T tuple;
+
+ /**
+ * Creates a new empty instance
+ */
+ protected AbstractNodeTupleWritable() {
+ this(null);
+ }
+
+ /**
+ * Creates a new instance with the given value
+ *
+ * @param tuple
+ * Tuple value
+ */
+ protected AbstractNodeTupleWritable(T tuple) {
+ this.tuple = tuple;
+ }
+
+ /**
+ * Gets the tuple
+ *
+ * @return Tuple
+ */
+ public T get() {
+ return this.tuple;
+ }
+
+ /**
+ * Sets the tuple
+ *
+ * @param tuple
+ * Tuple
+ */
+ public void set(T tuple) {
+ this.tuple = tuple;
+ }
+
+ @Override
+ public void readFields(DataInput input) throws IOException {
+ // Determine how many nodes
+ int size = WritableUtils.readVInt(input);
+ Node[] ns = new Node[size];
+
+ NodeWritable nw = new NodeWritable();
+ for (int i = 0; i < ns.length; i++) {
+ nw.readFields(input);
+ ns[i] = nw.get();
+ }
+
+ // Load the tuple
+ this.tuple = this.createTuple(ns);
+ }
+
+ /**
+ * Creates the actual tuple type from an array of nodes
+ *
+ * @param ns
+ * Nodes
+ * @return Tuple
+ */
+ protected abstract T createTuple(Node[] ns);
+
+ @Override
+ public void write(DataOutput output) throws IOException {
+ // Determine how many nodes
+ Node[] ns = this.createNodes(this.tuple);
+ WritableUtils.writeVInt(output, ns.length);
+
+ // Write out nodes
+ NodeWritable nw = new NodeWritable();
+ for (int i = 0; i < ns.length; i++) {
+ nw.set(ns[i]);
+ nw.write(output);
+ }
+ }
+
+ /**
+ * Sets the tuple value
+ * <p>
+ * Intended only for internal use i.e. when a derived implementation
+ * overrides {@link #readFields(DataInput)} and needs to set the tuple value
+ * directly i.e. when a derived implementation is using a custom encoding
+ * scheme
+ * </p>
+ *
+ * @param tuple
+ * Tuple
+ */
+ protected final void setInternal(T tuple) {
+ this.tuple = tuple;
+ }
+
+ /**
+ * Converts the actual tuple type into an array of nodes
+ *
+ * @param tuple
+ * Tuples
+ * @return Nodes
+ */
+ protected abstract Node[] createNodes(T tuple);
+
+ /**
+ * Compares instances node by node
+ * <p>
+ * Derived implementations may wish to override this and substitute native
+ * tuple based comparisons
+ * </p>
+ *
+ * @param other
+ * Instance to compare with
+ */
+ @Override
+ public int compareTo(AbstractNodeTupleWritable<T> other) {
+ Node[] ns = this.createNodes(this.tuple);
+ Node[] otherNs = this.createNodes(other.tuple);
+
+ if (ns.length < otherNs.length) {
+ return -1;
+ } else if (ns.length > otherNs.length) {
+ return 1;
+ }
+ // Compare node by node
+ for (int i = 0; i < ns.length; i++) {
+ int c = NodeUtils.compareRDFTerms(ns[i], otherNs[i]);
+ if (c != 0)
+ return c;
+ }
+ return 0;
+ }
+
+ @Override
+ public String toString() {
+ return this.get().toString();
+ }
+
+ @Override
+ public int hashCode() {
+ return this.get().hashCode();
+ }
+
+ @SuppressWarnings("unchecked")
+ @Override
+ public boolean equals(Object other) {
+ if (!(other instanceof AbstractNodeTupleWritable))
+ return false;
+ return this.compareTo((AbstractNodeTupleWritable<T>) other) == 0;
+ }
+}