You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2013/06/07 12:37:11 UTC
svn commit: r1490574 - in /stanbol/trunk/commons/indexedgraph/src:
main/java/org/apache/stanbol/commons/indexedgraph/IndexedTripleCollection.java
test/java/org/apache/stanbol/commons/indexedgraph/IndexedGraphTest.java
Author: rwesten
Date: Fri Jun 7 10:37:11 2013
New Revision: 1490574
URL: http://svn.apache.org/r1490574
Log:
CLEREZZA-683: re-implementation of the Comparator for the indexed in-memory graph to support different Clerezza Resource implementation. The new comparator is about 100% solver for updates. For reads (e.g. the filter() method) there is no mesureable difference. See the according comment on the CLEREZZA-683 Issue for details.
Modified:
stanbol/trunk/commons/indexedgraph/src/main/java/org/apache/stanbol/commons/indexedgraph/IndexedTripleCollection.java
stanbol/trunk/commons/indexedgraph/src/test/java/org/apache/stanbol/commons/indexedgraph/IndexedGraphTest.java
Modified: stanbol/trunk/commons/indexedgraph/src/main/java/org/apache/stanbol/commons/indexedgraph/IndexedTripleCollection.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/commons/indexedgraph/src/main/java/org/apache/stanbol/commons/indexedgraph/IndexedTripleCollection.java?rev=1490574&r1=1490573&r2=1490574&view=diff
==============================================================================
--- stanbol/trunk/commons/indexedgraph/src/main/java/org/apache/stanbol/commons/indexedgraph/IndexedTripleCollection.java (original)
+++ stanbol/trunk/commons/indexedgraph/src/main/java/org/apache/stanbol/commons/indexedgraph/IndexedTripleCollection.java Fri Jun 7 10:37:11 2013
@@ -19,19 +19,26 @@ package org.apache.stanbol.commons.index
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
+import java.util.EnumSet;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NavigableSet;
+import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import org.apache.clerezza.rdf.core.BNode;
+import org.apache.clerezza.rdf.core.Language;
+import org.apache.clerezza.rdf.core.Literal;
import org.apache.clerezza.rdf.core.NonLiteral;
+import org.apache.clerezza.rdf.core.PlainLiteral;
import org.apache.clerezza.rdf.core.Resource;
import org.apache.clerezza.rdf.core.Triple;
import org.apache.clerezza.rdf.core.TripleCollection;
+import org.apache.clerezza.rdf.core.TypedLiteral;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.clerezza.rdf.core.impl.AbstractTripleCollection;
import org.apache.clerezza.rdf.core.impl.TripleImpl;
@@ -162,8 +169,10 @@ class IndexedTripleCollection extends Ab
public IndexedTripleCollection(Collection<Triple> baseCollection) {
super();
spo.addAll(baseCollection);
- pos.addAll(baseCollection);
- osp.addAll(baseCollection);
+ //use internal index to fill the other indexes, because the parsed
+ //collection might be slow
+ pos.addAll(spo);
+ osp.addAll(spo);
}
@Override
@@ -271,65 +280,187 @@ class IndexedTripleCollection extends Ab
return Integer.MAX_VALUE;
};
};
-
+
+// /**
+// * Compares two resources with special support for {@link #MIN} and
+// * {@link #MAX} to allow building {@link SortedSet#subSet(Object, Object)}
+// * for <code>null</code> values parsed to
+// * {@link #filter(NonLiteral, UriRef, Resource)}
+// * @param a
+// * @param b
+// * @return
+// */
+// protected static int compareHash(Resource a, Resource b, Map<Integer,List<Resource>> confictsMap) {
+// int hashA = a.hashCode();
+// int hashB = b.hashCode();
+// if (hashA != hashB) {
+// return hashA > hashB ? 1 : -1;
+// }
+// //those resources might be equals
+// //(1) Check for MIN, MAX (used to build sub-sets). Other resources might
+// // have a similar hasCode
+// int state = a == MIN || b == MAX ? -1 :
+// a == MAX || b == MIN ? 1 : 0;
+// if(state == 0){
+// if(a.equals(b)){ //check of the resources are equals
+// return 0; //return zero
+// } else if(//we need to care about HashCode conflicts
+// a instanceof BNode && b instanceof BNode){ // of BNodes
+// log.info("HashCode conflict for {} and {}",a,b); //we have a conflict
+// return resolveBNodeHashConflict(a, b, confictsMap);
+// } else { //same hashCode but not equals
+// //use the String representation of the Resources to sort them
+// String as = resource2String(a);
+// String bs = resource2String(b);
+// log.info("same hash code {} - compare Strings a: {}, b: {}",
+// new Object[]{a.hashCode(),as,bs});
+// return as.compareTo(bs);
+// }
+// }
+// return state;
+// }
/**
- * Compares two resources with special support for {@link #MIN} and
- * {@link #MAX} to allow building {@link SortedSet#subSet(Object, Object)}
- * for <code>null</code> values parsed to
- * {@link #filter(NonLiteral, UriRef, Resource)}
- * @param a
- * @param b
- * @return
- */
- protected static int compare(Resource a, Resource b, Map<Integer,List<Resource>> confictsMap) {
- int hashA = a.hashCode();
- int hashB = b.hashCode();
- if (hashA != hashB) {
- return hashA > hashB ? 1 : -1;
- }
- int state = a == MIN || b == MAX ? -1 :
- a == MAX || b == MIN ? 1 : a.toString().compareTo(b.toString());
- if(state == 0 && //same string representation -> should be equals
- a instanceof BNode && //but for BNodes it might be a hashcode conflict
- !a.equals(b)){ //so check if they are not equals
- log.info("HashCode conflict for {} and {}",a,b); //we have a conflict
- //This is not a bad thing. We need just to ensure constant ordering
- //and as there is nothing we can use to distinguish we need to keep
- //this information in a list.
- Integer hash = Integer.valueOf(a.hashCode());
- List<Resource> resources = confictsMap.get(hash);
- if(resources == null){ //new conflict ... just add and return
- resources = new ArrayList<Resource>(2);
- confictsMap.put(hash, resources);
- resources.add(a);
- resources.add(b);
- return -1;
+ * Resolved BNode hasConflics, by storing the correct order for the affected
+ * {@link Integer} in a {@link List} of Resource instances.
+ * @param a the first {@link BNode}
+ * @param b the second {@link BNode}
+ * @param confictsMap the Map used to store the order of BNodes with conflicts
+ * @return the decision taken based on the confictsMap.
+ */
+ private static int resolveBNodeHashConflict(Resource a, Resource b,
+ Map<Integer,List<Resource>> confictsMap) {
+ //This is not a bad thing. We need just to ensure constant ordering
+ //and as there is nothing we can use to distinguish we need to keep
+ //this information in a list.
+ Integer hash = Integer.valueOf(a.hashCode());
+ List<Resource> resources = confictsMap.get(hash);
+ if(resources == null){ //new conflict ... just add and return
+ resources = new ArrayList<Resource>(2);
+ confictsMap.put(hash, resources);
+ resources.add(a);
+ resources.add(b);
+ return -1;
+ }
+ //already conflicting resource for this hash present
+ int aIndex=-1;
+ int bIndex=-1;
+ for(int i = 0; i<resources.size() && (aIndex < 0 || bIndex < 0);i++){
+ Resource r = resources.get(i);
+ if(aIndex < 0 && r.equals(a)){
+ aIndex = i;
}
- //already conflicting resource for this hash present
- int aIndex=-1;
- int bIndex=-1;
- for(int i = 0; i<resources.size() && (aIndex < 0 || bIndex < 0);i++){
- Resource r = resources.get(i);
- if(aIndex < 0 && r.equals(a)){
- aIndex = i;
+ if(bIndex < 0 && r.equals(b)){
+ bIndex = i;
+ }
+ }
+ if(aIndex < 0){ //a not found
+ aIndex = resources.size();
+ resources.add(a);
+ }
+ if(bIndex < 0){ //b not found
+ bIndex = resources.size();
+ resources.add(b);
+ }
+ return aIndex < bIndex ? -1 : 1;
+ }
+ /**
+ * Compares Resources to correctly sort them within the index.<p>
+ * Sort criteria are:<ol>
+ * <li> URIs are sorted by the {@link UriRef#getUnicodeString()} unicode string)
+ * <li> Literals
+ * <ol>
+ * <li> sort by the {@link Literal#getLexicalForm() lixical form}
+ * <li> sort by {@link PlainLiteral#getLanguage() language} (<code>null</code> value first)
+ * <li> sort by {@link TypedLiteral#getDataType() type} (<code>null</code> value fist
+ * </ol>
+ * <li> BNode
+ * <ol>
+ * <li> sorted by their {@link System#identityHashCode(Object) Object hasCode}
+ * <li> on hasCode conflicts (same hasCode but not equals) a random order is chosen
+ * and kept in the parsed conflictsMap
+ * </ol>
+ * </ol>
+ * <b>NOTEs</b><ul>
+ * <li> parsed {@link Resource} are not required to correctly implement
+ * {@link Object#hashCode() hashCode} and {@link Object#equals(Object) equals}
+ * <li> parsed {@link UriRef} and {@link BNode} and {@link Literal} MUST NOT
+ * extend/implement any of the other classes/interfaces. This means that an
+ * {@link UriRef} MUST NOT implement {@link BNode} nor {@link Literal}
+ * <li> parsed {@link Literal}s MAY implement {@link PlainLiteral} AND
+ * {@link TypedLiteral}. This allows wrappers over frameworks that do not
+ * distinguish between those two literal types to be used with the
+ * {@link IndexedTripleCollection}.
+ * </ul>
+ *
+ * @param a the first resource to compare
+ * @param b the second resource to compare
+ * @param confictsMap the map used to resolve BNodes with hasCode conflicts
+ * @return
+ */
+ protected static int compare(Resource a, Resource b, Map<Integer,List<Resource>> confictsMap){
+ //Handle special cases for MAX and MIN values
+ if(a == MIN || b == MAX) {
+ return -1 ;
+ } else if(a == MAX || b == MIN){
+ return 1;
+ }
+ //sort (0) UriRefs < (1) Literals (PlainLiterals & TypedLiterals) < (3) BNodes
+ int at = a instanceof UriRef ? 0 : a instanceof Literal ? 1 : 2;
+ int bt = b instanceof UriRef ? 0 : b instanceof Literal ? 1 : 2;
+ if(at == bt){ //same type sort the different types
+ if(at < 2){ //no BNode
+ //sort in alphabetic order of the string representation
+ String as = at == 0 ? ((UriRef)a).getUnicodeString() :
+ ((Literal)a).getLexicalForm();
+ String bs = bt == 0 ? ((UriRef)b).getUnicodeString() :
+ ((Literal)b).getLexicalForm();
+ int sc = as.compareTo(bs);
+ if(sc == 0 && at == 1){ //same string value and Literals
+ //check if the language and types are the same
+ Language al = a instanceof PlainLiteral ? ((PlainLiteral)a).getLanguage() : null;
+ Language bl = b instanceof PlainLiteral ? ((PlainLiteral)b).getLanguage() : null;
+ //first try to sort by language
+ if(al == null){
+ sc = bl == null ? 0 : -1;
+ } else if(bl == null){
+ sc = 1;
+ } else {
+ sc = al.toString().compareTo(bl.toString());
+ }
+ if(sc == 0){
+ //if still equals look at the dataType
+ UriRef adt = a instanceof TypedLiteral ? ((TypedLiteral)a).getDataType() : null;
+ UriRef bdt = b instanceof TypedLiteral ? ((TypedLiteral)b).getDataType() : null;
+ if(adt == null){
+ sc = bdt == null ? 0 : -1;
+ } else if(bdt == null){
+ sc = 1;
+ } else {
+ sc = adt.getUnicodeString().compareTo(bdt.getUnicodeString());
+ }
+ }
+ return sc;
+ } else { //for UriRefs return the string compare
+ return sc;
}
- if(bIndex < 0 && r.equals(b)){
- bIndex = i;
+ } else { //handle BNodes
+ //sort BNodes based on hashCode
+ int ah = System.identityHashCode(a);
+ int bh = System.identityHashCode(b);
+ if(ah == bh){
+ if(!a.equals(b)){
+ return resolveBNodeHashConflict(a, b, confictsMap);
+ } else { //same hash and equals
+ return 0;
+ }
+ } else { //sort by hash
+ return ah < bh ? -1 : 1;
}
}
- if(aIndex < 0){ //a not found
- aIndex = resources.size();
- resources.add(a);
- }
- if(bIndex < 0){ //b not found
- bIndex = resources.size();
- resources.add(b);
- }
- return aIndex < bIndex ? -1 : 1;
- } //no conflict or no conflict possible
- return state;
+ } else {
+ return at < bt ? -1 : 1;
+ }
}
-
}
Modified: stanbol/trunk/commons/indexedgraph/src/test/java/org/apache/stanbol/commons/indexedgraph/IndexedGraphTest.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/commons/indexedgraph/src/test/java/org/apache/stanbol/commons/indexedgraph/IndexedGraphTest.java?rev=1490574&r1=1490573&r2=1490574&view=diff
==============================================================================
--- stanbol/trunk/commons/indexedgraph/src/test/java/org/apache/stanbol/commons/indexedgraph/IndexedGraphTest.java (original)
+++ stanbol/trunk/commons/indexedgraph/src/test/java/org/apache/stanbol/commons/indexedgraph/IndexedGraphTest.java Fri Jun 7 10:37:11 2013
@@ -17,8 +17,10 @@
package org.apache.stanbol.commons.indexedgraph;
import java.util.ArrayList;
+import java.util.Collection;
import java.util.Collections;
import java.util.ConcurrentModificationException;
+import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
@@ -173,14 +175,19 @@ public class IndexedGraphTest extends M
@Test
public void testPerformance(){
//Reduced values to fix STANBOL-
- MGraph sg = new SimpleMGraph();
+ Set<Triple> graph = new HashSet<Triple>();
int iterations = 100; //reduced from 1000
int graphsize = 100000;
Long seed = System.currentTimeMillis();
log.info("Test Seed: {}",seed);
- createGraph(sg, graphsize, seed);
- MGraph ig = new IndexedMGraph(sg);
- long start;
+ createGraph(graph, graphsize, seed);
+ log.info("Load Time ({} triples)", graph.size());
+ long start = System.currentTimeMillis();
+ MGraph sg = new SimpleMGraph(graph);
+ log.info(" ... {}: {}",sg.getClass().getSimpleName(), System.currentTimeMillis()-start);
+ start = System.currentTimeMillis();
+ MGraph ig = new IndexedMGraph(graph);
+ log.info(" ... {}: {}",ig.getClass().getSimpleName(), System.currentTimeMillis()-start);
//Simple Graph reference test
TestCase testCase = new TestCase(sg, 20, 5, 20); //reduced form 100,5,100
log.info("Filter Performance Test (graph size {} triples, iterations {})",graphsize,iterations);
@@ -365,7 +372,7 @@ public class IndexedGraphTest extends M
return count;
}
- private static void createGraph(TripleCollection tc, int triples, Long seed){
+ private static void createGraph(Collection<Triple> tc, int triples, Long seed){
Random rnd = new Random();
if(seed != null){
rnd.setSeed(seed);