You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2014/04/16 11:46:25 UTC

svn commit: r1587844 - in /stanbol/branches/release-0.12/entityhub: generic/servicesapi/src/main/java/org/apache/stanbol/entityhub/servicesapi/util/ query/sparql/ query/sparql/src/main/java/org/apache/stanbol/entityhub/query/sparql/ query/sparql/src/te...

Author: rwesten
Date: Wed Apr 16 09:46:24 2014
New Revision: 1587844

URL: http://svn.apache.org/r1587844
Log:
fix for STANBOL-1277: The utility methods for generating word matching regex patterns were added to the PatternUtils (entityhub servicesapi). The TextConstraint to SPARQL conversion was adapted to call these new methods. Also case sensitive string matches are now encoded using regex. Otherwise it would not be possible to match single words.

Modified:
    stanbol/branches/release-0.12/entityhub/generic/servicesapi/src/main/java/org/apache/stanbol/entityhub/servicesapi/util/PatternUtils.java
    stanbol/branches/release-0.12/entityhub/query/sparql/pom.xml
    stanbol/branches/release-0.12/entityhub/query/sparql/src/main/java/org/apache/stanbol/entityhub/query/sparql/SparqlQueryUtils.java
    stanbol/branches/release-0.12/entityhub/query/sparql/src/test/java/org/apache/stanbol/entityhub/query/sparql/SparqlQueryUtilsTest.java

Modified: stanbol/branches/release-0.12/entityhub/generic/servicesapi/src/main/java/org/apache/stanbol/entityhub/servicesapi/util/PatternUtils.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/entityhub/generic/servicesapi/src/main/java/org/apache/stanbol/entityhub/servicesapi/util/PatternUtils.java?rev=1587844&r1=1587843&r2=1587844&view=diff
==============================================================================
--- stanbol/branches/release-0.12/entityhub/generic/servicesapi/src/main/java/org/apache/stanbol/entityhub/servicesapi/util/PatternUtils.java (original)
+++ stanbol/branches/release-0.12/entityhub/generic/servicesapi/src/main/java/org/apache/stanbol/entityhub/servicesapi/util/PatternUtils.java Wed Apr 16 09:46:24 2014
@@ -34,6 +34,31 @@ public final class PatternUtils {
         if(strict){
             regex.append('^');
         }
+        encodeWildcard(wildcard, regex);
+        if(strict){
+            regex.append('$');
+        }
+        return regex.toString();
+    }
+
+    /**
+     * Converts a Wildcard search string to REGEX matching whole words in 
+     * the text. 
+     * @param wildcard the wildcard pattern
+     * @return the regex pattern for the parsed wildcard
+     * @since 0.12.1
+     */
+    public static String wildcardWordToRegex(String wildcard){
+        StringBuilder regex = new StringBuilder("\\b");
+        encodeWildcard(wildcard, regex);
+        return regex.append("\\b").toString();
+    }
+    /**
+     * Internally used to convert a wildcard to a regex
+     * @param wildcard
+     * @param regex
+     */
+    private static void encodeWildcard(String wildcard, StringBuilder regex) {
         for (char c : wildcard.toCharArray()) {
             switch(c) {
                 case '*':
@@ -52,28 +77,49 @@ public final class PatternUtils {
                     break;
             }
         }
-        if(strict){
-            regex.append('$');
-        }
-        return regex.toString();
     }
+    
     public static String value2Regex(String value){
         return '^'+escapeRegex(value)+'$';
     }
-    public static String escapeRegex(String wildcard){
-        StringBuilder escaped = new StringBuilder();
-        for (char c : wildcard.toCharArray()) {
+    /**
+     * Creates a regex that matches values against whole words
+     * (<code>\b{value}\b</code>)
+     * @param word the word to match
+     * @return the regex to match words
+     * @since 0.12.1
+     */
+    public static String word2Regex(String word){
+        return escapeRegex(word, new StringBuilder("\\b")).append("\\b").toString();
+    }
+    
+    public static String escapeRegex(String value){
+        return escapeRegex(value, null).toString();
+    }
+    /**
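+     * Escapes the regex special characters of the value and appends the result to a {@link StringBuilder}.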
+     * @param value the value to escape
+     * @param sb the {@link StringBuilder} or <code>null</code> if a new 
+     * instance should be created
+     * @return the passed (or newly created) {@link StringBuilder} with the escaped value appended.
+     * @since 0.12.1
+     */
+    public static StringBuilder escapeRegex(String value, StringBuilder sb){
+        if(sb == null){
+            sb = new StringBuilder();
+        }
+        for (char c : value.toCharArray()) {
             switch(c) {
                 case '*': case '?': case '(': case ')': case '[': case ']':
                 case '$': case '^': case '.': case '{': case '}': case '|':
                 case '\\':
-                    escaped.append("\\"); //add the escape char
+                    sb.append("\\"); //add the escape char
                 default:
-                    escaped.append(c); //add the char
+                    sb.append(c); //add the char
                     break;
             }
         }
-        return escaped.toString();
+        return sb;
     }
     public static final Pattern PREFIX_REGEX_PATTERN = Pattern.compile("[\\?\\*]");
     /**

Modified: stanbol/branches/release-0.12/entityhub/query/sparql/pom.xml
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/entityhub/query/sparql/pom.xml?rev=1587844&r1=1587843&r2=1587844&view=diff
==============================================================================
--- stanbol/branches/release-0.12/entityhub/query/sparql/pom.xml (original)
+++ stanbol/branches/release-0.12/entityhub/query/sparql/pom.xml Wed Apr 16 09:46:24 2014
@@ -83,6 +83,11 @@
   <dependencies>
     <dependency>
       <groupId>org.apache.stanbol</groupId>
+      <artifactId>org.apache.stanbol.entityhub.servicesapi</artifactId>
+      <version>0.12.1-SNAPSHOT</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.stanbol</groupId>
       <artifactId>org.apache.stanbol.entityhub.core</artifactId>
       <version>0.11.0</version>
     </dependency>

Modified: stanbol/branches/release-0.12/entityhub/query/sparql/src/main/java/org/apache/stanbol/entityhub/query/sparql/SparqlQueryUtils.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/entityhub/query/sparql/src/main/java/org/apache/stanbol/entityhub/query/sparql/SparqlQueryUtils.java?rev=1587844&r1=1587843&r2=1587844&view=diff
==============================================================================
--- stanbol/branches/release-0.12/entityhub/query/sparql/src/main/java/org/apache/stanbol/entityhub/query/sparql/SparqlQueryUtils.java (original)
+++ stanbol/branches/release-0.12/entityhub/query/sparql/src/main/java/org/apache/stanbol/entityhub/query/sparql/SparqlQueryUtils.java Wed Apr 16 09:46:24 2014
@@ -23,7 +23,6 @@ import java.math.BigInteger;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
-import java.util.Collections;
 import java.util.Date;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -60,7 +59,7 @@ public final class SparqlQueryUtils {
 
     private static final Logger log = LoggerFactory.getLogger(SparqlQueryUtils.class);
 
-    private static final String XSD_DATE_TIME = "http://www.w3.org/2001/XMLSchema#dateTime";
+    //private static final String XSD_DATE_TIME = "http://www.w3.org/2001/XMLSchema#dateTime";
     //private static final DateFormat DATE_FORMAT = new W3CDateFormat();
 
     private SparqlQueryUtils() {}
@@ -757,36 +756,39 @@ public final class SparqlQueryUtils {
                     queryString.append(" \n").append(intend).append("  FILTER(");
                     filterAdded = true;
                     if (constraint.getPatternType() == PatternType.none) {
-                        if (constraint.isCaseSensitive()) {
-                            boolean first = true;
-                            if(constraint.getTexts().size() > 1){
-                                queryString.append('('); //start language filter group (STANBOL-1204)
-                            }
-                            for (String textConstraint : constraint.getTexts()) {
-                                if (first) {
-                                    first = false;
-                                } else {
-                                    queryString.append(" || ");
-                                }
-                                if (textConstraint != null && !textConstraint.isEmpty()) {
-                                    queryString.append("(str(").append(var).append(") = \"");
-                                    addGrammarEscapedValue(queryString, textConstraint);
-                                    queryString.append("\")");
-                                }
-                            }
-                            if(constraint.getTexts().size() > 1){
-                                queryString.append(')'); //end language filter group (STANBOL-1204)
-                            }
-                        } else {
-                            Collection<String> regexQueryTexts = new ArrayList<String>(
-                                    constraint.getTexts().size());
-                            for (String textConstraint : constraint.getTexts()) {
-                                if (textConstraint != null && !textConstraint.isEmpty()) {
-                                    regexQueryTexts.add(PatternUtils.value2Regex(textConstraint));
-                                }
+                        //as we want to match also single words in labels
+                        //we need also to use regex instead of string matching
+                        //in case of case sensitive matches (STANBOL-1277)
+//                        if (constraint.isCaseSensitive()) {
+//                            boolean first = true;
+//                            if(constraint.getTexts().size() > 1){
+//                                queryString.append('('); //start language filter group (STANBOL-1204)
+//                            }
+//                            for (String textConstraint : constraint.getTexts()) {
+//                                if (first) {
+//                                    first = false;
+//                                } else {
+//                                    queryString.append(" || ");
+//                                }
+//                                if (textConstraint != null && !textConstraint.isEmpty()) {
+//                                    queryString.append("(str(").append(var).append(") = \"");
+//                                    addGrammarEscapedValue(queryString, textConstraint);
+//                                    queryString.append("\")");
+//                                }
+//                            }
+//                            if(constraint.getTexts().size() > 1){
+//                                queryString.append(')'); //end language filter group (STANBOL-1204)
+//                            }
+//                        } else {
+                        Collection<String> regexQueryTexts = new ArrayList<String>(
+                                constraint.getTexts().size());
+                        for (String textConstraint : constraint.getTexts()) {
+                            if (textConstraint != null && !textConstraint.isEmpty()) {
+                                regexQueryTexts.add(PatternUtils.word2Regex(textConstraint));
                             }
-                            addRegexFilter(queryString, var, regexQueryTexts, constraint.isCaseSensitive());
                         }
+                        addRegexFilter(queryString, var, regexQueryTexts, constraint.isCaseSensitive());
+//                        }
                     } else if (constraint.getPatternType() == PatternType.wildcard) {
                         // parse false, because that is more in line with the
                         // expectations of users!
@@ -794,7 +796,7 @@ public final class SparqlQueryUtils {
                                 .size());
                         for (String textConstraint : constraint.getTexts()) {
                             if (textConstraint != null && !textConstraint.isEmpty()) {
-                                regexQueryTexts.add(PatternUtils.wildcardToRegex(textConstraint, false));
+                                regexQueryTexts.add(PatternUtils.wildcardWordToRegex(textConstraint));
                             }
                         }
                         addRegexFilter(queryString, var, regexQueryTexts, constraint.isCaseSensitive());

Modified: stanbol/branches/release-0.12/entityhub/query/sparql/src/test/java/org/apache/stanbol/entityhub/query/sparql/SparqlQueryUtilsTest.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/entityhub/query/sparql/src/test/java/org/apache/stanbol/entityhub/query/sparql/SparqlQueryUtilsTest.java?rev=1587844&r1=1587843&r2=1587844&view=diff
==============================================================================
--- stanbol/branches/release-0.12/entityhub/query/sparql/src/test/java/org/apache/stanbol/entityhub/query/sparql/SparqlQueryUtilsTest.java (original)
+++ stanbol/branches/release-0.12/entityhub/query/sparql/src/test/java/org/apache/stanbol/entityhub/query/sparql/SparqlQueryUtilsTest.java Wed Apr 16 09:46:24 2014
@@ -33,6 +33,7 @@ import org.apache.stanbol.entityhub.serv
 import org.apache.stanbol.entityhub.servicesapi.query.FieldQueryFactory;
 import org.apache.stanbol.entityhub.servicesapi.query.TextConstraint;
 import org.apache.stanbol.entityhub.servicesapi.query.TextConstraint.PatternType;
+import org.junit.Assert;
 import org.junit.Test;
 
 public class SparqlQueryUtilsTest {
@@ -104,6 +105,34 @@ public class SparqlQueryUtilsTest {
     	assertTrue(queryRegex.contains(testString.replaceAll("\\\"", "\\\\\"")));
     }
 
-	
+    /**
+     * Tests word level matching for {@link TextConstraint}s (STANBOL-1277)
+     */
+    @Test
+	public void testMultiWordTextConstraints(){
+        //queries for a TextConstraint with {text1} or {text2} in the languages
+        // {lang1} or {lang2} are expected to look like:
+        //
+        //    select ?entity, ?label where {
+        //        ?entity rdfs:label ?label
+        //        FILTER((regex(str(?label),"\\b{text1}\\b","i") || regex(str(?label),"\\b{text2}\\b","i")) 
+        //            && ((lang(?label) = "{lang1}") || (lang(?label) = "{lang2}"))) . 
+        //    }
+        
+        //first test a pattern type NONE
+        SparqlFieldQuery query = SparqlFieldQueryFactory.getInstance().createFieldQuery();
+        query.setConstraint("urn:field4", new TextConstraint(Arrays.asList("Global","Toy"), PatternType.none, false, "en", null));
+        String queryString = SparqlQueryUtils.createSparqlSelectQuery(query, true, 0, SparqlEndpointTypeEnum.Standard);
+        Assert.assertTrue(queryString.contains("regex(str(?tmp1),\"\\\\bGlobal\\\\b\",\"i\") "
+            + "|| regex(str(?tmp1),\"\\\\bToy\\\\b\",\"i\")"));
+
+        //also test for pattern type WILDCARD
+        query = SparqlFieldQueryFactory.getInstance().createFieldQuery();
+        query.setConstraint("urn:field4", new TextConstraint(Arrays.asList("Glo?al","Toy"), PatternType.wildcard, false, "en", null));
+        queryString = SparqlQueryUtils.createSparqlSelectQuery(query, true, 0, SparqlEndpointTypeEnum.Standard);
+        Assert.assertTrue(queryString.contains("regex(str(?tmp1),\"\\\\bGlo.al\\\\b\",\"i\") "
+            + "|| regex(str(?tmp1),\"\\\\bToy\\\\b\",\"i\")"));
+
+    }
 
 }