You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2011/09/26 18:03:52 UTC

svn commit: r1175921 - in /incubator/stanbol/trunk: commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/ enhancer/engines/keywordextraction/src/main/resources/OSGI-INF/metatype/ entityhub/query/clerezza/src/main/java/org/apache/stanbol/ent...

Author: rwesten
Date: Mon Sep 26 16:03:51 2011
New Revision: 1175921

URL: http://svn.apache.org/viewvc?rev=1175921&view=rev
Log:
Fixes STANBOL-330: Full Text Search tokens that do not contain at least one alphanumeric character are now ignored

other changes:

* Optimised POS tag set for Swedish
* Added "Default Matching Language" property information to the metatype.properties of the KeywordLinkingEngine

Modified:
    incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java
    incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/resources/OSGI-INF/metatype/metatype.properties
    incubator/stanbol/trunk/entityhub/query/clerezza/src/main/java/org/apache/stanbol/entityhub/query/clerezza/SparqlQueryUtils.java
    incubator/stanbol/trunk/entityhub/query/clerezza/src/test/java/org/apache/stanbol/entityhub/query/clerezza/SparqlQueryUtilsTest.java

Modified: incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java?rev=1175921&r1=1175920&r2=1175921&view=diff
==============================================================================
--- incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java (original)
+++ incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java Mon Sep 26 16:03:51 2011
@@ -141,10 +141,11 @@ public enum PosTagsCollectionEnum {
      * NOTE: <ul>
      * <li> This includes all typical noun categories as defined by MAMBA
      * <li> Unclassifiable part-of-speech and
-     * <li> Numerical ("RO" and "EN") 
+     * <li> Numerical "RO"
+     * <li> EN is excluded 
      * </ul>
      */
-    SV_NOUN("sv",PosTypeCollectionType.NOUN,"NN","PN","AN","MN","VN","XX","EN","RO"),
+    SV_NOUN("sv",PosTypeCollectionType.NOUN,"NN","PN","AN","MN","VN","XX","RO"),
     /**
      * POS types for Verbs of the Swedish language based on the
      * <a href="http://w3.msi.vxu.se/users/nivre/research/MAMBAlex.html">

Modified: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/resources/OSGI-INF/metatype/metatype.properties
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1175921&r1=1175920&r2=1175921&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/resources/OSGI-INF/metatype/metatype.properties (original)
+++ incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/resources/OSGI-INF/metatype/metatype.properties Mon Sep 26 16:03:51 2011
@@ -54,3 +54,6 @@ org.apache.stanbol.enhancer.engines.keyw
 
 org.apache.stanbol.enhancer.engines.keywordextraction.processedLanguages.name=Languages
 org.apache.stanbol.enhancer.engines.keywordextraction.processedLanguages.description=Languages to process. An empty text indicates that all languages are processed. Use ',' as separator for languages (e.g. 'en,de' to enhance only English and German texts).
+
+org.apache.stanbol.enhancer.engines.keywordextraction.defaultMatchingLanguage.name=Default Matching Language
+org.apache.stanbol.enhancer.engines.keywordextraction.defaultMatchingLanguage.description=The language used in addition to the language detected for the analysed text to search for Entities. Typically this configuration is an empty string to search for labels without any language defined, but for some data sets (such as DBpedia.org) that add languages to any labels it might improve results to change this configuration (e.g. to 'en' in the case of DBpedia.org).

Modified: incubator/stanbol/trunk/entityhub/query/clerezza/src/main/java/org/apache/stanbol/entityhub/query/clerezza/SparqlQueryUtils.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/query/clerezza/src/main/java/org/apache/stanbol/entityhub/query/clerezza/SparqlQueryUtils.java?rev=1175921&r1=1175920&r2=1175921&view=diff
==============================================================================
--- incubator/stanbol/trunk/entityhub/query/clerezza/src/main/java/org/apache/stanbol/entityhub/query/clerezza/SparqlQueryUtils.java (original)
+++ incubator/stanbol/trunk/entityhub/query/clerezza/src/main/java/org/apache/stanbol/entityhub/query/clerezza/SparqlQueryUtils.java Mon Sep 26 16:03:51 2011
@@ -32,6 +32,7 @@ import org.apache.clerezza.rdf.core.Trip
 import org.apache.clerezza.rdf.core.TripleCollection;
 import org.apache.clerezza.rdf.core.UriRef;
 import org.apache.clerezza.rdf.core.impl.util.W3CDateFormat;
+import org.apache.commons.lang.StringUtils;
 import org.apache.stanbol.entityhub.core.utils.AdaptingIterator;
 import org.apache.stanbol.entityhub.model.clerezza.RdfRepresentation;
 import org.apache.stanbol.entityhub.model.clerezza.RdfValueFactory;
@@ -687,7 +688,14 @@ public final class SparqlQueryUtils {
                 boolean firstAndWord = true;
                 for(String word : words){
                     word = word.trim();
-                    if(!word.isEmpty()){
+                    boolean hasAlphaNumeric = false;
+                    for(int i = 0; i < word.length() && !hasAlphaNumeric;i++){
+                        char ch = word.charAt(i);
+                        if(Character.isLetter(ch) || Character.isDigit(ch)){
+                            hasAlphaNumeric = true;
+                        }
+                    }
+                    if(hasAlphaNumeric){
                         if(firstAndWord){
                             firstAndWord = false;
                         } else {

Modified: incubator/stanbol/trunk/entityhub/query/clerezza/src/test/java/org/apache/stanbol/entityhub/query/clerezza/SparqlQueryUtilsTest.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/query/clerezza/src/test/java/org/apache/stanbol/entityhub/query/clerezza/SparqlQueryUtilsTest.java?rev=1175921&r1=1175920&r2=1175921&view=diff
==============================================================================
--- incubator/stanbol/trunk/entityhub/query/clerezza/src/test/java/org/apache/stanbol/entityhub/query/clerezza/SparqlQueryUtilsTest.java (original)
+++ incubator/stanbol/trunk/entityhub/query/clerezza/src/test/java/org/apache/stanbol/entityhub/query/clerezza/SparqlQueryUtilsTest.java Mon Sep 26 16:03:51 2011
@@ -12,14 +12,21 @@ public class SparqlQueryUtilsTest {
     @Test
     public void testCreateFullTextQueryString() {
         List<String> keywords = Arrays.asList("test", "keyword");
-        assertEquals("\"test\" OR \"keyword\"", SparqlQueryUtils.createFullTextQueryString(keywords));
+        assertEquals("\"test\" OR \"keyword\"", 
+            SparqlQueryUtils.createFullTextQueryString(keywords));
 
         keywords = Arrays.asList("test keyword");
-        assertEquals("(\"test\" AND \"keyword\")", SparqlQueryUtils.createFullTextQueryString(keywords));
+        assertEquals("(\"test\" AND \"keyword\")", 
+            SparqlQueryUtils.createFullTextQueryString(keywords));
 
         keywords = Arrays.asList("'test' \"keyword\"");
         assertEquals("(\"'test'\" AND \"\\\"keyword\\\"\")",
             SparqlQueryUtils.createFullTextQueryString(keywords));
+        
+        keywords = Arrays.asList("1 Alpha ? Numeric Test .");
+        assertEquals("(\"1\" AND \"Alpha\" AND \"Numeric\" AND \"Test\")",
+            SparqlQueryUtils.createFullTextQueryString(keywords));
+        
     }
 
 }