You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2011/09/26 18:03:52 UTC
svn commit: r1175921 - in /incubator/stanbol/trunk:
commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/
enhancer/engines/keywordextraction/src/main/resources/OSGI-INF/metatype/
entityhub/query/clerezza/src/main/java/org/apache/stanbol/ent...
Author: rwesten
Date: Mon Sep 26 16:03:51 2011
New Revision: 1175921
URL: http://svn.apache.org/viewvc?rev=1175921&view=rev
Log:
Fixes STANBOL-330: Full Text Search tokens that do not contain at least one alphanumeric character are now ignored
other changes:
* Optimised POS tag set for Swedish
* added "default Language" property information to the metatype.properties of the KeywordLinkingEngine
Modified:
incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/resources/OSGI-INF/metatype/metatype.properties
incubator/stanbol/trunk/entityhub/query/clerezza/src/main/java/org/apache/stanbol/entityhub/query/clerezza/SparqlQueryUtils.java
incubator/stanbol/trunk/entityhub/query/clerezza/src/test/java/org/apache/stanbol/entityhub/query/clerezza/SparqlQueryUtilsTest.java
Modified: incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java?rev=1175921&r1=1175920&r2=1175921&view=diff
==============================================================================
--- incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java (original)
+++ incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java Mon Sep 26 16:03:51 2011
@@ -141,10 +141,11 @@ public enum PosTagsCollectionEnum {
* NOTE: <ul>
* <li> This includes all typical noun categories as defined by MAMBA
* <li> Unclassifiable part-of-speech and
- * <li> Numerical ("RO" and "EN")
+ * <li> Numerical "RO"
+ * <li> EN is excluded
* </ul>
*/
- SV_NOUN("sv",PosTypeCollectionType.NOUN,"NN","PN","AN","MN","VN","XX","EN","RO"),
+ SV_NOUN("sv",PosTypeCollectionType.NOUN,"NN","PN","AN","MN","VN","XX","RO"),
/**
* POS types for Verbs of the Swedish language based on the
* <a href="http://w3.msi.vxu.se/users/nivre/research/MAMBAlex.html">
Modified: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/resources/OSGI-INF/metatype/metatype.properties
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1175921&r1=1175920&r2=1175921&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/resources/OSGI-INF/metatype/metatype.properties (original)
+++ incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/resources/OSGI-INF/metatype/metatype.properties Mon Sep 26 16:03:51 2011
@@ -54,3 +54,6 @@ org.apache.stanbol.enhancer.engines.keyw
org.apache.stanbol.enhancer.engines.keywordextraction.processedLanguages.name=Languages
org.apache.stanbol.enhancer.engines.keywordextraction.processedLanguages.description=Languages to process. An empty text indicates that all languages are processed. Use ',' as separator for languages (e.g. 'en,de' to enhance only English and German texts).
+
+org.apache.stanbol.enhancer.engines.keywordextraction.defaultMatchingLanguage.name=Default Matching Language
+org.apache.stanbol.enhancer.engines.keywordextraction.defaultMatchingLanguage.description=The language used in addition to the language detected for the analysed text to search for Entities. Typically this configuration is an empty string to search for labels without any language defined, but for some data sets (such as DBpedia.org) that add languages to any labels it might improve results to change this configuration (e.g. to 'en' in the case of DBpedia.org).
Modified: incubator/stanbol/trunk/entityhub/query/clerezza/src/main/java/org/apache/stanbol/entityhub/query/clerezza/SparqlQueryUtils.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/query/clerezza/src/main/java/org/apache/stanbol/entityhub/query/clerezza/SparqlQueryUtils.java?rev=1175921&r1=1175920&r2=1175921&view=diff
==============================================================================
--- incubator/stanbol/trunk/entityhub/query/clerezza/src/main/java/org/apache/stanbol/entityhub/query/clerezza/SparqlQueryUtils.java (original)
+++ incubator/stanbol/trunk/entityhub/query/clerezza/src/main/java/org/apache/stanbol/entityhub/query/clerezza/SparqlQueryUtils.java Mon Sep 26 16:03:51 2011
@@ -32,6 +32,7 @@ import org.apache.clerezza.rdf.core.Trip
import org.apache.clerezza.rdf.core.TripleCollection;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.clerezza.rdf.core.impl.util.W3CDateFormat;
+import org.apache.commons.lang.StringUtils;
import org.apache.stanbol.entityhub.core.utils.AdaptingIterator;
import org.apache.stanbol.entityhub.model.clerezza.RdfRepresentation;
import org.apache.stanbol.entityhub.model.clerezza.RdfValueFactory;
@@ -687,7 +688,14 @@ public final class SparqlQueryUtils {
boolean firstAndWord = true;
for(String word : words){
word = word.trim();
- if(!word.isEmpty()){
+ boolean hasAlphaNumeric = false;
+ for(int i = 0; i < word.length() && !hasAlphaNumeric;i++){
+ char ch = word.charAt(i);
+ if(Character.isLetter(ch) || Character.isDigit(ch)){
+ hasAlphaNumeric = true;
+ }
+ }
+ if(hasAlphaNumeric){
if(firstAndWord){
firstAndWord = false;
} else {
Modified: incubator/stanbol/trunk/entityhub/query/clerezza/src/test/java/org/apache/stanbol/entityhub/query/clerezza/SparqlQueryUtilsTest.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/query/clerezza/src/test/java/org/apache/stanbol/entityhub/query/clerezza/SparqlQueryUtilsTest.java?rev=1175921&r1=1175920&r2=1175921&view=diff
==============================================================================
--- incubator/stanbol/trunk/entityhub/query/clerezza/src/test/java/org/apache/stanbol/entityhub/query/clerezza/SparqlQueryUtilsTest.java (original)
+++ incubator/stanbol/trunk/entityhub/query/clerezza/src/test/java/org/apache/stanbol/entityhub/query/clerezza/SparqlQueryUtilsTest.java Mon Sep 26 16:03:51 2011
@@ -12,14 +12,21 @@ public class SparqlQueryUtilsTest {
@Test
public void testCreateFullTextQueryString() {
List<String> keywords = Arrays.asList("test", "keyword");
- assertEquals("\"test\" OR \"keyword\"", SparqlQueryUtils.createFullTextQueryString(keywords));
+ assertEquals("\"test\" OR \"keyword\"",
+ SparqlQueryUtils.createFullTextQueryString(keywords));
keywords = Arrays.asList("test keyword");
- assertEquals("(\"test\" AND \"keyword\")", SparqlQueryUtils.createFullTextQueryString(keywords));
+ assertEquals("(\"test\" AND \"keyword\")",
+ SparqlQueryUtils.createFullTextQueryString(keywords));
keywords = Arrays.asList("'test' \"keyword\"");
assertEquals("(\"'test'\" AND \"\\\"keyword\\\"\")",
SparqlQueryUtils.createFullTextQueryString(keywords));
+
+ keywords = Arrays.asList("1 Alpha ? Numeric Test .");
+ assertEquals("(\"1\" AND \"Alpha\" AND \"Numeric\" AND \"Test\")",
+ SparqlQueryUtils.createFullTextQueryString(keywords));
+
}
}