You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2011/06/30 14:00:31 UTC

svn commit: r1141496 - in /incubator/stanbol/trunk: commons/stanboltools/bundledatafileprovider/src/main/java/org/apache/stanbol/commons/stanboltools/datafileprovider/bundle/impl/ entityhub/indexing/dblp/src/main/resources/indexing/config/dblp/ entityh...

Author: rwesten
Date: Thu Jun 30 12:00:30 2011
New Revision: 1141496

URL: http://svn.apache.org/viewvc?rev=1141496&view=rev
Log:
### SolrYard:

* Added omitNorms="false" to all fieldTypes, fields and dynamicField definitions of the solr schema.xml that are used for text values. This is needed to correctly process boosts at indexing time.
* The DefaultDirectoryManager no longer sets the BundleSymbolicName when asking for Solr index archives via the DataFileProvider. This allows index archives to be loaded from locations other than the DataFileProviders provided by the SolrYard bundle and the MainDataFileProvider. This is needed, e.g., to load the default index for dbpedia from the defaultdata bundle by using the BundleDataFileProvider.
* The Classpath based DataFileProvider used by the SolrYard bundle now supports null as BundleSymbolicName
* some changes to the implementation for boosts in the SolrYard but no change in functionality

### Other changes

* removed unused directories from the dblp indexing tool
* updated the schema for indexing dbpedia to include the omitNorms="false" attributes as mentioned above
* fixed a bug with LIMIT in the SparqlQueryUtils

Removed:
    incubator/stanbol/trunk/entityhub/indexing/dblp/src/main/resources/indexing/config/dblp/
Modified:
    incubator/stanbol/trunk/commons/stanboltools/bundledatafileprovider/src/main/java/org/apache/stanbol/commons/stanboltools/datafileprovider/bundle/impl/BundleDataFileProvider.java
    incubator/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/schema.xml
    incubator/stanbol/trunk/entityhub/query/clerezza/src/main/java/org/apache/stanbol/entityhub/query/clerezza/SparqlQueryUtils.java
    incubator/stanbol/trunk/entityhub/yard/solr/src/main/java/org/apache/stanbol/entityhub/yard/solr/impl/ClassPathSolrIndexConfigProvider.java
    incubator/stanbol/trunk/entityhub/yard/solr/src/main/java/org/apache/stanbol/entityhub/yard/solr/impl/DefaultSolrDirectoryManager.java
    incubator/stanbol/trunk/entityhub/yard/solr/src/main/java/org/apache/stanbol/entityhub/yard/solr/impl/SolrYard.java
    incubator/stanbol/trunk/entityhub/yard/solr/src/main/resources/solr/core/default.solrindex.zip
    incubator/stanbol/trunk/entityhub/yard/solr/src/main/resources/solr/core/entityhub.solrindex.zip

Modified: incubator/stanbol/trunk/commons/stanboltools/bundledatafileprovider/src/main/java/org/apache/stanbol/commons/stanboltools/datafileprovider/bundle/impl/BundleDataFileProvider.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/commons/stanboltools/bundledatafileprovider/src/main/java/org/apache/stanbol/commons/stanboltools/datafileprovider/bundle/impl/BundleDataFileProvider.java?rev=1141496&r1=1141495&r2=1141496&view=diff
==============================================================================
--- incubator/stanbol/trunk/commons/stanboltools/bundledatafileprovider/src/main/java/org/apache/stanbol/commons/stanboltools/datafileprovider/bundle/impl/BundleDataFileProvider.java (original)
+++ incubator/stanbol/trunk/commons/stanboltools/bundledatafileprovider/src/main/java/org/apache/stanbol/commons/stanboltools/datafileprovider/bundle/impl/BundleDataFileProvider.java Thu Jun 30 12:00:30 2011
@@ -102,12 +102,12 @@ public class BundleDataFileProvider impl
         
         URL resource = null;
         Iterator<String> relativePathIterator = searchPaths.iterator();
-        while(resource == null){
+        while(resource == null && relativePathIterator.hasNext()){
             String path = relativePathIterator.next();
             final String resourceName = path != null ? path + filename : filename ;
             resource = bundle.getEntry(resourceName);
-            log.info("Resource {} found: {}", (resource == null ? "NOT" : ""), resourceName);
         }
+        log.info("Resource {} found: {}", (resource == null ? "NOT" : ""), filename);
         return resource != null ? resource.openStream() : null;
     }
     /**

Modified: incubator/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/schema.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/schema.xml?rev=1141496&r1=1141495&r2=1141496&view=diff
==============================================================================
--- incubator/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/schema.xml (original)
+++ incubator/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/schema.xml Thu Jun 30 12:00:30 2011
@@ -51,7 +51,7 @@
       unsupported data types ...
     -->
     <!-- The StrField type is not analyzed, but indexed/stored verbatim. -->
-    <fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>    
+    <fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="false"/>    
 
     <!-- 
       This can be used as alternative to "string" to enable case insensitive
@@ -151,7 +151,7 @@
         form phrase queries. For example, WordDelimiterFilter splitting text:pdp-11 will cause the parser
         to generate text:"pdp 11" rather than (text:PDP OR text:11).
         NOTE: autoGeneratePhraseQueries="true" tends to not work well for non whitespace delimited languages.
-    <fieldType name="text_en" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
+    <fieldType name="text_en" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true" omitNorms="false">
       <analyzer type="index">
         <tokenizer class="solr.WhitespaceTokenizerFactory"/>
         <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
@@ -177,7 +177,7 @@
          
          Less flexible matching, but less false matches.  Probably not ideal for product names,
          but may be good for SKUs.  Can insert dashes in the wrong place and still match. -->
-    <fieldType name="text_en_Tight" class="solr.TextField" positionIncrementGap="100" >
+    <fieldType name="text_en_Tight" class="solr.TextField" positionIncrementGap="100" omitNorms="false">
       <analyzer>
         <tokenizer class="solr.WhitespaceTokenizerFactory"/>
         <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
@@ -197,7 +197,7 @@
          The default for any language without a special field definition.
 
          A general unstemmed text field - good if one does not know the language of the field -->
-    <fieldType name="textgen" class="solr.TextField" positionIncrementGap="100">
+    <fieldType name="textgen" class="solr.TextField" positionIncrementGap="100" omitNorms="false">
       <analyzer type="index">
         <tokenizer class="solr.WhitespaceTokenizerFactory"/>
         <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
@@ -219,7 +219,7 @@
 	 leading wildcard queries.
      
      Not used 
-    <fieldType name="text_rev" class="solr.TextField" positionIncrementGap="100">
+    <fieldType name="text_rev" class="solr.TextField" positionIncrementGap="100" omitNorms="false">
       <analyzer type="index">
         <tokenizer class="solr.WhitespaceTokenizerFactory"/>
         <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
@@ -244,7 +244,7 @@
           - Add 0-9 to the regex patter to preserve numbers
          
       -->
-    <fieldType name="alphaOnlySort" class="solr.TextField" sortMissingLast="true" omitNorms="true">
+    <fieldType name="alphaOnlySort" class="solr.TextField" sortMissingLast="true" omitNorms="false">
       <analyzer>
         <!-- KeywordTokenizer does not tokenize -->
         <tokenizer class="solr.KeywordTokenizerFactory"/>
@@ -262,7 +262,7 @@
       </analyzer>
     </fieldtype> -->
 
-    <fieldType name="text_path" class="solr.TextField" positionIncrementGap="100">
+    <fieldType name="text_path" class="solr.TextField" positionIncrementGap="100" omitNorms="false">
       <analyzer>
         <tokenizer class="solr.PathHierarchyTokenizerFactory"/>
       </analyzer>
@@ -325,10 +325,10 @@
    <field name="cal/dbp-ont:deathDate/" type="tdate" indexed="true" stored="true" multiValued="true"/>
 
    <!-- Do index, but not store abstracts -->
-   <field name="@en/dbp-ont:abstract/"  type="text_en_Tight"  indexed="true" stored="false" multiValued="true" termVectors="true"/>
-   <field name="@de/dbp-ont:abstract/"  type="textgen"  indexed="true" stored="false" multiValued="true" termVectors="true"/>
-   <field name="@it/dbp-ont:abstract/"  type="textgen"  indexed="true" stored="false" multiValued="true" termVectors="true"/>
-   <field name="@fr/dbp-ont:abstract/"  type="textgen"  indexed="true" stored="false" multiValued="true" termVectors="true"/>
+   <field name="@en/dbp-ont:abstract/"  type="text_en_Tight"  indexed="true" stored="false" multiValued="true" omitNorms="false" termVectors="true"/>
+   <field name="@de/dbp-ont:abstract/"  type="textgen"  indexed="true" stored="false" multiValued="true" omitNorms="false" termVectors="true"/>
+   <field name="@it/dbp-ont:abstract/"  type="textgen"  indexed="true" stored="false" multiValued="true" omitNorms="false" termVectors="true"/>
+   <field name="@fr/dbp-ont:abstract/"  type="textgen"  indexed="true" stored="false" multiValued="true" omitNorms="false" termVectors="true"/>
 
 
    <!-- 
@@ -367,7 +367,7 @@
      To support case insensitive searches in such fields change 
      the type to "lowercase"
    -->
-   <dynamicField name="str/*"  type="string"  indexed="true"  stored="true" multiValued="true"/>
+   <dynamicField name="str/*"  type="string"  indexed="true"  stored="true" multiValued="true" omitNorms="false"/>
    <!-- 
      references are values that represent IDs of other resources.
      Typically this will store URIs but in principle also other IDs
@@ -402,12 +402,12 @@
      Dynamic field for english languages.
      Note that the prefix "@en*" matches also "@en-GB" and "@en-US"
    -->
-   <dynamicField name="@en*"  type="text_en_Tight" indexed="true" stored="true" multiValued="true"/>
+   <dynamicField name="@en*"  type="text_en_Tight" indexed="true" stored="true" multiValued="true" omitNorms="false"/>
    <!-- 
      The "@*" catches all the other languages including "@/" 
      (default language) used for texts without a defined language
    -->
-   <dynamicField name="@*"  type="textgen"  indexed="true"  stored="true" multiValued="true"/>
+   <dynamicField name="@*"  type="textgen"  indexed="true"  stored="true" multiValued="true" omitNorms="false"/>
 
    <!--
      To add special configurations for specific fields one
@@ -450,8 +450,7 @@
      This field need not to be stored. The type can be changed to alternatives
      as described in the types section of this configuration.
    -->
-   <dynamicField name="_!@*"  type="textgen"  indexed="true"  stored="false"
-     multiValued="true" />
+   <dynamicField name="_!@*"  type="textgen"  indexed="true"  stored="false" multiValued="true" omitNorms="false"/>
    <!-- 
      fields starting with "_config/" are used to store configurations about how the
      index was created within the index (e.g. used namespace prefixes).

Modified: incubator/stanbol/trunk/entityhub/query/clerezza/src/main/java/org/apache/stanbol/entityhub/query/clerezza/SparqlQueryUtils.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/query/clerezza/src/main/java/org/apache/stanbol/entityhub/query/clerezza/SparqlQueryUtils.java?rev=1141496&r1=1141495&r2=1141496&view=diff
==============================================================================
--- incubator/stanbol/trunk/entityhub/query/clerezza/src/main/java/org/apache/stanbol/entityhub/query/clerezza/SparqlQueryUtils.java (original)
+++ incubator/stanbol/trunk/entityhub/query/clerezza/src/main/java/org/apache/stanbol/entityhub/query/clerezza/SparqlQueryUtils.java Thu Jun 30 12:00:30 2011
@@ -347,8 +347,8 @@ public final class SparqlQueryUtils {
      * @param query
      * @param queryString
      */
-    private static void addLimit(int limit, final StringBuilder queryString) {
-        if(limit > 0){
+    private static void addLimit(Integer limit, final StringBuilder queryString) {
+        if(limit != null && limit > 0){
             queryString.append(String.format("LIMIT %d \n", limit));
         }
     }

Modified: incubator/stanbol/trunk/entityhub/yard/solr/src/main/java/org/apache/stanbol/entityhub/yard/solr/impl/ClassPathSolrIndexConfigProvider.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/yard/solr/src/main/java/org/apache/stanbol/entityhub/yard/solr/impl/ClassPathSolrIndexConfigProvider.java?rev=1141496&r1=1141495&r2=1141496&view=diff
==============================================================================
--- incubator/stanbol/trunk/entityhub/yard/solr/src/main/java/org/apache/stanbol/entityhub/yard/solr/impl/ClassPathSolrIndexConfigProvider.java (original)
+++ incubator/stanbol/trunk/entityhub/yard/solr/src/main/java/org/apache/stanbol/entityhub/yard/solr/impl/ClassPathSolrIndexConfigProvider.java Thu Jun 30 12:00:30 2011
@@ -31,9 +31,9 @@ public class ClassPathSolrIndexConfigPro
     public InputStream getInputStream(String bundleSymbolicName,
             String filename, Map<String, String> comments) 
     throws IOException {
-        //if the symbolicName is null accept any request
+        //if the parsed bundleSymbolicName is null accept any request
         //if not, than check if the request is from the correct bundle.
-        if(symbolicName != null && !symbolicName.equals(bundleSymbolicName)) {
+        if(bundleSymbolicName != null && !bundleSymbolicName.equals(bundleSymbolicName)) {
             log.debug("Requested bundleSymbolicName {} does not match mine ({}), request ignored",
                     bundleSymbolicName, symbolicName);
             return null;

Modified: incubator/stanbol/trunk/entityhub/yard/solr/src/main/java/org/apache/stanbol/entityhub/yard/solr/impl/DefaultSolrDirectoryManager.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/yard/solr/src/main/java/org/apache/stanbol/entityhub/yard/solr/impl/DefaultSolrDirectoryManager.java?rev=1141496&r1=1141495&r2=1141496&view=diff
==============================================================================
--- incubator/stanbol/trunk/entityhub/yard/solr/src/main/java/org/apache/stanbol/entityhub/yard/solr/impl/DefaultSolrDirectoryManager.java (original)
+++ incubator/stanbol/trunk/entityhub/yard/solr/src/main/java/org/apache/stanbol/entityhub/yard/solr/impl/DefaultSolrDirectoryManager.java Thu Jun 30 12:00:30 2011
@@ -273,7 +273,9 @@ public class DefaultSolrDirectoryManager
                             + UNINITIALISED_INDEX_ARCHIVE_NAME_KEY + " property!");
         }
         propMap.remove(UNINITIALISED_INDEX_ARCHIVE_NAME_KEY);// do not parse this internal property
-        InputStream is = getDataFileProvider().getInputStream(symbolicName, archiveName, propMap);
+        //we need to parse null as bundleSymbolic name, because we will accept
+        //index data from any bundle!
+        InputStream is = getDataFileProvider().getInputStream(null, archiveName, propMap);
         return is == null ? null : ConfigUtils.getArchiveInputStream(archiveName, is);
     }
 

Modified: incubator/stanbol/trunk/entityhub/yard/solr/src/main/java/org/apache/stanbol/entityhub/yard/solr/impl/SolrYard.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/yard/solr/src/main/java/org/apache/stanbol/entityhub/yard/solr/impl/SolrYard.java?rev=1141496&r1=1141495&r2=1141496&view=diff
==============================================================================
--- incubator/stanbol/trunk/entityhub/yard/solr/src/main/java/org/apache/stanbol/entityhub/yard/solr/impl/SolrYard.java (original)
+++ incubator/stanbol/trunk/entityhub/yard/solr/src/main/java/org/apache/stanbol/entityhub/yard/solr/impl/SolrYard.java Thu Jun 30 12:00:30 2011
@@ -114,11 +114,12 @@ import org.slf4j.LoggerFactory;
  * @author Rupert Westenthaler
  * 
  */
-@Component(metatype = true, configurationFactory = true, policy = ConfigurationPolicy.REQUIRE, // the ID and
-// SOLR_SERVER_LOCATION
-// are
-// required!
-specVersion = "1.1")
+@Component(
+    metatype = true, 
+    immediate = true,
+    configurationFactory = true, 
+    policy = ConfigurationPolicy.REQUIRE,
+    specVersion = "1.1")
 @Service
 @Properties(
     value = {
@@ -861,16 +862,30 @@ public class SolrYard extends AbstractYa
         } // else we need to do nothing
         inputDocument.addField(fieldMapper.getDocumentIdField(), representation.getId());
         // first process the document boost
-        float documentBoost = documentBoostFieldName == null ? 1.0f : getDocumentBoost(representation);
+        Float documentBoost = getDocumentBoost(representation);
+        //NOTE: Do not use DocumentBoost, because FieldBoost will override
+        //      document boosts and are not multiplied with with document boosts
+//        if(documentBoost != null){
+//            inputDocument.setDocumentBoost(documentBoost);
+//        }
         for (Iterator<String> fields = representation.getFieldNames(); fields.hasNext();) {
             // TODO: maybe add some functionality to prevent indexing of the
             // field configured as documentBoostFieldName!
             // But this would also prevent the possibility to intentionally
             // override the boost.
             String field = fields.next();
+            float boost;
             Float fieldBoost = fieldBoostMap == null ? null : fieldBoostMap.get(field);
-            float boost = fieldBoost == null ? documentBoost : fieldBoost >= 0 ? fieldBoost * documentBoost
-                    : documentBoost;
+            if(documentBoost != null){
+                boost = documentBoost;
+                if(fieldBoost != null){
+                    boost = boost*fieldBoost;
+                }
+            } else if(fieldBoost != null){
+                boost = fieldBoost;
+            } else {
+                boost = -1;
+            }
             for (Iterator<Object> values = representation.get(field); values.hasNext();) {
                 // now we need to get the indexField for the value
                 Object next = values.next();
@@ -878,7 +893,11 @@ public class SolrYard extends AbstractYa
                 try {
                     value = indexValueFactory.createIndexValue(next);
                     for (String fieldName : fieldMapper.getFieldNames(Arrays.asList(field), value)) {
-                        inputDocument.addField(fieldName, value.getValue(), boost);
+                        if(boost > 0){
+                            inputDocument.addField(fieldName, value.getValue(), boost);
+                        } else {
+                            inputDocument.addField(fieldName, value.getValue());
+                        }
                     }
                 } catch (Exception e) {
                     log.warn(
@@ -897,13 +916,13 @@ public class SolrYard extends AbstractYa
      *            the representation
      * @return the Boost or <code>null</code> if not found or lower equals zero
      */
-    private float getDocumentBoost(Representation representation) {
+    private Float getDocumentBoost(Representation representation) {
         if (documentBoostFieldName == null) {
-            return 1.0f;
+            return null;
         }
         Float documentBoost = null;
-        for (Iterator<Object> values = representation.get(documentBoostFieldName); values.hasNext()
-                                                                                   && documentBoost == null;) {
+        for (Iterator<Object> values = representation.get(documentBoostFieldName); 
+                values.hasNext() && documentBoost == null;) {
             Object value = values.next();
             if (value instanceof Float) {
                 documentBoost = (Float) value;
@@ -918,7 +937,7 @@ public class SolrYard extends AbstractYa
                 }
             }
         }
-        return documentBoost == null ? 1.0f : documentBoost >= 0 ? documentBoost : 1.0f;
+        return documentBoost == null ? null : documentBoost >= 0 ? documentBoost : null;
     }
 
     @Override

Modified: incubator/stanbol/trunk/entityhub/yard/solr/src/main/resources/solr/core/default.solrindex.zip
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/yard/solr/src/main/resources/solr/core/default.solrindex.zip?rev=1141496&r1=1141495&r2=1141496&view=diff
==============================================================================
Binary files - no diff available.

Modified: incubator/stanbol/trunk/entityhub/yard/solr/src/main/resources/solr/core/entityhub.solrindex.zip
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/yard/solr/src/main/resources/solr/core/entityhub.solrindex.zip?rev=1141496&r1=1141495&r2=1141496&view=diff
==============================================================================
Binary files - no diff available.