You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-user@lucene.apache.org by David Thibault <dt...@esperion.com> on 2010/07/26 21:00:36 UTC

Solr 3.1 and ExtractingRequestHandler resulting in blank content

Hello all,

I’m working on a project with Solr.  I had 1.4.1 working OK using ExtractingRequestHandler except that it was crashing on some PDFs.  I noticed that Tika bundled with 1.4.1 was 0.4, which was kind of old.  I decided to try updating to 0.7 as per the directions here: http://wiki.apache.org/solr/ExtractingRequestHandler  but it was giving me errors (I forget what they were specifically).

Then I tried downloading Solr 3.1 from the source repository, which I noticed came with Tika 0.7.  I figured this would be an easier route to get working.  Now I’m testing with 3.1 and 0.7 and I’m noticing my documents are going into Solr OK, but they all have blank content (no document text stored in Solr).  I did see that the default “text” field is not stored. Changing that to stored=true didn’t help.  Changing to fmap.content=attr_content&uprefix=attr_content didn’t help either.  I have attached all relevant info here.  Please let me know if someone sees something I don’t (it’s entirely possible as I’m relatively new to Solr).

Schema.xml:
<?xml version="1.0" encoding="UTF-8" ?>
<schema name="example" version="1.3">
  <types>
    <fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
    <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true" omitNorms="true"/>
    <!-- Field type backed by solr.BinaryField. -->
    <fieldType name="binary" class="solr.BinaryField"/>
    <fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
    <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
    <fieldType name="long" class="solr.TrieLongField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
    <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
    <fieldType name="tint" class="solr.TrieIntField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
    <fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
    <fieldType name="tlong" class="solr.TrieLongField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
    <fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
    <fieldType name="date" class="solr.TrieDateField" omitNorms="true" precisionStep="0" positionIncrementGap="0"/>
    <fieldType name="tdate" class="solr.TrieDateField" omitNorms="true" precisionStep="6" positionIncrementGap="0"/>
    <fieldType name="pint" class="solr.IntField" omitNorms="true"/>
    <fieldType name="plong" class="solr.LongField" omitNorms="true"/>
    <fieldType name="pfloat" class="solr.FloatField" omitNorms="true"/>
    <fieldType name="pdouble" class="solr.DoubleField" omitNorms="true"/>
    <fieldType name="pdate" class="solr.DateField" sortMissingLast="true" omitNorms="true"/>
    <fieldType name="sint" class="solr.SortableIntField" sortMissingLast="true" omitNorms="true"/>
    <fieldType name="slong" class="solr.SortableLongField" sortMissingLast="true" omitNorms="true"/>
    <fieldType name="sfloat" class="solr.SortableFloatField" sortMissingLast="true" omitNorms="true"/>
    <fieldType name="sdouble" class="solr.SortableDoubleField" sortMissingLast="true" omitNorms="true"/>
    <fieldType name="random" class="solr.RandomSortField" indexed="true" />
    <fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100">
      <analyzer>
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
      </analyzer>
    </fieldType>
    <fieldType name="text" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
      <analyzer type="index">
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.StopFilterFactory"
                ignoreCase="true"
                words="stopwords.txt"
                enablePositionIncrements="true"
                />
        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
        <filter class="solr.PorterStemFilterFactory"/>
      </analyzer>
      <analyzer type="query">
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
        <filter class="solr.StopFilterFactory"
                ignoreCase="true"
                words="stopwords.txt"
                enablePositionIncrements="true"
                />
        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
        <filter class="solr.PorterStemFilterFactory"/>
      </analyzer>
    </fieldType>
    <fieldType name="textTight" class="solr.TextField" positionIncrementGap="100" >
      <analyzer>
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
        <filter class="solr.PorterStemFilterFactory"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
      </analyzer>
    </fieldType>
    <fieldType name="textgen" class="solr.TextField" positionIncrementGap="100">
      <analyzer type="index">
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="0"/>
        <filter class="solr.LowerCaseFilterFactory"/>
      </analyzer>
      <analyzer type="query">
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
        <filter class="solr.StopFilterFactory"
                ignoreCase="true"
                words="stopwords.txt"
                enablePositionIncrements="true"
                />
        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/>
        <filter class="solr.LowerCaseFilterFactory"/>
      </analyzer>
    </fieldType>
    <fieldType name="text_rev" class="solr.TextField" positionIncrementGap="100">
      <analyzer type="index">
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="0"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.ReversedWildcardFilterFactory" withOriginal="true"
           maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/>
      </analyzer>
      <analyzer type="query">
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
        <filter class="solr.StopFilterFactory"
                ignoreCase="true"
                words="stopwords.txt"
                enablePositionIncrements="true"
                />
        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/>
        <filter class="solr.LowerCaseFilterFactory"/>
      </analyzer>
    </fieldType>
    <fieldType name="alphaOnlySort" class="solr.TextField" sortMissingLast="true" omitNorms="true">
      <analyzer>
        <tokenizer class="solr.KeywordTokenizerFactory"/>
        <filter class="solr.LowerCaseFilterFactory" />
        <filter class="solr.TrimFilterFactory" />
        <filter class="solr.PatternReplaceFilterFactory"
                pattern="([^a-z])" replacement="" replace="all"
        />
      </analyzer>
    </fieldType>

    <!-- Phonetic matching: tokens from StandardTokenizer are passed through the
         Double Metaphone filter with inject="false". Indexed only, not stored. -->
    <fieldType name="phonetic" stored="false" indexed="true" class="solr.TextField" >
      <analyzer>
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/>
      </analyzer>
    </fieldType>

    <!-- Whitespace-tokenized text whose tokens carry a delimited payload,
         decoded by the filter's "float" encoder. Indexed only, not stored. -->
    <fieldType name="payloads" stored="false" indexed="true" class="solr.TextField" >
      <analyzer>
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.DelimitedPayloadTokenFilterFactory" encoder="float"/>
      </analyzer>
    </fieldType>
    <fieldType name="lowercase" class="solr.TextField" positionIncrementGap="100">
      <analyzer>
        <tokenizer class="solr.KeywordTokenizerFactory"/>
        <filter class="solr.LowerCaseFilterFactory" />
      </analyzer>
    </fieldType>
    <!-- Sink type: neither indexed nor stored, so values sent to it are dropped.
         Used by the ignored_* dynamicField. -->
    <fieldType name="ignored" stored="false" indexed="false" multiValued="true" class="solr.StrField" />
    <fieldType name="location" class="solr.PointType" dimension="2" subFieldSuffix="_d"/>
    <!-- Field type backed by solr.GeoHashField; used by the store_hash field. -->
    <fieldType name="geohash" class="solr.GeoHashField"/>
    <fieldType name="tile" class="solr.SpatialTileField" start="4" end="15" subFieldSuffix="_tiled"/>

 </types>


 <fields>
   <field name="id" type="string" indexed="true" stored="true" required="true" />
   <field name="sku" type="textTight" indexed="true" stored="true" omitNorms="true"/>
   <field name="name" type="textgen" indexed="true" stored="true"/>
   <field name="alphaNameSort" type="alphaOnlySort" indexed="true" stored="false"/>
   <field name="manu" type="textgen" indexed="true" stored="true" omitNorms="true"/>
   <field name="cat" type="text_ws" indexed="true" stored="true" multiValued="true" omitNorms="true" />
   <field name="features" type="text" indexed="true" stored="true" multiValued="true"/>
   <field name="includes" type="text" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" />

   <field name="weight" type="float" indexed="true" stored="true"/>
   <field name="price"  type="float" indexed="true" stored="true"/>
   <field name="popularity" type="int" indexed="true" stored="true" />
   <field name="inStock" type="boolean" indexed="true" stored="true" />

   <field name="store" type="location" indexed="true" stored="true"/>
   <field name="store_hash" type="geohash" indexed="true" stored="false"/>
   <field name="store_tiles" type="tile" indexed="true" stored="false"/>
   <field name="title" type="text" indexed="true" stored="true" multiValued="true"/>
   <field name="subject" type="text" indexed="true" stored="true"/>
   <field name="description" type="text" indexed="true" stored="true"/>
   <field name="comments" type="text" indexed="true" stored="true"/>
   <field name="author" type="textgen" indexed="true" stored="true"/>
   <field name="keywords" type="textgen" indexed="true" stored="true"/>
   <field name="category" type="textgen" indexed="true" stored="true"/>
   <field name="content_type" type="string" indexed="true" stored="true" multiValued="true"/>
   <field name="last_modified" type="date" indexed="true" stored="true"/>
   <field name="links" type="string" indexed="true" stored="true" multiValued="true"/>
   <field name="text" type="text" indexed="true" stored="false" multiValued="true"/>
   <field name="text_rev" type="text_rev" indexed="true" stored="false" multiValued="true"/>
   <field name="manu_exact" type="string" indexed="true" stored="false"/>

   <field name="payloads" type="payloads" indexed="true" stored="true"/>
   <dynamicField name="*_i"  type="int"    indexed="true"  stored="true"/>
   <dynamicField name="*_s"  type="string"  indexed="true"  stored="true"/>
   <dynamicField name="*_l"  type="long"   indexed="true"  stored="true"/>
   <dynamicField name="*_t"  type="text"    indexed="true"  stored="true"/>
   <dynamicField name="*_b"  type="boolean" indexed="true"  stored="true"/>
   <dynamicField name="*_f"  type="float"  indexed="true"  stored="true"/>
   <dynamicField name="*_d"  type="double" indexed="true"  stored="true"/>

   <dynamicField name="*_tiled"  type="double" indexed="true"  stored="false"/>

   <dynamicField name="*_dt" type="date"    indexed="true"  stored="true"/>
   <dynamicField name="*_p"  type="location" indexed="true" stored="true"/>

   <dynamicField name="*_ti" type="tint"    indexed="true"  stored="true"/>
   <dynamicField name="*_tl" type="tlong"   indexed="true"  stored="true"/>
   <dynamicField name="*_tf" type="tfloat"  indexed="true"  stored="true"/>
   <dynamicField name="*_td" type="tdouble" indexed="true"  stored="true"/>
   <dynamicField name="*_tdt" type="tdate"  indexed="true"  stored="true"/>

   <dynamicField name="*_pi"  type="pint"    indexed="true"  stored="true"/>

   <dynamicField name="ignored_*" type="ignored" multiValued="true"/>
   <dynamicField name="attr_*" type="textgen" indexed="true" stored="true" multiValued="true"/>

   <dynamicField name="random_*" type="random" />
 </fields>
 <uniqueKey>id</uniqueKey>

 <defaultSearchField>text</defaultSearchField>

 <solrQueryParser defaultOperator="OR"/>

   <copyField source="cat" dest="text"/>
   <copyField source="store" dest="store_hash"/>
   <copyField source="store" dest="store_tiles"/>
   <copyField source="name" dest="text"/>
   <copyField source="manu" dest="text"/>
   <copyField source="features" dest="text"/>
   <copyField source="includes" dest="text"/>
   <copyField source="manu" dest="manu_exact"/>

</schema>

Solrconfig.xml:
<?xml version="1.0" encoding="UTF-8" ?>
<config>
  <abortOnConfigurationError>${solr.abortOnConfigurationError:true}</abortOnConfigurationError>

  <luceneMatchVersion>LUCENE_31</luceneMatchVersion>

  <lib dir="./contrib/extraction/lib" />
  <lib dir="./lib"/>
  <lib dir="./contrib/clustering/lib" />
  <dataDir>C:/Program Files/Apache Software Foundation/solr-3.1/data</dataDir>
  <directoryFactory name="DirectoryFactory" class="${solr.directoryFactory:solr.StandardDirectoryFactory}"/>
  <indexDefaults>
    <useCompoundFile>false</useCompoundFile>

    <mergeFactor>10</mergeFactor>
    <ramBufferSizeMB>32</ramBufferSizeMB>
    <maxFieldLength>10000</maxFieldLength>
    <writeLockTimeout>1000</writeLockTimeout>
    <commitLockTimeout>10000</commitLockTimeout>
    <lockType>native</lockType>

  </indexDefaults>

  <mainIndex>

    <useCompoundFile>false</useCompoundFile>
    <ramBufferSizeMB>32</ramBufferSizeMB>
    <mergeFactor>10</mergeFactor>

    <unlockOnStartup>false</unlockOnStartup>

    <reopenReaders>true</reopenReaders>


    <deletionPolicy class="solr.SolrDeletionPolicy">
      <str name="maxCommitsToKeep">1</str>
      <str name="maxOptimizedCommitsToKeep">0</str>

    </deletionPolicy>

     <infoStream file="INFOSTREAM.txt">false</infoStream>

  </mainIndex>

  <jmx />

   <updateHandler class="solr.DirectUpdateHandler2">

  </updateHandler>



  <query>
    <maxBooleanClauses>1024</maxBooleanClauses>
    <filterCache
      class="solr.FastLRUCache"
      size="512"
      initialSize="512"
      autowarmCount="0"/>
    <queryResultCache
      class="solr.LRUCache"
      size="512"
      initialSize="512"
      autowarmCount="0"/>
    <documentCache
      class="solr.LRUCache"
      size="512"
      initialSize="512"
      autowarmCount="0"/>
    <enableLazyFieldLoading>true</enableLazyFieldLoading>
    <queryResultWindowSize>20</queryResultWindowSize>
    <queryResultMaxDocsCached>200</queryResultMaxDocsCached>
    <listener event="newSearcher" class="solr.QuerySenderListener">
      <arr name="queries">
      </arr>
    </listener>
    <listener event="firstSearcher" class="solr.QuerySenderListener">
      <arr name="queries">
        <lst> <str name="q">solr rocks</str><str name="start">0</str><str name="rows">10</str></lst>
        <lst><str name="q">static firstSearcher warming query from solrconfig.xml</str></lst>
      </arr>
    </listener>
    <useColdSearcher>false</useColdSearcher>
    <maxWarmingSearchers>2</maxWarmingSearchers>

  </query>
  <requestDispatcher handleSelect="true" >
    <requestParsers enableRemoteStreaming="false" multipartUploadLimitInKB="2048000" />
    <httpCaching lastModifiedFrom="openTime"
                 etagSeed="Solr">
    </httpCaching>
  </requestDispatcher>
  <requestHandler name="standard" class="solr.SearchHandler" default="true">
    <!-- default values for query parameters -->
     <lst name="defaults">
       <str name="echoParams">explicit</str>
       <!--
       <int name="rows">10</int>
       <str name="fl">*</str>
       <str name="version">2.1</str>
        -->
     </lst>
  </requestHandler>
  <requestHandler name="/browse" class="solr.SearchHandler">
     <lst name="defaults">
       <str name="wt">velocity</str>

       <str name="v.template">browse</str>
       <str name="v.layout">layout</str>
       <str name="title">Solritas</str>

       <str name="defType">dismax</str>
       <str name="q.alt">*:*</str>
       <str name="rows">10</str>
       <str name="fl">*,score</str>

       <str name="facet">on</str>
       <str name="facet.field">cat</str>
       <str name="facet.field">manu_exact</str>
       <str name="facet.mincount">1</str>
       <str name="qf">
          text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4
       </str>

       <str name="hl">on</str>
       <str name="hl.fl">text features name</str>
       <str name="f.name.hl.fragsize">0</str>
       <str name="f.name.hl.alternateField">name</str>
     </lst>
  </requestHandler>
  <requestHandler name="dismax" class="solr.SearchHandler" >
    <lst name="defaults">
     <str name="defType">dismax</str>
     <str name="echoParams">explicit</str>
     <float name="tie">0.01</float>
     <str name="qf">
        text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4
     </str>
     <str name="pf">
        text^0.2 features^1.1 name^1.5 manu^1.4 manu_exact^1.9
     </str>
     <str name="bf">
        popularity^0.5 recip(price,1,1000,1000)^0.3
     </str>
     <str name="fl">
        id,name,price,score
     </str>
     <str name="mm">
        2&lt;-1 5&lt;-2 6&lt;90%
     </str>
     <int name="ps">100</int>
     <str name="q.alt">*:*</str>
     <!-- example highlighter config, enable per-query with hl=true -->
     <str name="hl.fl">text features name</str>
     <!-- for this field, we want no fragmenting, just highlighting -->
     <str name="f.name.hl.fragsize">0</str>
     <!-- instructs Solr to return the field itself if no query terms are
          found -->
     <str name="f.name.hl.alternateField">name</str>
     <str name="f.text.hl.fragmenter">regex</str> <!-- defined below -->
    </lst>
  </requestHandler>
  <requestHandler name="partitioned" class="solr.SearchHandler" >
    <lst name="defaults">
     <str name="defType">dismax</str>
     <str name="echoParams">explicit</str>
     <str name="qf">text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0</str>
     <str name="mm">2&lt;-1 5&lt;-2 6&lt;90%</str>
     <!-- This is an example of using Date Math to specify a constantly
          moving date range in a config...
       -->
     <str name="bq">incubationdate_dt:[* TO NOW/DAY-1MONTH]^2.2</str>
    </lst>
    <lst name="appends">
      <str name="fq">inStock:true</str>
    </lst>
    <lst name="invariants">
      <str name="facet.field">cat</str>
      <str name="facet.field">manu_exact</str>
      <str name="facet.query">price:[* TO 500]</str>
      <str name="facet.query">price:[500 TO *]</str>
    </lst>
  </requestHandler>
  <!-- Spell-check component: one spellchecker, named "default", built from
       the "name" field. -->
  <searchComponent name="spellcheck" class="solr.SpellCheckComponent">

    <!-- NOTE(review): no fieldType named "textSpell" is declared in the
         accompanying schema.xml. Confirm which analyzer Solr falls back to,
         or point this at a fieldType the schema actually declares. -->
    <str name="queryAnalyzerFieldType">textSpell</str>

    <lst name="spellchecker">
      <str name="name">default</str>
      <str name="field">name</str>
      <!-- Location of the spelling index, given as a relative path. -->
      <str name="spellcheckIndexDir">./spellchecker</str>
    </lst>
  </searchComponent>
  <!-- Search handler that appends the "spellcheck" component after the
       standard components. Lazy loading is requested with startup="lazy",
       the same attribute used by the other lazily loaded handlers in this
       file. -->
  <requestHandler name="/spell" class="solr.SearchHandler" startup="lazy">
    <lst name="defaults">
      <str name="spellcheck.onlyMorePopular">false</str>
      <str name="spellcheck.extendedResults">false</str>
      <!-- return at most one suggestion per term -->
      <str name="spellcheck.count">1</str>
    </lst>
    <arr name="last-components">
      <str>spellcheck</str>
    </arr>
  </requestHandler>

  <searchComponent name="tvComponent" class="org.apache.solr.handler.component.TermVectorComponent"/>
  <requestHandler name="tvrh" class="org.apache.solr.handler.component.SearchHandler">
    <lst name="defaults">
      <bool name="tv">true</bool>
    </lst>
    <arr name="last-components">
      <str>tvComponent</str>
    </arr>
  </requestHandler>
  <searchComponent
    name="clusteringComponent"
    enable="${solr.clustering.enabled:false}"
    class="org.apache.solr.handler.clustering.ClusteringComponent" >
    <lst name="engine">
      <!-- The name, only one can be named "default" -->
      <str name="name">default</str>
      <str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str>
      <str name="LingoClusteringAlgorithm.desiredClusterCountBase">20</str>
    </lst>
    <lst name="engine">
      <str name="name">stc</str>
      <str name="carrot.algorithm">org.carrot2.clustering.stc.STCClusteringAlgorithm</str>
    </lst>
  </searchComponent>
  <requestHandler name="/clustering"
                  enable="${solr.clustering.enabled:false}"
                  class="solr.SearchHandler">
     <lst name="defaults">
       <bool name="clustering">true</bool>
       <str name="clustering.engine">default</str>
       <bool name="clustering.results">true</bool>
       <!-- The title field -->
       <str name="carrot.title">name</str>
       <str name="carrot.url">id</str>
       <!-- The field to cluster on -->
       <str name="carrot.snippet">features</str>
       <!-- produce summaries -->
       <bool name="carrot.produceSummary">true</bool>
       <!-- the maximum number of labels per cluster -->
       <!--<int name="carrot.numDescriptions">5</int>-->
       <!-- produce sub clusters -->
       <bool name="carrot.outputSubClusters">false</bool>
    </lst>
    <arr name="last-components">
      <str>clusteringComponent</str>
    </arr>
  </requestHandler>

  <!-- Solr Cell: extracts text and metadata from uploaded documents and
       indexes the result. Loaded lazily (startup="lazy"). The values below
       are only defaults; request parameters of the same name override them. -->
  <requestHandler name="/update/extract" class="org.apache.solr.handler.extraction.ExtractingRequestHandler" startup="lazy">
    <lst name="defaults">
      <!-- Extracted body text is mapped to the "text" field, which the schema
           declares with stored="false": searchable, but not returned in
           results. -->
      <str name="fmap.content">text</str>
      <!-- lower-case the extracted field names -->
      <str name="lowernames">true</str>
      <!-- Fields not declared in the schema are prefixed with "ignored_",
           matching the ignored_* dynamicField (neither indexed nor stored). -->
      <str name="uprefix">ignored_</str>

      <!-- capture link hrefs but ignore div attributes -->
      <str name="captureAttr">true</str>
      <str name="fmap.a">links</str>
      <str name="fmap.div">ignored_</str>
    </lst>
  </requestHandler>


  <searchComponent name="termsComponent" class="org.apache.solr.handler.component.TermsComponent"/>

  <requestHandler name="/terms" class="org.apache.solr.handler.component.SearchHandler">
     <lst name="defaults">
      <bool name="terms">true</bool>
    </lst>
    <arr name="components">
      <str>termsComponent</str>
    </arr>
  </requestHandler>
  <searchComponent name="elevator" class="solr.QueryElevationComponent" >
    <!-- pick a fieldType to analyze queries -->
    <str name="queryFieldType">string</str>
    <str name="config-file">elevate.xml</str>
  </searchComponent>

  <!-- a request handler utilizing the elevator component -->
  <requestHandler name="/elevate" class="solr.SearchHandler" startup="lazy">
    <lst name="defaults">
      <str name="echoParams">explicit</str>
    </lst>
    <arr name="last-components">
      <str>elevator</str>
    </arr>
  </requestHandler>
  <requestHandler name="/update" class="solr.XmlUpdateRequestHandler" />


  <requestHandler name="/update/javabin" class="solr.BinaryUpdateRequestHandler" />

  <requestHandler name="/analysis/document" class="solr.DocumentAnalysisRequestHandler" />
  <requestHandler name="/analysis/field" class="solr.FieldAnalysisRequestHandler" />
  <requestHandler name="/update/csv" class="solr.CSVRequestHandler" startup="lazy" />
  <requestHandler name="/admin/" class="org.apache.solr.handler.admin.AdminHandlers" />
  <requestHandler name="/admin/ping" class="PingRequestHandler">
    <lst name="defaults">
      <str name="qt">standard</str>
      <str name="q">solrpingquery</str>
      <str name="echoParams">all</str>
    </lst>
  </requestHandler>

  <requestHandler name="/debug/dump" class="solr.DumpRequestHandler" >
    <lst name="defaults">
     <str name="echoParams">explicit</str> <!-- for all params (including the default etc) use: 'all' -->
     <str name="echoHandler">true</str>
    </lst>
  </requestHandler>
  <searchComponent class="solr.HighlightComponent" name="highlight">
  <highlighting>
   <fragmenter name="gap" class="org.apache.solr.highlight.GapFragmenter" default="true">
    <lst name="defaults">
     <int name="hl.fragsize">100</int>
    </lst>
   </fragmenter>

   <fragmenter name="regex" class="org.apache.solr.highlight.RegexFragmenter">
    <lst name="defaults">
      <!-- slightly smaller fragsizes work better because of slop -->
      <int name="hl.fragsize">70</int>
      <!-- allow 50% slop on fragment sizes -->
      <float name="hl.regex.slop">0.5</float>
      <!-- a basic sentence pattern -->
      <str name="hl.regex.pattern">[-\w ,/\n\&quot;&apos;]{20,200}</str>
    </lst>
   </fragmenter>

   <formatter name="html" class="org.apache.solr.highlight.HtmlFormatter" default="true">
    <lst name="defaults">
     <str name="hl.simple.pre"><![CDATA[<em>]]></str>
     <str name="hl.simple.post"><![CDATA[</em>]]></str>
    </lst>
   </formatter>

   <fragListBuilder name="simple" class="org.apache.solr.highlight.SimpleFragListBuilder" default="true"/>

   <fragListBuilder name="single" class="org.apache.solr.highlight.SingleFragListBuilder"/>

   <fragmentsBuilder name="colored" class="org.apache.solr.highlight.MultiColoredScoreOrderFragmentsBuilder" default="true"/>
  </highlighting>
  </searchComponent>
  <queryResponseWriter name="xslt" class="solr.XSLTResponseWriter">
    <int name="xsltCacheLifetimeSeconds">5</int>
  </queryResponseWriter>
  <admin>
    <defaultQuery>solr</defaultQuery>
  </admin>

</config>

Test1.txt document:
Asdf
Asdf
Asdf
Adsf

Upload command:
curl "http://localhost:8080/solr/update/extract?literal.id=123&uprefix=attr_&fmap.content=attr_content&commit=true" -F "myfile=@test1.txt"

RESULTS from an id:[* TO *] query:
<response>
<lst name="responseHeader">
<int name="status">0</int>
<int name="QTime">91</int>
<lst name="params">
<str name="explainOther"/>
<str name="fl">*,score</str>
<str name="indent">on</str>
<str name="start">0</str>
<str name="q">id:[* TO *]</str>
<str name="hl.fl"/>
<str name="qt">standard</str>
<str name="wt">standard</str>
<str name="fq"/>
<str name="rows">10</str>
<str name="version">2.2</str>
</lst>
</lst>
<result name="response" numFound="1" start="0" maxScore="1.0">
<doc>
<float name="score">1.0</float>
<arr name="attr_content">
<str>        </str>
</arr>
<arr name="attr_stream_content_type">
<str>text/plain</str>
</arr>
<arr name="attr_stream_name">
<str>test1.txt</str>
</arr>
<arr name="attr_stream_size">
<str>24</str>
</arr>
<arr name="attr_stream_source_info">
<str>myfile</str>
</arr>
<arr name="content_type">
<str>text/plain</str>
</arr>
<str name="id">123</str>
</doc>
</result>
</response>

Note that the attr_content section of the response is blank.  Any help & hints would be GREATLY appreciated…=)

Best,
Dave

RE: Solr 3.1 and ExtractingRequestHandler resulting in blank content

Posted by David Thibault <dt...@esperion.com>.
If you don't store the content then you can't do highlighting, right?  Also, don't you just have to switch the text field to say stored="true" in your schema to store the text?  I don't understand why you're differentiating the behavior of ExtractingRequestHandler from the behavior of Solr in general.  Doesn't ExtractingRequestHandler just pull the text out of whatever file you send it and then the rest of the processing happens like any other Solr post?

The bug I was experiencing was the same one that someone else brought up on the list yesterday in the emails entitled "Extracting PDF text/comment/callout/typewriter boxes with Solr   CELL/Tika/PDFBox".  It ties back to this bug:
https://issues.apache.org/jira/browse/SOLR-1902?page=com.atlassian.jira.plugin.ext.subversion%3Asubversion-commits-tabpanel

I saw that email shortly after I sent this one to the list (it figures, doesn't it...=).

I tried doing what they suggested on that bug report (patching Solr 1.4.x and using Tika 0.8-SNAPSHOT), but the patches failed when I applied them to my Solr 1.4.1.  They have since added a patch for Solr 1.4.1, which I haven't tried yet.  However, I did get it working using Solr 4.0 out of trunk (which also uses Tika 0.8 and updated PDFBox jars).  I have yet to decide which will be more stable: Solr 4.0 or a patched Solr 1.4.1, both with updated PDFBox and Tika jars.

Best,
Dave

-----Original Message-----
From: Lance Norskog [mailto:goksron@gmail.com]
Sent: Tuesday, July 27, 2010 8:09 PM
To: solr-user@lucene.apache.org
Subject: Re: Solr 3.1 and ExtractingRequestHandler resulting in blank content

There are two different datasets that Solr (Lucene really) saves from
a document: raw storage and the indexed terms. I don't think the
ExtractingRequestHandler ever automatically stored the raw data; in
fact Lucene works in Strings internally, not raw byte arrays (this is
changing).

It should be indexed; that means if you search 'text' with a word from
the document, it will find those documents and bring back the file
name. Your app has to then use the file name.  Solr/Lucene is not
intended as a general-purpose content store, only an index.

The ERH wiki page doesn't quite say this. It describes what the ERH
does rather than what it does not do :)

On Mon, Jul 26, 2010 at 12:00 PM, David Thibault <dt...@esperion.com> wrote:
> Hello all,
>
> I’m working on a project with Solr.  I had 1.4.1 working OK using ExtractingRequestHandler except that it was crashing on some PDFs.  I noticed that Tika bundled with 1.4.1 was 0.4, which was kind of old.  I decided to try updating to 0.7 as per the directions here: http://wiki.apache.org/solr/ExtractingRequestHandler  but it was giving me errors (I forget what they were specifically).
>
> Then I tried downloading Solr 3.1 from the source repository, which I noticed came with Tika 0.7.  I figured this would be an easier route to get working.  Now I’m testing with 3.1 and 0.7 and I’m noticing my documents are going into Solr OK, but they all have blank content (no document text stored in Solr).  I did see that the default “text” field is not stored. Changing that to stored=true didn’t help.  Changing to fmap.content=attr_content&uprefix=attr_content didn’t help either.  I have attached all relevant info here.  Please let me know if someone sees something I don’t (it’s entirely possible as I’m relatively new to Solr).
>
> Schema.xml:
> <?xml version="1.0" encoding="UTF-8" ?>
> <schema name="example" version="1.3">
>  <types>
>    <fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
>    <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true" omitNorms="true"/>
>    <fieldtype name="binary" class="solr.BinaryField"/>
>    <fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
>    <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
>    <fieldType name="long" class="solr.TrieLongField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
>    <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
>    <fieldType name="tint" class="solr.TrieIntField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
>    <fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
>    <fieldType name="tlong" class="solr.TrieLongField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
>    <fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
>    <fieldType name="date" class="solr.TrieDateField" omitNorms="true" precisionStep="0" positionIncrementGap="0"/>
>    <fieldType name="tdate" class="solr.TrieDateField" omitNorms="true" precisionStep="6" positionIncrementGap="0"/>
>    <fieldType name="pint" class="solr.IntField" omitNorms="true"/>
>    <fieldType name="plong" class="solr.LongField" omitNorms="true"/>
>    <fieldType name="pfloat" class="solr.FloatField" omitNorms="true"/>
>    <fieldType name="pdouble" class="solr.DoubleField" omitNorms="true"/>
>    <fieldType name="pdate" class="solr.DateField" sortMissingLast="true" omitNorms="true"/>
>    <fieldType name="sint" class="solr.SortableIntField" sortMissingLast="true" omitNorms="true"/>
>    <fieldType name="slong" class="solr.SortableLongField" sortMissingLast="true" omitNorms="true"/>
>    <fieldType name="sfloat" class="solr.SortableFloatField" sortMissingLast="true" omitNorms="true"/>
>    <fieldType name="sdouble" class="solr.SortableDoubleField" sortMissingLast="true" omitNorms="true"/>
>    <fieldType name="random" class="solr.RandomSortField" indexed="true" />
>    <fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100">
>      <analyzer>
>        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
>      </analyzer>
>    </fieldType>
>    <fieldType name="text" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
>      <analyzer type="index">
>        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
>        <filter class="solr.StopFilterFactory"
>                ignoreCase="true"
>                words="stopwords.txt"
>                enablePositionIncrements="true"
>                />
>        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
>        <filter class="solr.LowerCaseFilterFactory"/>
>        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
>        <filter class="solr.PorterStemFilterFactory"/>
>      </analyzer>
>      <analyzer type="query">
>        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
>        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
>        <filter class="solr.StopFilterFactory"
>                ignoreCase="true"
>                words="stopwords.txt"
>                enablePositionIncrements="true"
>                />
>        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
>        <filter class="solr.LowerCaseFilterFactory"/>
>        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
>        <filter class="solr.PorterStemFilterFactory"/>
>      </analyzer>
>    </fieldType>
>    <fieldType name="textTight" class="solr.TextField" positionIncrementGap="100" >
>      <analyzer>
>        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
>        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
>        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
>        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
>        <filter class="solr.LowerCaseFilterFactory"/>
>        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
>        <filter class="solr.PorterStemFilterFactory"/>
>        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
>      </analyzer>
>    </fieldType>
>    <fieldType name="textgen" class="solr.TextField" positionIncrementGap="100">
>      <analyzer type="index">
>        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
>        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
>        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="0"/>
>        <filter class="solr.LowerCaseFilterFactory"/>
>      </analyzer>
>      <analyzer type="query">
>        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
>        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
>        <filter class="solr.StopFilterFactory"
>                ignoreCase="true"
>                words="stopwords.txt"
>                enablePositionIncrements="true"
>                />
>        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/>
>        <filter class="solr.LowerCaseFilterFactory"/>
>      </analyzer>
>    </fieldType>
>    <fieldType name="text_rev" class="solr.TextField" positionIncrementGap="100">
>      <analyzer type="index">
>        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
>        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
>        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="0"/>
>        <filter class="solr.LowerCaseFilterFactory"/>
>        <filter class="solr.ReversedWildcardFilterFactory" withOriginal="true"
>           maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/>
>      </analyzer>
>      <analyzer type="query">
>        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
>        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
>        <filter class="solr.StopFilterFactory"
>                ignoreCase="true"
>                words="stopwords.txt"
>                enablePositionIncrements="true"
>                />
>        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/>
>        <filter class="solr.LowerCaseFilterFactory"/>
>      </analyzer>
>    </fieldType>
>    <fieldType name="alphaOnlySort" class="solr.TextField" sortMissingLast="true" omitNorms="true">
>      <analyzer>
>        <tokenizer class="solr.KeywordTokenizerFactory"/>
>        <filter class="solr.LowerCaseFilterFactory" />
>        <filter class="solr.TrimFilterFactory" />
>        <filter class="solr.PatternReplaceFilterFactory"
>                pattern="([^a-z])" replacement="" replace="all"
>        />
>      </analyzer>
>    </fieldType>
>
>    <fieldtype name="phonetic" stored="false" indexed="true" class="solr.TextField" >
>      <analyzer>
>        <tokenizer class="solr.StandardTokenizerFactory"/>
>        <filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/>
>      </analyzer>
>    </fieldtype>
>
>    <fieldtype name="payloads" stored="false" indexed="true" class="solr.TextField" >
>      <analyzer>
>        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
>        <filter class="solr.DelimitedPayloadTokenFilterFactory" encoder="float"/>
>      </analyzer>
>    </fieldtype>
>    <fieldType name="lowercase" class="solr.TextField" positionIncrementGap="100">
>      <analyzer>
>        <tokenizer class="solr.KeywordTokenizerFactory"/>
>        <filter class="solr.LowerCaseFilterFactory" />
>      </analyzer>
>    </fieldType>
>    <fieldtype name="ignored" stored="false" indexed="false" multiValued="true" class="solr.StrField" />
>    <fieldType name="location" class="solr.PointType" dimension="2" subFieldSuffix="_d"/>
>    <fieldtype name="geohash" class="solr.GeoHashField"/>
>    <fieldType name="tile" class="solr.SpatialTileField" start="4" end="15" subFieldSuffix="_tiled"/>
>
>  </types>
>
>
>  <fields>
>   <field name="id" type="string" indexed="true" stored="true" required="true" />
>   <field name="sku" type="textTight" indexed="true" stored="true" omitNorms="true"/>
>   <field name="name" type="textgen" indexed="true" stored="true"/>
>   <field name="alphaNameSort" type="alphaOnlySort" indexed="true" stored="false"/>
>   <field name="manu" type="textgen" indexed="true" stored="true" omitNorms="true"/>
>   <field name="cat" type="text_ws" indexed="true" stored="true" multiValued="true" omitNorms="true" />
>   <field name="features" type="text" indexed="true" stored="true" multiValued="true"/>
>   <field name="includes" type="text" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" />
>
>   <field name="weight" type="float" indexed="true" stored="true"/>
>   <field name="price"  type="float" indexed="true" stored="true"/>
>   <field name="popularity" type="int" indexed="true" stored="true" />
>   <field name="inStock" type="boolean" indexed="true" stored="true" />
>
>   <field name="store" type="location" indexed="true" stored="true"/>
>   <field name="store_hash" type="geohash" indexed="true" stored="false"/>
>   <field name="store_tiles" type="tile" indexed="true" stored="false"/>
>   <field name="title" type="text" indexed="true" stored="true" multiValued="true"/>
>   <field name="subject" type="text" indexed="true" stored="true"/>
>   <field name="description" type="text" indexed="true" stored="true"/>
>   <field name="comments" type="text" indexed="true" stored="true"/>
>   <field name="author" type="textgen" indexed="true" stored="true"/>
>   <field name="keywords" type="textgen" indexed="true" stored="true"/>
>   <field name="category" type="textgen" indexed="true" stored="true"/>
>   <field name="content_type" type="string" indexed="true" stored="true" multiValued="true"/>
>   <field name="last_modified" type="date" indexed="true" stored="true"/>
>   <field name="links" type="string" indexed="true" stored="true" multiValued="true"/>
>   <field name="text" type="text" indexed="true" stored="false" multiValued="true"/>
>   <field name="text_rev" type="text_rev" indexed="true" stored="false" multiValued="true"/>
>   <field name="manu_exact" type="string" indexed="true" stored="false"/>
>
>   <field name="payloads" type="payloads" indexed="true" stored="true"/>
>   <dynamicField name="*_i"  type="int"    indexed="true"  stored="true"/>
>   <dynamicField name="*_s"  type="string"  indexed="true"  stored="true"/>
>   <dynamicField name="*_l"  type="long"   indexed="true"  stored="true"/>
>   <dynamicField name="*_t"  type="text"    indexed="true"  stored="true"/>
>   <dynamicField name="*_b"  type="boolean" indexed="true"  stored="true"/>
>   <dynamicField name="*_f"  type="float"  indexed="true"  stored="true"/>
>   <dynamicField name="*_d"  type="double" indexed="true"  stored="true"/>
>
>   <dynamicField name="*_tiled"  type="double" indexed="true"  stored="false"/>
>
>   <dynamicField name="*_dt" type="date"    indexed="true"  stored="true"/>
>   <dynamicField name="*_p"  type="location" indexed="true" stored="true"/>
>
>   <dynamicField name="*_ti" type="tint"    indexed="true"  stored="true"/>
>   <dynamicField name="*_tl" type="tlong"   indexed="true"  stored="true"/>
>   <dynamicField name="*_tf" type="tfloat"  indexed="true"  stored="true"/>
>   <dynamicField name="*_td" type="tdouble" indexed="true"  stored="true"/>
>   <dynamicField name="*_tdt" type="tdate"  indexed="true"  stored="true"/>
>
>   <dynamicField name="*_pi"  type="pint"    indexed="true"  stored="true"/>
>
>   <dynamicField name="ignored_*" type="ignored" multiValued="true"/>
>   <dynamicField name="attr_*" type="textgen" indexed="true" stored="true" multiValued="true"/>
>
>   <dynamicField name="random_*" type="random" />
>  </fields>
>  <uniqueKey>id</uniqueKey>
>
>  <defaultSearchField>text</defaultSearchField>
>
>  <solrQueryParser defaultOperator="OR"/>
>
>   <copyField source="cat" dest="text"/>
>   <copyField source="store" dest="store_hash"/>
>   <copyField source="store" dest="store_tiles"/>
>   <copyField source="name" dest="text"/>
>   <copyField source="manu" dest="text"/>
>   <copyField source="features" dest="text"/>
>   <copyField source="includes" dest="text"/>
>   <copyField source="manu" dest="manu_exact"/>
>
> </schema>
>
> Solrconfig.xml:
> <?xml version="1.0" encoding="UTF-8" ?>
> <config>
>  <abortOnConfigurationError>${solr.abortOnConfigurationError:true}</abortOnConfigurationError>
>
>  <luceneMatchVersion>LUCENE_31</luceneMatchVersion>
>
>  <lib dir="./contrib/extraction/lib" />
>  <lib dir="./lib"/>
>  <lib dir="./contrib/clustering/lib" />
>  <dataDir>C:/Program Files/Apache Software Foundation/solr-3.1/data</dataDir>
>  <directoryFactory name="DirectoryFactory" class="${solr.directoryFactory:solr.StandardDirectoryFactory}"/>
>  <indexDefaults>
>    <useCompoundFile>false</useCompoundFile>
>
>    <mergeFactor>10</mergeFactor>
>    <ramBufferSizeMB>32</ramBufferSizeMB>
>    <maxFieldLength>10000</maxFieldLength>
>    <writeLockTimeout>1000</writeLockTimeout>
>    <commitLockTimeout>10000</commitLockTimeout>
>    <lockType>native</lockType>
>
>  </indexDefaults>
>
>  <mainIndex>
>
>    <useCompoundFile>false</useCompoundFile>
>    <ramBufferSizeMB>32</ramBufferSizeMB>
>    <mergeFactor>10</mergeFactor>
>
>    <unlockOnStartup>false</unlockOnStartup>
>
>    <reopenReaders>true</reopenReaders>
>
>
>    <deletionPolicy class="solr.SolrDeletionPolicy">
>      <str name="maxCommitsToKeep">1</str>
>      <str name="maxOptimizedCommitsToKeep">0</str>
>
>    </deletionPolicy>
>
>     <infoStream file="INFOSTREAM.txt">false</infoStream>
>
>  </mainIndex>
>
>  <jmx />
>
>   <updateHandler class="solr.DirectUpdateHandler2">
>
>  </updateHandler>
>
>
>
>  <query>
>    <maxBooleanClauses>1024</maxBooleanClauses>
>    <filterCache
>      class="solr.FastLRUCache"
>      size="512"
>      initialSize="512"
>      autowarmCount="0"/>
>    <queryResultCache
>      class="solr.LRUCache"
>      size="512"
>      initialSize="512"
>      autowarmCount="0"/>
>    <documentCache
>      class="solr.LRUCache"
>      size="512"
>      initialSize="512"
>      autowarmCount="0"/>
>    <enableLazyFieldLoading>true</enableLazyFieldLoading>
>    <queryResultWindowSize>20</queryResultWindowSize>
>    <queryResultMaxDocsCached>200</queryResultMaxDocsCached>
>    <listener event="newSearcher" class="solr.QuerySenderListener">
>      <arr name="queries">
>      </arr>
>    </listener>
>    <listener event="firstSearcher" class="solr.QuerySenderListener">
>      <arr name="queries">
>        <lst> <str name="q">solr rocks</str><str name="start">0</str><str name="rows">10</str></lst>
>        <lst><str name="q">static firstSearcher warming query from solrconfig.xml</str></lst>
>      </arr>
>    </listener>
>    <useColdSearcher>false</useColdSearcher>
>    <maxWarmingSearchers>2</maxWarmingSearchers>
>
>  </query>
>  <requestDispatcher handleSelect="true" >
>    <requestParsers enableRemoteStreaming="false" multipartUploadLimitInKB="2048000" />
>    <httpCaching lastModifiedFrom="openTime"
>                 etagSeed="Solr">
>    </httpCaching>
>  </requestDispatcher>
>  <requestHandler name="standard" class="solr.SearchHandler" default="true">
>    <!-- default values for query parameters -->
>     <lst name="defaults">
>       <str name="echoParams">explicit</str>
>       <!--
>       <int name="rows">10</int>
>       <str name="fl">*</str>
>       <str name="version">2.1</str>
>        -->
>     </lst>
>  </requestHandler>
>  <requestHandler name="/browse" class="solr.SearchHandler">
>     <lst name="defaults">
>       <str name="wt">velocity</str>
>
>       <str name="v.template">browse</str>
>       <str name="v.layout">layout</str>
>       <str name="title">Solritas</str>
>
>       <str name="defType">dismax</str>
>       <str name="q.alt">*:*</str>
>       <str name="rows">10</str>
>       <str name="fl">*,score</str>
>
>       <str name="facet">on</str>
>       <str name="facet.field">cat</str>
>       <str name="facet.field">manu_exact</str>
>       <str name="facet.mincount">1</str>
>       <str name="qf">
>          text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4
>       </str>
>
>       <str name="hl">on</str>
>       <str name="hl.fl">text features name</str>
>       <str name="f.name.hl.fragsize">0</str>
>       <str name="f.name.hl.alternateField">name</str>
>     </lst>
>  </requestHandler>
>  <requestHandler name="dismax" class="solr.SearchHandler" >
>    <lst name="defaults">
>     <str name="defType">dismax</str>
>     <str name="echoParams">explicit</str>
>     <float name="tie">0.01</float>
>     <str name="qf">
>        text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4
>     </str>
>     <str name="pf">
>        text^0.2 features^1.1 name^1.5 manu^1.4 manu_exact^1.9
>     </str>
>     <str name="bf">
>        popularity^0.5 recip(price,1,1000,1000)^0.3
>     </str>
>     <str name="fl">
>        id,name,price,score
>     </str>
>     <str name="mm">
>        2&lt;-1 5&lt;-2 6&lt;90%
>     </str>
>     <int name="ps">100</int>
>     <str name="q.alt">*:*</str>
>     <!-- example highlighter config, enable per-query with hl=true -->
>     <str name="hl.fl">text features name</str>
>     <!-- for this field, we want no fragmenting, just highlighting -->
>     <str name="f.name.hl.fragsize">0</str>
>     <!-- instructs Solr to return the field itself if no query terms are
>          found -->
>     <str name="f.name.hl.alternateField">name</str>
>     <str name="f.text.hl.fragmenter">regex</str> <!-- defined below -->
>    </lst>
>  </requestHandler>
>  <requestHandler name="partitioned" class="solr.SearchHandler" >
>    <lst name="defaults">
>     <str name="defType">dismax</str>
>     <str name="echoParams">explicit</str>
>     <str name="qf">text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0</str>
>     <str name="mm">2&lt;-1 5&lt;-2 6&lt;90%</str>
>     <!-- This is an example of using Date Math to specify a constantly
>          moving date range in a config...
>       -->
>     <str name="bq">incubationdate_dt:[* TO NOW/DAY-1MONTH]^2.2</str>
>    </lst>
>    <lst name="appends">
>      <str name="fq">inStock:true</str>
>    </lst>
>    <lst name="invariants">
>      <str name="facet.field">cat</str>
>      <str name="facet.field">manu_exact</str>
>      <str name="facet.query">price:[* TO 500]</str>
>      <str name="facet.query">price:[500 TO *]</str>
>    </lst>
>  </requestHandler>
>  <searchComponent name="spellcheck" class="solr.SpellCheckComponent">
>
>    <str name="queryAnalyzerFieldType">textSpell</str>
>
>    <lst name="spellchecker">
>      <str name="name">default</str>
>      <str name="field">name</str>
>      <str name="spellcheckIndexDir">./spellchecker</str>
>    </lst>
>  </searchComponent>
>  <requestHandler name="/spell" class="solr.SearchHandler" lazy="true">
>    <lst name="defaults">
>      <str name="spellcheck.onlyMorePopular">false</str>
>      <str name="spellcheck.extendedResults">false</str>
>      <str name="spellcheck.count">1</str>
>    </lst>
>    <arr name="last-components">
>      <str>spellcheck</str>
>    </arr>
>  </requestHandler>
>
>  <searchComponent name="tvComponent" class="org.apache.solr.handler.component.TermVectorComponent"/>
>  <requestHandler name="tvrh" class="org.apache.solr.handler.component.SearchHandler">
>    <lst name="defaults">
>      <bool name="tv">true</bool>
>    </lst>
>    <arr name="last-components">
>      <str>tvComponent</str>
>    </arr>
>  </requestHandler>
>  <searchComponent
>    name="clusteringComponent"
>    enable="${solr.clustering.enabled:false}"
>    class="org.apache.solr.handler.clustering.ClusteringComponent" >
>    <lst name="engine">
>      <!-- The name, only one can be named "default" -->
>      <str name="name">default</str>
>      <str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str>
>      <str name="LingoClusteringAlgorithm.desiredClusterCountBase">20</str>
>    </lst>
>    <lst name="engine">
>      <str name="name">stc</str>
>      <str name="carrot.algorithm">org.carrot2.clustering.stc.STCClusteringAlgorithm</str>
>    </lst>
>  </searchComponent>
>  <requestHandler name="/clustering"
>                  enable="${solr.clustering.enabled:false}"
>                  class="solr.SearchHandler">
>     <lst name="defaults">
>       <bool name="clustering">true</bool>
>       <str name="clustering.engine">default</str>
>       <bool name="clustering.results">true</bool>
>       <!-- The title field -->
>       <str name="carrot.title">name</str>
>       <str name="carrot.url">id</str>
>       <!-- The field to cluster on -->
>       <str name="carrot.snippet">features</str>
>       <!-- produce summaries -->
>       <bool name="carrot.produceSummary">true</bool>
>       <!-- the maximum number of labels per cluster -->
>       <!--<int name="carrot.numDescriptions">5</int>-->
>       <!-- produce sub clusters -->
>       <bool name="carrot.outputSubClusters">false</bool>
>    </lst>
>    <arr name="last-components">
>      <str>clusteringComponent</str>
>    </arr>
>  </requestHandler>
>
>  <requestHandler name="/update/extract" class="org.apache.solr.handler.extraction.ExtractingRequestHandler" startup="lazy">
>    <lst name="defaults">
>      <str name="fmap.content">text</str>
>      <str name="lowernames">true</str>
>      <str name="uprefix">ignored_</str>
>
>      <!-- capture link hrefs but ignore div attributes -->
>      <str name="captureAttr">true</str>
>      <str name="fmap.a">links</str>
>      <str name="fmap.div">ignored_</str>
>    </lst>
>  </requestHandler>
>
>
>  <searchComponent name="termsComponent" class="org.apache.solr.handler.component.TermsComponent"/>
>
>  <requestHandler name="/terms" class="org.apache.solr.handler.component.SearchHandler">
>     <lst name="defaults">
>      <bool name="terms">true</bool>
>    </lst>
>    <arr name="components">
>      <str>termsComponent</str>
>    </arr>
>  </requestHandler>
>  <searchComponent name="elevator" class="solr.QueryElevationComponent" >
>    <!-- pick a fieldType to analyze queries -->
>    <str name="queryFieldType">string</str>
>    <str name="config-file">elevate.xml</str>
>  </searchComponent>
>
>  <!-- a request handler utilizing the elevator component -->
>  <requestHandler name="/elevate" class="solr.SearchHandler" startup="lazy">
>    <lst name="defaults">
>      <str name="echoParams">explicit</str>
>    </lst>
>    <arr name="last-components">
>      <str>elevator</str>
>    </arr>
>  </requestHandler>
>  <requestHandler name="/update" class="solr.XmlUpdateRequestHandler" />
>
>
>  <requestHandler name="/update/javabin" class="solr.BinaryUpdateRequestHandler" />
>
>  <requestHandler name="/analysis/document" class="solr.DocumentAnalysisRequestHandler" />
>  <requestHandler name="/analysis/field" class="solr.FieldAnalysisRequestHandler" />
>  <requestHandler name="/update/csv" class="solr.CSVRequestHandler" startup="lazy" />
>  <requestHandler name="/admin/" class="org.apache.solr.handler.admin.AdminHandlers" />
>  <requestHandler name="/admin/ping" class="PingRequestHandler">
>    <lst name="defaults">
>      <str name="qt">standard</str>
>      <str name="q">solrpingquery</str>
>      <str name="echoParams">all</str>
>    </lst>
>  </requestHandler>
>
>  <requestHandler name="/debug/dump" class="solr.DumpRequestHandler" >
>    <lst name="defaults">
>     <str name="echoParams">explicit</str> <!-- for all params (including the default etc) use: 'all' -->
>     <str name="echoHandler">true</str>
>    </lst>
>  </requestHandler>
>  <searchComponent class="solr.HighlightComponent" name="highlight">
>  <highlighting>
>   <fragmenter name="gap" class="org.apache.solr.highlight.GapFragmenter" default="true">
>    <lst name="defaults">
>     <int name="hl.fragsize">100</int>
>    </lst>
>   </fragmenter>
>
>   <fragmenter name="regex" class="org.apache.solr.highlight.RegexFragmenter">
>    <lst name="defaults">
>      <!-- slightly smaller fragsizes work better because of slop -->
>      <int name="hl.fragsize">70</int>
>      <!-- allow 50% slop on fragment sizes -->
>      <float name="hl.regex.slop">0.5</float>
>      <!-- a basic sentence pattern -->
>      <str name="hl.regex.pattern">[-\w ,/\n\&quot;&apos;]{20,200}</str>
>    </lst>
>   </fragmenter>
>
>   <formatter name="html" class="org.apache.solr.highlight.HtmlFormatter" default="true">
>    <lst name="defaults">
>     <str name="hl.simple.pre"><![CDATA[<em>]]></str>
>     <str name="hl.simple.post"><![CDATA[</em>]]></str>
>    </lst>
>   </formatter>
>
>   <fragListBuilder name="simple" class="org.apache.solr.highlight.SimpleFragListBuilder" default="true"/>
>
>   <fragListBuilder name="single" class="org.apache.solr.highlight.SingleFragListBuilder"/>
>
>   <fragmentsBuilder name="colored" class="org.apache.solr.highlight.MultiColoredScoreOrderFragmentsBuilder" default="true"/>
>  </highlighting>
>  </searchComponent>
>  <queryResponseWriter name="xslt" class="solr.XSLTResponseWriter">
>    <int name="xsltCacheLifetimeSeconds">5</int>
>  </queryResponseWriter>
>  <admin>
>    <defaultQuery>solr</defaultQuery>
>  </admin>
>
> </config>
>
> Test1.txt document:
> Asdf
> Asdf
> Asdf
> Adsf
>
> Upload command:
> curl "http://localhost:8080/solr/update/extract?literal.id=123&uprefix=attr_&fmap.content=attr_content&commit=true" -F "myfile=@test1.txt"
>
> RESULTS from an id:[* TO *] query:
> <response>
> −
> <lst name="responseHeader">
> <int name="status">0</int>
> <int name="QTime">91</int>
> −
> <lst name="params">
> <str name="explainOther"/>
> <str name="fl">*,score</str>
> <str name="indent">on</str>
> <str name="start">0</str>
> <str name="q">id:[* TO *]</str>
> <str name="hl.fl"/>
> <str name="qt">standard</str>
> <str name="wt">standard</str>
> <str name="fq"/>
> <str name="rows">10</str>
> <str name="version">2.2</str>
> </lst>
> </lst>
> −
> <result name="response" numFound="1" start="0" maxScore="1.0">
> −
> <doc>
> <float name="score">1.0</float>
> −
> <arr name="attr_content">
> <str>        </str>
> </arr>
> −
> <arr name="attr_stream_content_type">
> <str>text/plain</str>
> </arr>
> −
> <arr name="attr_stream_name">
> <str>test1.txt</str>
> </arr>
> −
> <arr name="attr_stream_size">
> <str>24</str>
> </arr>
> −
> <arr name="attr_stream_source_info">
> <str>myfile</str>
> </arr>
> −
> <arr name="content_type">
> <str>text/plain</str>
> </arr>
> <str name="id">123</str>
> </doc>
> </result>
> </response>
>
> Note that the attr_content section of the response is blank.  Any help & hints would be GREATLY appreciated…=)
>
> Best,
> Dave
>



--
Lance Norskog
goksron@gmail.com


Re: Solr 3.1 and ExtractingRequestHandler resulting in blank content

Posted by Lance Norskog <go...@gmail.com>.
There are two different datasets that Solr (Lucene really) saves from
a document: raw storage and the indexed terms. I don't think the
ExtractingRequestHandler ever automatically stored the raw data; in
fact Lucene works in Strings internally, not raw byte arrays (this is
changing).

It should be indexed - that means if you search the 'text' field with a word from
the document, it will find those documents and bring back the file
name. Your app has to then use the file name.  Solr/Lucene is not
intended as a general-purpose content store, only an index.

The ERH wiki page doesn't quite say this. It describes what the ERH
does rather than what it does not do :)

On Mon, Jul 26, 2010 at 12:00 PM, David Thibault <dt...@esperion.com> wrote:
> Hello all,
>
> I’m working on a project with Solr.  I had 1.4.1 working OK using ExtractingRequestHandler except that it was crashing on some PDFs.  I noticed that Tika bundled with 1.4.1 was 0.4, which was kind of old.  I decided to try updating to 0.7 as per the directions here: http://wiki.apache.org/solr/ExtractingRequestHandler  but it was giving me errors (I forget what they were specifically).
>
> Then I tried downloading Solr 3.1 from the source repository, which I noticed came with Tika 0.7.  I figured this would be an easier route to get working.  Now I’m testing with 3.1 and 0.7 and I’m noticing my documents are going into Solr OK, but they all have blank content (no document text stored in Solr).  I did see that the default “text” field is not stored. Changing that to stored=true didn’t help.  Changing to fmap.content=attr_content&uprefix=attr_content didn’t help either.  I have attached all relevant info here.  Please let me know if someone sees something I don’t (it’s entirely possible as I’m relatively new to Solr).
>
> Schema.xml:
> <?xml version="1.0" encoding="UTF-8" ?>
> <schema name="example" version="1.3">
>  <types>
>    <fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
>    <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true" omitNorms="true"/>
>    <fieldtype name="binary" class="solr.BinaryField"/>
>    <fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
>    <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
>    <fieldType name="long" class="solr.TrieLongField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
>    <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
>    <fieldType name="tint" class="solr.TrieIntField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
>    <fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
>    <fieldType name="tlong" class="solr.TrieLongField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
>    <fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
>    <fieldType name="date" class="solr.TrieDateField" omitNorms="true" precisionStep="0" positionIncrementGap="0"/>
>    <fieldType name="tdate" class="solr.TrieDateField" omitNorms="true" precisionStep="6" positionIncrementGap="0"/>
>    <fieldType name="pint" class="solr.IntField" omitNorms="true"/>
>    <fieldType name="plong" class="solr.LongField" omitNorms="true"/>
>    <fieldType name="pfloat" class="solr.FloatField" omitNorms="true"/>
>    <fieldType name="pdouble" class="solr.DoubleField" omitNorms="true"/>
>    <fieldType name="pdate" class="solr.DateField" sortMissingLast="true" omitNorms="true"/>
>    <fieldType name="sint" class="solr.SortableIntField" sortMissingLast="true" omitNorms="true"/>
>    <fieldType name="slong" class="solr.SortableLongField" sortMissingLast="true" omitNorms="true"/>
>    <fieldType name="sfloat" class="solr.SortableFloatField" sortMissingLast="true" omitNorms="true"/>
>    <fieldType name="sdouble" class="solr.SortableDoubleField" sortMissingLast="true" omitNorms="true"/>
>    <fieldType name="random" class="solr.RandomSortField" indexed="true" />
>    <fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100">
>      <analyzer>
>        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
>      </analyzer>
>    </fieldType>
>    <fieldType name="text" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
>      <analyzer type="index">
>        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
>        <filter class="solr.StopFilterFactory"
>                ignoreCase="true"
>                words="stopwords.txt"
>                enablePositionIncrements="true"
>                />
>        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
>        <filter class="solr.LowerCaseFilterFactory"/>
>        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
>        <filter class="solr.PorterStemFilterFactory"/>
>      </analyzer>
>      <analyzer type="query">
>        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
>        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
>        <filter class="solr.StopFilterFactory"
>                ignoreCase="true"
>                words="stopwords.txt"
>                enablePositionIncrements="true"
>                />
>        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
>        <filter class="solr.LowerCaseFilterFactory"/>
>        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
>        <filter class="solr.PorterStemFilterFactory"/>
>      </analyzer>
>    </fieldType>
>    <fieldType name="textTight" class="solr.TextField" positionIncrementGap="100" >
>      <analyzer>
>        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
>        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
>        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
>        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
>        <filter class="solr.LowerCaseFilterFactory"/>
>        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
>        <filter class="solr.PorterStemFilterFactory"/>
>        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
>      </analyzer>
>    </fieldType>
>    <fieldType name="textgen" class="solr.TextField" positionIncrementGap="100">
>      <analyzer type="index">
>        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
>        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
>        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="0"/>
>        <filter class="solr.LowerCaseFilterFactory"/>
>      </analyzer>
>      <analyzer type="query">
>        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
>        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
>        <filter class="solr.StopFilterFactory"
>                ignoreCase="true"
>                words="stopwords.txt"
>                enablePositionIncrements="true"
>                />
>        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/>
>        <filter class="solr.LowerCaseFilterFactory"/>
>      </analyzer>
>    </fieldType>
>    <fieldType name="text_rev" class="solr.TextField" positionIncrementGap="100">
>      <analyzer type="index">
>        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
>        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
>        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="0"/>
>        <filter class="solr.LowerCaseFilterFactory"/>
>        <filter class="solr.ReversedWildcardFilterFactory" withOriginal="true"
>           maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/>
>      </analyzer>
>      <analyzer type="query">
>        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
>        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
>        <filter class="solr.StopFilterFactory"
>                ignoreCase="true"
>                words="stopwords.txt"
>                enablePositionIncrements="true"
>                />
>        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/>
>        <filter class="solr.LowerCaseFilterFactory"/>
>      </analyzer>
>    </fieldType>
>    <fieldType name="alphaOnlySort" class="solr.TextField" sortMissingLast="true" omitNorms="true">
>      <analyzer>
>        <tokenizer class="solr.KeywordTokenizerFactory"/>
>        <filter class="solr.LowerCaseFilterFactory" />
>        <filter class="solr.TrimFilterFactory" />
>        <filter class="solr.PatternReplaceFilterFactory"
>                pattern="([^a-z])" replacement="" replace="all"
>        />
>      </analyzer>
>    </fieldType>
>
>    <fieldtype name="phonetic" stored="false" indexed="true" class="solr.TextField" >
>      <analyzer>
>        <tokenizer class="solr.StandardTokenizerFactory"/>
>        <filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/>
>      </analyzer>
>    </fieldtype>
>
>    <fieldtype name="payloads" stored="false" indexed="true" class="solr.TextField" >
>      <analyzer>
>        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
>        <filter class="solr.DelimitedPayloadTokenFilterFactory" encoder="float"/>
>      </analyzer>
>    </fieldtype>
>    <fieldType name="lowercase" class="solr.TextField" positionIncrementGap="100">
>      <analyzer>
>        <tokenizer class="solr.KeywordTokenizerFactory"/>
>        <filter class="solr.LowerCaseFilterFactory" />
>      </analyzer>
>    </fieldType>
>    <fieldtype name="ignored" stored="false" indexed="false" multiValued="true" class="solr.StrField" />
>    <fieldType name="location" class="solr.PointType" dimension="2" subFieldSuffix="_d"/>
>    <fieldtype name="geohash" class="solr.GeoHashField"/>
>    <fieldType name="tile" class="solr.SpatialTileField" start="4" end="15" subFieldSuffix="_tiled"/>
>
>  </types>
>
>
>  <fields>
>   <field name="id" type="string" indexed="true" stored="true" required="true" />
>   <field name="sku" type="textTight" indexed="true" stored="true" omitNorms="true"/>
>   <field name="name" type="textgen" indexed="true" stored="true"/>
>   <field name="alphaNameSort" type="alphaOnlySort" indexed="true" stored="false"/>
>   <field name="manu" type="textgen" indexed="true" stored="true" omitNorms="true"/>
>   <field name="cat" type="text_ws" indexed="true" stored="true" multiValued="true" omitNorms="true" />
>   <field name="features" type="text" indexed="true" stored="true" multiValued="true"/>
>   <field name="includes" type="text" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" />
>
>   <field name="weight" type="float" indexed="true" stored="true"/>
>   <field name="price"  type="float" indexed="true" stored="true"/>
>   <field name="popularity" type="int" indexed="true" stored="true" />
>   <field name="inStock" type="boolean" indexed="true" stored="true" />
>
>   <field name="store" type="location" indexed="true" stored="true"/>
>   <field name="store_hash" type="geohash" indexed="true" stored="false"/>
>   <field name="store_tiles" type="tile" indexed="true" stored="false"/>
>   <field name="title" type="text" indexed="true" stored="true" multiValued="true"/>
>   <field name="subject" type="text" indexed="true" stored="true"/>
>   <field name="description" type="text" indexed="true" stored="true"/>
>   <field name="comments" type="text" indexed="true" stored="true"/>
>   <field name="author" type="textgen" indexed="true" stored="true"/>
>   <field name="keywords" type="textgen" indexed="true" stored="true"/>
>   <field name="category" type="textgen" indexed="true" stored="true"/>
>   <field name="content_type" type="string" indexed="true" stored="true" multiValued="true"/>
>   <field name="last_modified" type="date" indexed="true" stored="true"/>
>   <field name="links" type="string" indexed="true" stored="true" multiValued="true"/>
>   <field name="text" type="text" indexed="true" stored="false" multiValued="true"/>
>   <field name="text_rev" type="text_rev" indexed="true" stored="false" multiValued="true"/>
>   <field name="manu_exact" type="string" indexed="true" stored="false"/>
>
>   <field name="payloads" type="payloads" indexed="true" stored="true"/>
>   <dynamicField name="*_i"  type="int"    indexed="true"  stored="true"/>
>   <dynamicField name="*_s"  type="string"  indexed="true"  stored="true"/>
>   <dynamicField name="*_l"  type="long"   indexed="true"  stored="true"/>
>   <dynamicField name="*_t"  type="text"    indexed="true"  stored="true"/>
>   <dynamicField name="*_b"  type="boolean" indexed="true"  stored="true"/>
>   <dynamicField name="*_f"  type="float"  indexed="true"  stored="true"/>
>   <dynamicField name="*_d"  type="double" indexed="true"  stored="true"/>
>
>   <dynamicField name="*_tiled"  type="double" indexed="true"  stored="false"/>
>
>   <dynamicField name="*_dt" type="date"    indexed="true"  stored="true"/>
>   <dynamicField name="*_p"  type="location" indexed="true" stored="true"/>
>
>   <dynamicField name="*_ti" type="tint"    indexed="true"  stored="true"/>
>   <dynamicField name="*_tl" type="tlong"   indexed="true"  stored="true"/>
>   <dynamicField name="*_tf" type="tfloat"  indexed="true"  stored="true"/>
>   <dynamicField name="*_td" type="tdouble" indexed="true"  stored="true"/>
>   <dynamicField name="*_tdt" type="tdate"  indexed="true"  stored="true"/>
>
>   <dynamicField name="*_pi"  type="pint"    indexed="true"  stored="true"/>
>
>   <dynamicField name="ignored_*" type="ignored" multiValued="true"/>
>   <dynamicField name="attr_*" type="textgen" indexed="true" stored="true" multiValued="true"/>
>
>   <dynamicField name="random_*" type="random" />
>  </fields>
>  <uniqueKey>id</uniqueKey>
>
>  <defaultSearchField>text</defaultSearchField>
>
>  <solrQueryParser defaultOperator="OR"/>
>
>   <copyField source="cat" dest="text"/>
>   <copyField source="store" dest="store_hash"/>
>   <copyField source="store" dest="store_tiles"/>
>   <copyField source="name" dest="text"/>
>   <copyField source="manu" dest="text"/>
>   <copyField source="features" dest="text"/>
>   <copyField source="includes" dest="text"/>
>   <copyField source="manu" dest="manu_exact"/>
>
> </schema>
>
> Solrconfig.xml:
> <?xml version="1.0" encoding="UTF-8" ?>
> <config>
>  <abortOnConfigurationError>${solr.abortOnConfigurationError:true}</abortOnConfigurationError>
>
>  <luceneMatchVersion>LUCENE_31</luceneMatchVersion>
>
>  <lib dir="./contrib/extraction/lib" />
>  <lib dir="./lib"/>
>  <lib dir="./contrib/clustering/lib" />
>  <dataDir>C:/Program Files/Apache Software Foundation/solr-3.1/data</dataDir>
>  <directoryFactory name="DirectoryFactory" class="${solr.directoryFactory:solr.StandardDirectoryFactory}"/>
>  <indexDefaults>
>    <useCompoundFile>false</useCompoundFile>
>
>    <mergeFactor>10</mergeFactor>
>    <ramBufferSizeMB>32</ramBufferSizeMB>
>    <maxFieldLength>10000</maxFieldLength>
>    <writeLockTimeout>1000</writeLockTimeout>
>    <commitLockTimeout>10000</commitLockTimeout>
>    <lockType>native</lockType>
>
>  </indexDefaults>
>
>  <mainIndex>
>
>    <useCompoundFile>false</useCompoundFile>
>    <ramBufferSizeMB>32</ramBufferSizeMB>
>    <mergeFactor>10</mergeFactor>
>
>    <unlockOnStartup>false</unlockOnStartup>
>
>    <reopenReaders>true</reopenReaders>
>
>
>    <deletionPolicy class="solr.SolrDeletionPolicy">
>      <str name="maxCommitsToKeep">1</str>
>      <str name="maxOptimizedCommitsToKeep">0</str>
>
>    </deletionPolicy>
>
>     <infoStream file="INFOSTREAM.txt">false</infoStream>
>
>  </mainIndex>
>
>  <jmx />
>
>   <updateHandler class="solr.DirectUpdateHandler2">
>
>  </updateHandler>
>
>
>
>  <query>
>    <maxBooleanClauses>1024</maxBooleanClauses>
>    <filterCache
>      class="solr.FastLRUCache"
>      size="512"
>      initialSize="512"
>      autowarmCount="0"/>
>    <queryResultCache
>      class="solr.LRUCache"
>      size="512"
>      initialSize="512"
>      autowarmCount="0"/>
>    <documentCache
>      class="solr.LRUCache"
>      size="512"
>      initialSize="512"
>      autowarmCount="0"/>
>    <enableLazyFieldLoading>true</enableLazyFieldLoading>
>    <queryResultWindowSize>20</queryResultWindowSize>
>    <queryResultMaxDocsCached>200</queryResultMaxDocsCached>
>    <listener event="newSearcher" class="solr.QuerySenderListener">
>      <arr name="queries">
>      </arr>
>    </listener>
>    <listener event="firstSearcher" class="solr.QuerySenderListener">
>      <arr name="queries">
>        <lst> <str name="q">solr rocks</str><str name="start">0</str><str name="rows">10</str></lst>
>        <lst><str name="q">static firstSearcher warming query from solrconfig.xml</str></lst>
>      </arr>
>    </listener>
>    <useColdSearcher>false</useColdSearcher>
>    <maxWarmingSearchers>2</maxWarmingSearchers>
>
>  </query>
>  <requestDispatcher handleSelect="true" >
>    <requestParsers enableRemoteStreaming="false" multipartUploadLimitInKB="2048000" />
>    <httpCaching lastModifiedFrom="openTime"
>                 etagSeed="Solr">
>    </httpCaching>
>  </requestDispatcher>
>  <requestHandler name="standard" class="solr.SearchHandler" default="true">
>    <!-- default values for query parameters -->
>     <lst name="defaults">
>       <str name="echoParams">explicit</str>
>       <!--
>       <int name="rows">10</int>
>       <str name="fl">*</str>
>       <str name="version">2.1</str>
>        -->
>     </lst>
>  </requestHandler>
>  <requestHandler name="/browse" class="solr.SearchHandler">
>     <lst name="defaults">
>       <str name="wt">velocity</str>
>
>       <str name="v.template">browse</str>
>       <str name="v.layout">layout</str>
>       <str name="title">Solritas</str>
>
>       <str name="defType">dismax</str>
>       <str name="q.alt">*:*</str>
>       <str name="rows">10</str>
>       <str name="fl">*,score</str>
>
>       <str name="facet">on</str>
>       <str name="facet.field">cat</str>
>       <str name="facet.field">manu_exact</str>
>       <str name="facet.mincount">1</str>
>       <str name="qf">
>          text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4
>       </str>
>
>       <str name="hl">on</str>
>       <str name="hl.fl">text features name</str>
>       <str name="f.name.hl.fragsize">0</str>
>       <str name="f.name.hl.alternateField">name</str>
>     </lst>
>  </requestHandler>
>  <requestHandler name="dismax" class="solr.SearchHandler" >
>    <lst name="defaults">
>     <str name="defType">dismax</str>
>     <str name="echoParams">explicit</str>
>     <float name="tie">0.01</float>
>     <str name="qf">
>        text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4
>     </str>
>     <str name="pf">
>        text^0.2 features^1.1 name^1.5 manu^1.4 manu_exact^1.9
>     </str>
>     <str name="bf">
>        popularity^0.5 recip(price,1,1000,1000)^0.3
>     </str>
>     <str name="fl">
>        id,name,price,score
>     </str>
>     <str name="mm">
>        2&lt;-1 5&lt;-2 6&lt;90%
>     </str>
>     <int name="ps">100</int>
>     <str name="q.alt">*:*</str>
>     <!-- example highlighter config, enable per-query with hl=true -->
>     <str name="hl.fl">text features name</str>
>     <!-- for this field, we want no fragmenting, just highlighting -->
>     <str name="f.name.hl.fragsize">0</str>
>     <!-- instructs Solr to return the field itself if no query terms are
>          found -->
>     <str name="f.name.hl.alternateField">name</str>
>     <str name="f.text.hl.fragmenter">regex</str> <!-- defined below -->
>    </lst>
>  </requestHandler>
>  <requestHandler name="partitioned" class="solr.SearchHandler" >
>    <lst name="defaults">
>     <str name="defType">dismax</str>
>     <str name="echoParams">explicit</str>
>     <str name="qf">text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0</str>
>     <str name="mm">2&lt;-1 5&lt;-2 6&lt;90%</str>
>     <!-- This is an example of using Date Math to specify a constantly
>          moving date range in a config...
>       -->
>     <str name="bq">incubationdate_dt:[* TO NOW/DAY-1MONTH]^2.2</str>
>    </lst>
>    <lst name="appends">
>      <str name="fq">inStock:true</str>
>    </lst>
>    <lst name="invariants">
>      <str name="facet.field">cat</str>
>      <str name="facet.field">manu_exact</str>
>      <str name="facet.query">price:[* TO 500]</str>
>      <str name="facet.query">price:[500 TO *]</str>
>    </lst>
>  </requestHandler>
>  <searchComponent name="spellcheck" class="solr.SpellCheckComponent">
>
>    <str name="queryAnalyzerFieldType">textSpell</str>
>
>    <lst name="spellchecker">
>      <str name="name">default</str>
>      <str name="field">name</str>
>      <str name="spellcheckIndexDir">./spellchecker</str>
>    </lst>
>  </searchComponent>
>  <requestHandler name="/spell" class="solr.SearchHandler" lazy="true">
>    <lst name="defaults">
>      <str name="spellcheck.onlyMorePopular">false</str>
>      <str name="spellcheck.extendedResults">false</str>
>      <str name="spellcheck.count">1</str>
>    </lst>
>    <arr name="last-components">
>      <str>spellcheck</str>
>    </arr>
>  </requestHandler>
>
>  <searchComponent name="tvComponent" class="org.apache.solr.handler.component.TermVectorComponent"/>
>  <requestHandler name="tvrh" class="org.apache.solr.handler.component.SearchHandler">
>    <lst name="defaults">
>      <bool name="tv">true</bool>
>    </lst>
>    <arr name="last-components">
>      <str>tvComponent</str>
>    </arr>
>  </requestHandler>
>  <searchComponent
>    name="clusteringComponent"
>    enable="${solr.clustering.enabled:false}"
>    class="org.apache.solr.handler.clustering.ClusteringComponent" >
>    <lst name="engine">
>      <!-- The name, only one can be named "default" -->
>      <str name="name">default</str>
>      <str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str>
>      <str name="LingoClusteringAlgorithm.desiredClusterCountBase">20</str>
>    </lst>
>    <lst name="engine">
>      <str name="name">stc</str>
>      <str name="carrot.algorithm">org.carrot2.clustering.stc.STCClusteringAlgorithm</str>
>    </lst>
>  </searchComponent>
>  <requestHandler name="/clustering"
>                  enable="${solr.clustering.enabled:false}"
>                  class="solr.SearchHandler">
>     <lst name="defaults">
>       <bool name="clustering">true</bool>
>       <str name="clustering.engine">default</str>
>       <bool name="clustering.results">true</bool>
>       <!-- The title field -->
>       <str name="carrot.title">name</str>
>       <str name="carrot.url">id</str>
>       <!-- The field to cluster on -->
>       <str name="carrot.snippet">features</str>
>       <!-- produce summaries -->
>       <bool name="carrot.produceSummary">true</bool>
>       <!-- the maximum number of labels per cluster -->
>       <!--<int name="carrot.numDescriptions">5</int>-->
>       <!-- produce sub clusters -->
>       <bool name="carrot.outputSubClusters">false</bool>
>    </lst>
>    <arr name="last-components">
>      <str>clusteringComponent</str>
>    </arr>
>  </requestHandler>
>
>  <requestHandler name="/update/extract" class="org.apache.solr.handler.extraction.ExtractingRequestHandler" startup="lazy">
>    <lst name="defaults">
>      <str name="fmap.content">text</str>
>      <str name="lowernames">true</str>
>      <str name="uprefix">ignored_</str>
>
>      <!-- capture link hrefs but ignore div attributes -->
>      <str name="captureAttr">true</str>
>      <str name="fmap.a">links</str>
>      <str name="fmap.div">ignored_</str>
>    </lst>
>  </requestHandler>
>
>
>  <searchComponent name="termsComponent" class="org.apache.solr.handler.component.TermsComponent"/>
>
>  <requestHandler name="/terms" class="org.apache.solr.handler.component.SearchHandler">
>     <lst name="defaults">
>      <bool name="terms">true</bool>
>    </lst>
>    <arr name="components">
>      <str>termsComponent</str>
>    </arr>
>  </requestHandler>
>  <searchComponent name="elevator" class="solr.QueryElevationComponent" >
>    <!-- pick a fieldType to analyze queries -->
>    <str name="queryFieldType">string</str>
>    <str name="config-file">elevate.xml</str>
>  </searchComponent>
>
>  <!-- a request handler utilizing the elevator component -->
>  <requestHandler name="/elevate" class="solr.SearchHandler" startup="lazy">
>    <lst name="defaults">
>      <str name="echoParams">explicit</str>
>    </lst>
>    <arr name="last-components">
>      <str>elevator</str>
>    </arr>
>  </requestHandler>
>  <requestHandler name="/update" class="solr.XmlUpdateRequestHandler" />
>
>
>  <requestHandler name="/update/javabin" class="solr.BinaryUpdateRequestHandler" />
>
>  <requestHandler name="/analysis/document" class="solr.DocumentAnalysisRequestHandler" />
>  <requestHandler name="/analysis/field" class="solr.FieldAnalysisRequestHandler" />
>  <requestHandler name="/update/csv" class="solr.CSVRequestHandler" startup="lazy" />
>  <requestHandler name="/admin/" class="org.apache.solr.handler.admin.AdminHandlers" />
>  <requestHandler name="/admin/ping" class="PingRequestHandler">
>    <lst name="defaults">
>      <str name="qt">standard</str>
>      <str name="q">solrpingquery</str>
>      <str name="echoParams">all</str>
>    </lst>
>  </requestHandler>
>
>  <requestHandler name="/debug/dump" class="solr.DumpRequestHandler" >
>    <lst name="defaults">
>     <str name="echoParams">explicit</str> <!-- for all params (including the default etc) use: 'all' -->
>     <str name="echoHandler">true</str>
>    </lst>
>  </requestHandler>
>  <searchComponent class="solr.HighlightComponent" name="highlight">
>  <highlighting>
>   <fragmenter name="gap" class="org.apache.solr.highlight.GapFragmenter" default="true">
>    <lst name="defaults">
>     <int name="hl.fragsize">100</int>
>    </lst>
>   </fragmenter>
>
>   <fragmenter name="regex" class="org.apache.solr.highlight.RegexFragmenter">
>    <lst name="defaults">
>      <!-- slightly smaller fragsizes work better because of slop -->
>      <int name="hl.fragsize">70</int>
>      <!-- allow 50% slop on fragment sizes -->
>      <float name="hl.regex.slop">0.5</float>
>      <!-- a basic sentence pattern -->
>      <str name="hl.regex.pattern">[-\w ,/\n\&quot;&apos;]{20,200}</str>
>    </lst>
>   </fragmenter>
>
>   <formatter name="html" class="org.apache.solr.highlight.HtmlFormatter" default="true">
>    <lst name="defaults">
>     <str name="hl.simple.pre"><![CDATA[<em>]]></str>
>     <str name="hl.simple.post"><![CDATA[</em>]]></str>
>    </lst>
>   </formatter>
>
>   <fragListBuilder name="simple" class="org.apache.solr.highlight.SimpleFragListBuilder" default="true"/>
>
>   <fragListBuilder name="single" class="org.apache.solr.highlight.SingleFragListBuilder"/>
>
>   <fragmentsBuilder name="colored" class="org.apache.solr.highlight.MultiColoredScoreOrderFragmentsBuilder" default="true"/>
>  </highlighting>
>  </searchComponent>
>  <queryResponseWriter name="xslt" class="solr.XSLTResponseWriter">
>    <int name="xsltCacheLifetimeSeconds">5</int>
>  </queryResponseWriter>
>  <admin>
>    <defaultQuery>solr</defaultQuery>
>  </admin>
>
> </config>
>
> Test1.txt document:
> Asdf
> Asdf
> Asdf
> Adsf
>
> Upload command:
> curl "http://localhost:8080/solr/update/extract?literal.id=123&uprefix=attr_&fmap.content=attr_content&commit=true" -F "myfile=@test1.txt"
>
> RESULTS from an id:[* TO *] query:
> <response>
> −
> <lst name="responseHeader">
> <int name="status">0</int>
> <int name="QTime">91</int>
> −
> <lst name="params">
> <str name="explainOther"/>
> <str name="fl">*,score</str>
> <str name="indent">on</str>
> <str name="start">0</str>
> <str name="q">id:[* TO *]</str>
> <str name="hl.fl"/>
> <str name="qt">standard</str>
> <str name="wt">standard</str>
> <str name="fq"/>
> <str name="rows">10</str>
> <str name="version">2.2</str>
> </lst>
> </lst>
> −
> <result name="response" numFound="1" start="0" maxScore="1.0">
> −
> <doc>
> <float name="score">1.0</float>
> −
> <arr name="attr_content">
> <str>        </str>
> </arr>
> −
> <arr name="attr_stream_content_type">
> <str>text/plain</str>
> </arr>
> −
> <arr name="attr_stream_name">
> <str>test1.txt</str>
> </arr>
> −
> <arr name="attr_stream_size">
> <str>24</str>
> </arr>
> −
> <arr name="attr_stream_source_info">
> <str>myfile</str>
> </arr>
> −
> <arr name="content_type">
> <str>text/plain</str>
> </arr>
> <str name="id">123</str>
> </doc>
> </result>
> </response>
>
> Note that the attr_content section of the response is blank.  Any help & hints would be GREATLY appreciated…=)
>
> Best,
> Dave
>



-- 
Lance Norskog
goksron@gmail.com