Posted to commits@lucene.apache.org by da...@apache.org on 2018/11/02 15:27:30 UTC

[3/5] lucene-solr:jira/gradle: Add :solr:contrib:extraction module

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/schema.xml
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/schema.xml b/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/schema.xml
deleted file mode 100644
index 475c333..0000000
--- a/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/schema.xml
+++ /dev/null
@@ -1,484 +0,0 @@
-<?xml version="1.0" ?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<!-- The Solr schema file. This file should be named "schema.xml" and
-     should be located where the classloader for the Solr webapp can find it.
-
-     This schema is used for testing, and as such has everything and the 
-     kitchen sink thrown in. See example/solr/conf/schema.xml for a 
-     more concise example.
-
-  -->
-
-<schema name="test" version="1.0">
-
-
-  <!--
-    Default numeric field types. For faster range queries, consider the tint/tfloat/tlong/tdouble types.
-  -->
-  <fieldType name="int" class="${solr.tests.IntegerFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="0" positionIncrementGap="0"/>
-  <fieldType name="float" class="${solr.tests.FloatFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="0" positionIncrementGap="0"/>
-  <fieldType name="long" class="${solr.tests.LongFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="0" positionIncrementGap="0"/>
-  <fieldType name="double" class="${solr.tests.DoubleFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="0" positionIncrementGap="0"/>
-
-  <!--
-   Numeric field types that index each value at various levels of precision
-   to accelerate range queries when the number of values between the range
-   endpoints is large. See the javadoc for LegacyNumericRangeQuery for internal
-   implementation details.
-
-   Smaller precisionStep values (specified in bits) will lead to more tokens
-   indexed per value, slightly larger index size, and faster range queries.
-   A precisionStep of 0 disables indexing at different precision levels.
-  -->
-  <fieldType name="tint" class="${solr.tests.IntegerFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="8" positionIncrementGap="0"/>
-  <fieldType name="tfloat" class="${solr.tests.FloatFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="8" positionIncrementGap="0"/>
-  <fieldType name="tlong" class="${solr.tests.LongFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="8" positionIncrementGap="0"/>
-  <fieldType name="tdouble" class="${solr.tests.DoubleFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="8" positionIncrementGap="0"/>
-
-  <!-- Field type demonstrating an Analyzer failure -->
-  <fieldType name="failtype1" class="solr.TextField">
-    <analyzer type="index">
-      <tokenizer class="solr.MockTokenizerFactory"/>
-      <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="0" catenateWords="0"
-              catenateNumbers="0" catenateAll="0"/>
-      <filter class="solr.LowerCaseFilterFactory"/>
-      <filter class="solr.FlattenGraphFilterFactory"/>
-    </analyzer>
-  </fieldType>
-
-  <!-- Demonstrating ignoreCaseChange -->
-  <fieldType name="wdf_nocase" class="solr.TextField">
-    <analyzer type="index">
-      <tokenizer class="solr.MockTokenizerFactory"/>
-      <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="0" catenateWords="0"
-              catenateNumbers="0" catenateAll="0" splitOnCaseChange="0" preserveOriginal="0"/>
-      <filter class="solr.LowerCaseFilterFactory"/>
-      <filter class="solr.FlattenGraphFilterFactory"/>
-    </analyzer>
-    <analyzer type="query">
-      <tokenizer class="solr.MockTokenizerFactory"/>
-      <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="0" catenateWords="0"
-              catenateNumbers="0" catenateAll="0" splitOnCaseChange="0" preserveOriginal="0"/>
-      <filter class="solr.LowerCaseFilterFactory"/>
-    </analyzer>
-  </fieldType>
-
-  <fieldType name="wdf_preserve" class="solr.TextField">
-    <analyzer type="index">
-      <tokenizer class="solr.MockTokenizerFactory"/>
-      <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="1" catenateWords="0"
-              catenateNumbers="0" catenateAll="0" splitOnCaseChange="0" preserveOriginal="1"/>
-      <filter class="solr.LowerCaseFilterFactory"/>
-      <filter class="solr.FlattenGraphFilterFactory"/>
-    </analyzer>
-    <analyzer type="query">
-      <tokenizer class="solr.MockTokenizerFactory"/>
-      <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="1" catenateWords="0"
-              catenateNumbers="0" catenateAll="0" splitOnCaseChange="0" preserveOriginal="1"/>
-      <filter class="solr.LowerCaseFilterFactory"/>
-    </analyzer>
-  </fieldType>
-
-
-  <!-- HighlitText optimizes storage for (long) columns which will be highlit -->
-  <fieldType name="highlittext" class="solr.TextField" compressThreshold="345"/>
-
-  <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true"/>
-  <fieldType name="string" class="solr.StrField" sortMissingLast="true"/>
-
-  <!-- format for date is 1995-12-31T23:59:59.999Z and only the fractional
-       seconds part (.999) is optional.
-    -->
-  <fieldType name="date" class="${solr.tests.DateFieldType}" docValues="${solr.tests.numeric.dv}" sortMissingLast="true" omitNorms="true"/>
-
-  <!-- solr.TextField allows the specification of custom
-       text analyzers specified as a tokenizer and a list
-       of token filters.
-    -->
-  <fieldType name="text" class="solr.TextField">
-    <analyzer>
-      <tokenizer class="solr.ClassicTokenizerFactory"/>
-      <filter class="solr.ClassicFilterFactory"/>
-      <filter class="solr.LowerCaseFilterFactory"/>
-      <filter class="solr.StopFilterFactory"/>
-      <filter class="solr.PorterStemFilterFactory"/>
-    </analyzer>
-  </fieldType>
-
-
-  <fieldType name="nametext" class="solr.TextField">
-    <analyzer class="org.apache.lucene.analysis.core.WhitespaceAnalyzer"/>
-  </fieldType>
-
-  <!-- fieldTypes in this section isolate tokenizers and tokenfilters for testing -->
-  <fieldType name="keywordtok" class="solr.TextField">
-    <analyzer>
-      <tokenizer class="solr.MockTokenizerFactory" pattern="keyword"/>
-    </analyzer>
-  </fieldType>
-  <fieldType name="standardtok" class="solr.TextField">
-    <analyzer>
-      <tokenizer class="solr.StandardTokenizerFactory"/>
-    </analyzer>
-  </fieldType>
-  <fieldType name="lettertok" class="solr.TextField">
-    <analyzer>
-      <tokenizer class="solr.LetterTokenizerFactory"/>
-    </analyzer>
-  </fieldType>
-  <fieldType name="whitetok" class="solr.TextField">
-    <analyzer>
-      <tokenizer class="solr.MockTokenizerFactory"/>
-    </analyzer>
-  </fieldType>
-  <fieldType name="HTMLstandardtok" class="solr.TextField">
-    <analyzer>
-      <charFilter class="solr.HTMLStripCharFilterFactory"/>
-      <tokenizer class="solr.StandardTokenizerFactory"/>
-    </analyzer>
-  </fieldType>
-  <fieldType name="HTMLwhitetok" class="solr.TextField">
-    <analyzer>
-      <charFilter class="solr.HTMLStripCharFilterFactory"/>
-      <tokenizer class="solr.MockTokenizerFactory"/>
-    </analyzer>
-  </fieldType>
-  <fieldType name="standardtokfilt" class="solr.TextField">
-    <analyzer>
-      <tokenizer class="solr.ClassicTokenizerFactory"/>
-      <filter class="solr.ClassicFilterFactory"/>
-    </analyzer>
-  </fieldType>
-  <fieldType name="standardfilt" class="solr.TextField">
-    <analyzer>
-      <tokenizer class="solr.MockTokenizerFactory"/>
-      <filter class="solr.ClassicFilterFactory"/>
-    </analyzer>
-  </fieldType>
-  <fieldType name="lowerfilt" class="solr.TextField">
-    <analyzer>
-      <tokenizer class="solr.MockTokenizerFactory"/>
-      <filter class="solr.LowerCaseFilterFactory"/>
-    </analyzer>
-  </fieldType>
-  <fieldType name="patternreplacefilt" class="solr.TextField">
-    <analyzer type="index">
-      <tokenizer class="solr.MockTokenizerFactory" pattern="keyword"/>
-      <filter class="solr.PatternReplaceFilterFactory"
-              pattern="([^a-zA-Z])" replacement="_" replace="all"
-      />
-    </analyzer>
-    <analyzer type="query">
-      <tokenizer class="solr.MockTokenizerFactory" pattern="keyword"/>
-    </analyzer>
-  </fieldType>
-  <fieldType name="porterfilt" class="solr.TextField">
-    <analyzer>
-      <tokenizer class="solr.MockTokenizerFactory"/>
-      <filter class="solr.PorterStemFilterFactory"/>
-    </analyzer>
-  </fieldType>
-  <!-- fieldType name="snowballfilt" class="solr.TextField">
-    <analyzer>
-      <tokenizer class="solr.MockTokenizerFactory"/>
-      <filter class="solr.SnowballPorterFilterFactory"/>
-    </analyzer>
-  </fieldType -->
-  <fieldType name="engporterfilt" class="solr.TextField">
-    <analyzer>
-      <tokenizer class="solr.MockTokenizerFactory"/>
-      <filter class="solr.PorterStemFilterFactory"/>
-    </analyzer>
-  </fieldType>
-  <fieldType name="custengporterfilt" class="solr.TextField">
-    <analyzer>
-      <tokenizer class="solr.MockTokenizerFactory"/>
-      <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
-      <filter class="solr.PorterStemFilterFactory"/>
-    </analyzer>
-  </fieldType>
-  <fieldType name="stopfilt" class="solr.TextField">
-    <analyzer>
-      <tokenizer class="solr.MockTokenizerFactory"/>
-      <filter class="solr.StopFilterFactory" ignoreCase="true"/>
-    </analyzer>
-  </fieldType>
-  <fieldType name="custstopfilt" class="solr.TextField">
-    <analyzer>
-      <tokenizer class="solr.MockTokenizerFactory"/>
-      <filter class="solr.StopFilterFactory" words="stopwords.txt"/>
-    </analyzer>
-  </fieldType>
-  <fieldType name="lengthfilt" class="solr.TextField">
-    <analyzer>
-      <tokenizer class="solr.MockTokenizerFactory"/>
-      <filter class="solr.LengthFilterFactory" min="2" max="5"/>
-    </analyzer>
-  </fieldType>
-
-  <fieldType name="subword" class="solr.TextField" multiValued="true" positionIncrementGap="100">
-    <analyzer type="index">
-      <tokenizer class="solr.MockTokenizerFactory"/>
-      <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1"
-              catenateNumbers="1" catenateAll="0"/>
-      <filter class="solr.LowerCaseFilterFactory"/>
-      <filter class="solr.StopFilterFactory"/>
-      <filter class="solr.PorterStemFilterFactory"/>
-      <filter class="solr.FlattenGraphFilterFactory"/>
-    </analyzer>
-    <analyzer type="query">
-      <tokenizer class="solr.MockTokenizerFactory"/>
-      <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0"
-              catenateNumbers="0" catenateAll="0"/>
-      <filter class="solr.LowerCaseFilterFactory"/>
-      <filter class="solr.StopFilterFactory"/>
-      <filter class="solr.PorterStemFilterFactory"/>
-    </analyzer>
-  </fieldType>
-
-  <!-- more flexible in matching skus, but more chance of a false match -->
-  <fieldType name="skutype1" class="solr.TextField">
-    <analyzer type="index">
-      <tokenizer class="solr.MockTokenizerFactory"/>
-      <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1"
-              catenateNumbers="1" catenateAll="0"/>
-      <filter class="solr.LowerCaseFilterFactory"/>
-      <filter class="solr.FlattenGraphFilterFactory"/>
-    </analyzer>
-    <analyzer type="query">
-      <tokenizer class="solr.MockTokenizerFactory"/>
-      <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1"
-              catenateNumbers="1" catenateAll="0"/>
-      <filter class="solr.LowerCaseFilterFactory"/>
-    </analyzer>
-  </fieldType>
-
-  <!-- less flexible in matching skus, but less chance of a false match -->
-  <fieldType name="skutype2" class="solr.TextField">
-    <analyzer type="index">
-      <tokenizer class="solr.MockTokenizerFactory"/>
-      <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1"
-              catenateNumbers="1" catenateAll="0"/>
-      <filter class="solr.LowerCaseFilterFactory"/>
-      <filter class="solr.FlattenGraphFilterFactory"/>
-    </analyzer>
-    <analyzer type="query">
-      <tokenizer class="solr.MockTokenizerFactory"/>
-      <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1"
-              catenateNumbers="1" catenateAll="0"/>
-      <filter class="solr.LowerCaseFilterFactory"/>
-    </analyzer>
-  </fieldType>
-
-  <!-- less flexible in matching skus, but less chance of a false match -->
-  <fieldType name="syn" class="solr.TextField">
-    <analyzer type="index">
-      <tokenizer class="solr.MockTokenizerFactory"/>
-      <filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt"/>
-      <filter class="solr.FlattenGraphFilterFactory"/>
-    </analyzer>
-    <analyzer type="query">
-      <tokenizer class="solr.MockTokenizerFactory"/>
-      <filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt"/>
-    </analyzer>
-  </fieldType>
-
-  <!-- Demonstrates How RemoveDuplicatesTokenFilter makes stemmed
-       synonyms "better"
-    -->
-  <fieldType name="dedup" class="solr.TextField">
-    <analyzer type="index">
-      <tokenizer class="solr.MockTokenizerFactory"/>
-      <filter class="solr.SynonymGraphFilterFactory"
-              synonyms="synonyms.txt" expand="true"/>
-      <filter class="solr.PorterStemFilterFactory"/>
-      <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
-      <filter class="solr.FlattenGraphFilterFactory"/>
-    </analyzer>
-    <analyzer type="query">
-      <tokenizer class="solr.MockTokenizerFactory"/>
-      <filter class="solr.SynonymGraphFilterFactory"
-              synonyms="synonyms.txt" expand="true"/>
-      <filter class="solr.PorterStemFilterFactory"/>
-      <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
-    </analyzer>
-  </fieldType>
-
-  <fieldType name="unstored" class="solr.StrField" indexed="true" stored="false"/>
-
-
-  <fieldType name="textgap" class="solr.TextField" multiValued="true" positionIncrementGap="100">
-    <analyzer>
-      <tokenizer class="solr.MockTokenizerFactory"/>
-      <filter class="solr.LowerCaseFilterFactory"/>
-    </analyzer>
-  </fieldType>
-
-
-  <field name="id" type="string" indexed="true" stored="true" multiValued="false" required="false"/>
-  <field name="name" type="nametext" indexed="true" stored="true"/>
-  <field name="text" type="text" indexed="true" stored="false"/>
-  <field name="subject" type="text" indexed="true" stored="true"/>
-  <field name="title" type="nametext" indexed="true" stored="true"/>
-  <field name="weight" type="float" indexed="true" stored="true"/>
-  <field name="bday" type="date" indexed="true" stored="true"/>
-
-  <field name="title_stemmed" type="text" indexed="true" stored="false"/>
-  <field name="title_lettertok" type="lettertok" indexed="true" stored="false"/>
-
-  <field name="syn" type="syn" indexed="true" stored="true"/>
-
-  <!-- to test property inheritance and overriding -->
-  <field name="shouldbeunstored" type="unstored"/>
-  <field name="shouldbestored" type="unstored" stored="true"/>
-  <field name="shouldbeunindexed" type="unstored" indexed="false" stored="true"/>
-
-
-  <!-- test different combinations of indexed and stored -->
-  <field name="bind" type="boolean" indexed="true" stored="false"/>
-  <field name="bsto" type="boolean" indexed="false" stored="true"/>
-  <field name="bindsto" type="boolean" indexed="true" stored="true"/>
-  <field name="isto" type="int" indexed="false" stored="true"/>
-  <field name="iind" type="int" indexed="true" stored="false"/>
-  <field name="ssto" type="string" indexed="false" stored="true"/>
-  <field name="sind" type="string" indexed="true" stored="false"/>
-  <field name="sindsto" type="string" indexed="true" stored="true"/>
-
-  <!-- test combinations of term vector settings -->
-  <field name="test_basictv" type="text" termVectors="true"/>
-  <field name="test_notv" type="text" termVectors="false"/>
-  <field name="test_postv" type="text" termVectors="true" termPositions="true"/>
-  <field name="test_offtv" type="text" termVectors="true" termOffsets="true"/>
-  <field name="test_posofftv" type="text" termVectors="true"
-         termPositions="true" termOffsets="true"/>
-
-  <!-- test highlit field settings -->
-  <field name="test_hlt" type="highlittext" indexed="true"/>
-  <field name="test_hlt_off" type="highlittext" indexed="true"/>
-
-  <!-- fields to test individual tokenizers and tokenfilters -->
-  <field name="keywordtok" type="keywordtok" indexed="true" stored="true"/>
-  <field name="standardtok" type="standardtok" indexed="true" stored="true"/>
-  <field name="HTMLstandardtok" type="HTMLstandardtok" indexed="true" stored="true"/>
-  <field name="lettertok" type="lettertok" indexed="true" stored="true"/>
-  <field name="whitetok" type="whitetok" indexed="true" stored="true"/>
-  <field name="HTMLwhitetok" type="HTMLwhitetok" indexed="true" stored="true"/>
-  <field name="standardtokfilt" type="standardtokfilt" indexed="true" stored="true"/>
-  <field name="standardfilt" type="standardfilt" indexed="true" stored="true"/>
-  <field name="lowerfilt" type="lowerfilt" indexed="true" stored="true"/>
-  <field name="patternreplacefilt" type="patternreplacefilt" indexed="true" stored="true"/>
-  <field name="porterfilt" type="porterfilt" indexed="true" stored="true"/>
-  <field name="engporterfilt" type="engporterfilt" indexed="true" stored="true"/>
-  <field name="custengporterfilt" type="custengporterfilt" indexed="true" stored="true"/>
-  <field name="stopfilt" type="stopfilt" indexed="true" stored="true"/>
-  <field name="custstopfilt" type="custstopfilt" indexed="true" stored="true"/>
-  <field name="lengthfilt" type="lengthfilt" indexed="true" stored="true"/>
-  <field name="dedup" type="dedup" indexed="true" stored="true"/>
-  <field name="wdf_nocase" type="wdf_nocase" indexed="true" stored="true"/>
-  <field name="wdf_preserve" type="wdf_preserve" indexed="true" stored="true"/>
-
-  <field name="numberpartfail" type="failtype1" indexed="true" stored="true"/>
-
-  <field name="nullfirst" type="string" indexed="true" stored="true" sortMissingFirst="true"/>
-
-  <field name="subword" type="subword" indexed="true" stored="true"/>
-  <field name="sku1" type="skutype1" indexed="true" stored="true"/>
-  <field name="sku2" type="skutype2" indexed="true" stored="true"/>
-
-  <field name="textgap" type="textgap" indexed="true" stored="true"/>
-
-  <field name="timestamp" type="date" indexed="true" stored="true" default="NOW" multiValued="false"/>
-  <field name="multiDefault" type="string" indexed="true" stored="true" default="muLti-Default" multiValued="true"/>
-  <field name="intDefault" type="int" indexed="true" stored="true" default="42" multiValued="false"/>
-
-  <field name="extractedDate" type="date" indexed="true" stored="true" multiValued="true"/>
-  <field name="extractedContent" type="text" indexed="true" stored="true" multiValued="true"/>
-  <field name="extractedProducer" type="text" indexed="true" stored="true" multiValued="true"/>
-  <field name="extractedCreator" type="text" indexed="true" stored="true" multiValued="true"/>
-  <field name="extractedKeywords" type="text" indexed="true" stored="true" multiValued="true"/>
-  <field name="extractedAuthor" type="text" indexed="true" stored="true" multiValued="true"/>
-  <field name="extractedLanguage" type="string" indexed="true" stored="true" multiValued="true"/>
-  <field name="resourceName" type="string" indexed="true" stored="true" multiValued="true"/>
-
-  <field name="extractionLiteralMV" type="string" indexed="true" stored="true" multiValued="true"/>
-  <field name="extractionLiteral" type="string" indexed="true" stored="true" multiValued="false"/>
-
-  <field name="defaultExtr" type="string" indexed="true" stored="false"/>
-
-  <!-- Dynamic field definitions.  If a field name is not found, dynamicFields
-       will be used if the name matches any of the patterns.
-       RESTRICTION: the glob-like pattern in the name attribute must have
-       a "*" only at the start or the end.
-       EXAMPLE:  name="*_i" will match any field ending in _i (like myid_i, z_i)
-       Longer patterns will be matched first.  if equal size patterns
-       both match, the first appearing in the schema will be used.
-  -->
-  <dynamicField name="*_i" type="int" indexed="true" stored="true"/>
-  <dynamicField name="*_s" type="string" indexed="true" stored="true"/>
-  <dynamicField name="*_ss" type="string" indexed="true" stored="true" multiValued="true"/>
-  <dynamicField name="*_s1" type="string" indexed="true" stored="true" multiValued="false"/>
-  <dynamicField name="*_l" type="long" indexed="true" stored="true"/>
-  <dynamicField name="*_t" type="text" indexed="true" stored="true"/>
-  <dynamicField name="*_b" type="boolean" indexed="true" stored="true"/>
-  <dynamicField name="*_f" type="float" indexed="true" stored="true"/>
-  <dynamicField name="*_d" type="double" indexed="true" stored="true"/>
-  <dynamicField name="*_dt" type="date" indexed="true" stored="true"/>
-  <dynamicField name="*_dt1" type="date" indexed="true" stored="true" multiValued="false"/>
-
-  <dynamicField name="*_sI" type="string" indexed="true" stored="false"/>
-  <dynamicField name="*_sS" type="string" indexed="false" stored="true"/>
-  <dynamicField name="t_*" type="text" indexed="true" stored="true"/>
-  <dynamicField name="tv_*" type="text" indexed="true" stored="true"
-                termVectors="true" termPositions="true" termOffsets="true"/>
-
-  <dynamicField name="stream_*" type="text" indexed="true" stored="true"/>
-  <dynamicField name="Content*" type="text" indexed="true" stored="true"/>
-
-
-  <!-- special fields for dynamic copyField test -->
-  <dynamicField name="dynamic_*" type="string" indexed="true" stored="true"/>
-  <dynamicField name="*_dynamic" type="string" indexed="true" stored="true"/>
-
-  <!-- for testing to ensure that longer patterns are matched first -->
-  <dynamicField name="*aa" type="string" indexed="true" stored="true"/>
-  <dynamicField name="*aaa" type="int" indexed="false" stored="true"/>
-
-  <!-- ignored because not stored or indexed -->
-  <dynamicField name="ignored_*" type="text" indexed="false" stored="false"/>
-
-
-  <uniqueKey>id</uniqueKey>
-
-  <!-- copyField commands copy one field to another at the time a document
-        is added to the index.  It's used either to index the same field different
-        ways, or to add multiple fields to the same field for easier/faster searching.
-   -->
-  <copyField source="title" dest="title_stemmed"/>
-  <copyField source="title" dest="title_lettertok"/>
-
-  <copyField source="title" dest="text"/>
-  <copyField source="subject" dest="text"/>
-
-  <copyField source="*_t" dest="text"/>
-
-  <!-- dynamic destination -->
-  <copyField source="*_dynamic" dest="dynamic_*"/>
-
-
-</schema>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml b/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml
deleted file mode 100644
index aef03af..0000000
--- a/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml
+++ /dev/null
@@ -1,214 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<config>
-  <luceneMatchVersion>${tests.luceneMatchVersion:LATEST}</luceneMatchVersion>
-  <jmx />
-  <indexConfig>
-    <useCompoundFile>${useCompoundFile:false}</useCompoundFile>
-  </indexConfig>
-
-  <!-- Used to specify an alternate directory to hold all index data.
-       It defaults to "index" if not present, and should probably
-       not be changed if replication is in use. -->
-  <dataDir>${solr.data.dir:}</dataDir>
-  <directoryFactory name="DirectoryFactory" class="${solr.directoryFactory:solr.RAMDirectoryFactory}"/>
-  <schemaFactory class="ClassicIndexSchemaFactory"/>
-
-  <updateHandler class="solr.DirectUpdateHandler2">
-
-    <!-- autocommit pending docs if certain criteria are met 
-    <autoCommit> 
-      <maxDocs>10000</maxDocs>
-      <maxTime>3600000</maxTime> 
-    </autoCommit>
-    -->
-
-  </updateHandler>
-
-
-  <updateProcessor class="solr.ParseDateFieldUpdateProcessorFactory" name="parse-date">
-    <arr name="format">
-      <str>yyyy-MM-dd['T'[HH:mm[:ss[.SSS]][z</str>
-      <str>yyyy-MM-dd['T'[HH:mm[:ss[,SSS]][z</str>
-      <str>yyyy-MM-dd HH:mm[:ss[.SSS]][z</str>
-      <str>yyyy-MM-dd HH:mm[:ss[,SSS]][z</str>
-      <str>[EEE, ]dd MMM yyyy HH:mm[:ss] z</str>
-      <str>EEEE, dd-MMM-yy HH:mm:ss z</str>
-      <str>EEE MMM ppd HH:mm:ss [z ]yyyy</str>
-    </arr>
-  </updateProcessor>
-
-  <!-- The update.autoCreateFields property can be turned to false to disable schemaless mode -->
-  <updateRequestProcessorChain name="parse-date" default="true"
-                               processor="parse-date">
-    <processor class="solr.RunUpdateProcessorFactory"/>
-  </updateRequestProcessorChain>
-
-
-  <query>
-    <!-- Maximum number of clauses in a boolean query... can affect
-        range or wildcard queries that expand to big boolean
-        queries.  An exception is thrown if exceeded.
-    -->
-    <maxBooleanClauses>1024</maxBooleanClauses>
-
-
-    <!-- Cache specification for Filters or DocSets - unordered set of *all* documents
-         that match a particular query.
-      -->
-    <filterCache
-      class="solr.search.LRUCache"
-      size="512"
-      initialSize="512"
-      autowarmCount="256"/>
-
-    <queryResultCache
-      class="solr.search.LRUCache"
-      size="512"
-      initialSize="512"
-      autowarmCount="1024"/>
-
-    <documentCache
-      class="solr.search.LRUCache"
-      size="512"
-      initialSize="512"
-      autowarmCount="0"/>
-
-    <!-- If true, stored fields that are not requested will be loaded lazily.
-    -->
-    <enableLazyFieldLoading>true</enableLazyFieldLoading>
-
-    <!--
-
-    <cache name="myUserCache"
-      class="solr.search.LRUCache"
-      size="4096"
-      initialSize="1024"
-      autowarmCount="1024"
-      regenerator="MyRegenerator"
-      />
-    -->
-
-
-    <useFilterForSortedQuery>true</useFilterForSortedQuery>
-
-    <queryResultWindowSize>10</queryResultWindowSize>
-
-    <!-- set maxSize artificially low to exercise both types of sets -->
-    <HashDocSet maxSize="3" loadFactor="0.75"/>
-
-
-    <!-- boolToFilterOptimizer converts boolean clauses with zero boost
-         into cached filters if the number of docs selected by the clause exceeds
-         the threshold (represented as a fraction of the total index)
-    -->
-    <boolTofilterOptimizer enabled="false" cacheSize="32" threshold=".05"/>
-
-
-    <!-- a newSearcher event is fired whenever a new searcher is being prepared
-         and there is a current searcher handling requests (aka registered). -->
-    <!-- QuerySenderListener takes an array of NamedList and executes a
-         local query request for each NamedList in sequence. -->
-    <!--
-    <listener event="newSearcher" class="solr.QuerySenderListener">
-      <arr name="queries">
-        <lst> <str name="q">solr</str> <str name="start">0</str> <str name="rows">10</str> </lst>
-        <lst> <str name="q">rocks</str> <str name="start">0</str> <str name="rows">10</str> </lst>
-      </arr>
-    </listener>
-    -->
-
-    <!-- a firstSearcher event is fired whenever a new searcher is being
-         prepared but there is no current registered searcher to handle
-         requests or to gain prewarming data from. -->
-    <!--
-    <listener event="firstSearcher" class="solr.QuerySenderListener">
-      <arr name="queries">
-        <lst> <str name="q">fast_warm</str> <str name="start">0</str> <str name="rows">10</str> </lst>
-      </arr>
-    </listener>
-    -->
-
-
-  </query>
-
-
-  <requestHandler name="/select" class="solr.SearchHandler">
-    <bool name="httpCaching">true</bool>
-  </requestHandler>
-
-  <requestHandler name="/update/extract" class="org.apache.solr.handler.extraction.ExtractingRequestHandler">
-    <str name="parseContext.config">parseContext.xml</str>
-  </requestHandler>
-
-  <requestHandler name="/update/extract/lit-def" class="org.apache.solr.handler.extraction.ExtractingRequestHandler">
-    <lst name="defaults">
-      <str name="literal.foo_s">x</str>
-    </lst>
-    <lst name="appends">
-      <str name="literal.bar_s">y</str>
-    </lst>
-    <lst name="invariants">
-      <str name="literal.zot_s">z</str>
-      <str name="uprefix">ignored_</str>
-    </lst>
-  </requestHandler>
-
-  <highlighting>
-   <!-- Configure the standard fragmenter -->
-   <fragmenter name="gap" class="org.apache.solr.highlight.GapFragmenter" default="true">
-    <lst name="defaults">
-     <int name="hl.fragsize">100</int>
-    </lst>
-   </fragmenter>
-
-   <fragmenter name="regex" class="org.apache.solr.highlight.RegexFragmenter">
-    <lst name="defaults">
-     <int name="hl.fragsize">70</int>
-    </lst>
-   </fragmenter>
-
-   <!-- Configure the standard formatter -->
-   <formatter name="html" class="org.apache.solr.highlight.HtmlFormatter" default="true">
-    <lst name="defaults">
-     <str name="hl.simple.pre"><![CDATA[<em>]]></str>
-     <str name="hl.simple.post"><![CDATA[</em>]]></str>
-    </lst>
-   </formatter>
-  </highlighting>
-
-
-  <requestDispatcher>
-    <requestParsers enableRemoteStreaming="false" multipartUploadLimitInKB="-1" />
-    <httpCaching lastModifiedFrom="openTime" etagSeed="Solr" never304="false">
-      <cacheControl>max-age=30, public</cacheControl>
-    </httpCaching>
-  </requestDispatcher>
-
-  <admin>
-    <defaultQuery>solr</defaultQuery>
-    <gettableFiles>solrconfig.xml schema.xml</gettableFiles>
-  </admin>
-
-  <!-- test getting system property -->
-  <propTest attr1="${solr.test.sys.prop1}-$${literal}"
-            attr2="${non.existent.sys.prop:default-from-config}">prefix-${solr.test.sys.prop2}-suffix</propTest>
-
-</config>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/stopwords.txt
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/stopwords.txt b/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/stopwords.txt
deleted file mode 100644
index 688e307..0000000
--- a/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/stopwords.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-stopworda
-stopwordb

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/synonyms.txt
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/synonyms.txt b/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/synonyms.txt
deleted file mode 100644
index a7624f0..0000000
--- a/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/synonyms.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-a => aa
-b => b1 b2
-c => c1,c2
-a\=>a => b\=>b
-a\,a => b\,b
-foo,bar,baz
-
-Television,TV,Televisions

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test-files/extraction/tiny.txt.gz
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test-files/extraction/tiny.txt.gz b/solr/contrib/extraction/src/test-files/extraction/tiny.txt.gz
deleted file mode 100644
index f5df886..0000000
Binary files a/solr/contrib/extraction/src/test-files/extraction/tiny.txt.gz and /dev/null differ

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test-files/extraction/version_control.txt
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test-files/extraction/version_control.txt b/solr/contrib/extraction/src/test-files/extraction/version_control.txt
deleted file mode 100644
index 7a89c5b..0000000
--- a/solr/contrib/extraction/src/test-files/extraction/version_control.txt
+++ /dev/null
@@ -1,18 +0,0 @@
-Solr Version Control System
- 
-Overview
- 
-The Solr source code resides in the Apache Subversion (SVN) repository.
-The command-line SVN client can be obtained here or as an optional package
-for cygwin.
-
-The TortoiseSVN GUI client for Windows can be obtained here. There
-are also SVN plugins available for older versions of Eclipse and 
-IntelliJ IDEA that don't have subversion support already included.
-
--------------------------------
-
-Note: This document is an excerpt from a document Licensed to the
-Apache Software Foundation (ASF) under one or more contributor
-license agreements. See the XML version (version_control.xml) for
-more details.

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test-files/extraction/version_control.xml
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test-files/extraction/version_control.xml b/solr/contrib/extraction/src/test-files/extraction/version_control.xml
deleted file mode 100644
index 4e09960..0000000
--- a/solr/contrib/extraction/src/test-files/extraction/version_control.xml
+++ /dev/null
@@ -1,42 +0,0 @@
-<?xml version="1.0"?>
-<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN" "http://forrest.apache.org/dtd/document-v20.dtd">
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<document>
-  
-  <header>
-    <title>Solr Version Control System</title>
-  </header>
-  
-  <body>
-  
-    <section>
-      <title>Overview</title>
-      <p>
-        The Solr source code resides in the Apache <a href="http://subversion.tigris.org/">Subversion (SVN)</a> repository.
-        The command-line SVN client can be obtained <a href="http://subversion.tigris.org/project_packages.html">here</a> or as an optional package for <a href="http://www.cygwin.com/">cygwin</a>.
-        The TortoiseSVN GUI client for Windows can be obtained <a href="http://tortoisesvn.tigris.org/">here</a>. There
-        are also SVN plugins available for older versions of <a href="http://subclipse.tigris.org/">Eclipse</a> and 
-        <a href="http://svnup.tigris.org/">IntelliJ IDEA</a> that don't have subversion support already included.
-      </p>
-    </section>
-    <p>Here is some more text.  It contains <a href="http://lucene.apache.org">a link</a>. </p>
-    <p>Text Here</p>
-  </body>
-  
-</document>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test-files/extraction/word2003.doc
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test-files/extraction/word2003.doc b/solr/contrib/extraction/src/test-files/extraction/word2003.doc
deleted file mode 100644
index e55827b..0000000
Binary files a/solr/contrib/extraction/src/test-files/extraction/word2003.doc and /dev/null differ

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test/java/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test/java/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java b/solr/contrib/extraction/src/test/java/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java
new file mode 100644
index 0000000..132b371
--- /dev/null
+++ b/solr/contrib/extraction/src/test/java/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java
@@ -0,0 +1,777 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.handler.extraction;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+import java.util.TimeZone;
+
+import org.apache.solr.SolrTestCaseJ4;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.util.ContentStream;
+import org.apache.solr.common.util.ContentStreamBase;
+import org.apache.solr.common.util.NamedList;
+import org.apache.solr.request.LocalSolrQueryRequest;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+import org.apache.solr.update.AddUpdateCommand;
+import org.apache.solr.update.processor.BufferingRequestProcessor;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+
+/**
+ *
+ *
+ **/
+public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
+
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    // Is the JDK/env affected by a known bug?
+    final String tzDisplayName = TimeZone.getDefault().getDisplayName(false, TimeZone.SHORT, Locale.US);
+    if (!tzDisplayName.matches("[A-Z]{3,}([+-]\\d\\d(:\\d\\d)?)?")) {
+      assertTrue("Is some other JVM affected?  Or bad regex? TzDisplayName: " + tzDisplayName,
+          System.getProperty("java.version").startsWith("11"));
+      assumeTrue("SOLR-12759 JDK 11 (1st release) and Tika 1.x can result in extracting dates in a bad format.", false);
+    }
+
+    initCore("solrconfig.xml", "schema.xml", getFile("extraction/solr").getAbsolutePath());
+  }
+
+  @Override
+  @Before
+  public void setUp() throws Exception {
+    super.setUp();
+    clearIndex();
+    assertU(commit());
+  }
+
+  @Test
+  public void testExtraction() throws Exception {
+    ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
+    assertTrue("handler is null and it shouldn't be", handler != null);
+    loadLocal("extraction/solr-word.pdf",
+            "fmap.created", "extractedDate",
+            "fmap.producer", "extractedProducer",
+            "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
+            "fmap.Creation-Date", "extractedDate",
+            "uprefix", "ignored_",
+            "fmap.Author", "extractedAuthor",
+            "fmap.content", "extractedContent",
+           "literal.id", "one",
+            "fmap.Last-Modified", "extractedDate"
+    );
+    assertQ(req("title:solr-word"), "//*[@numFound='0']");
+    assertU(commit());
+    assertQ(req("title:solr-word"), "//*[@numFound='1']");
+
+
+    loadLocal("extraction/simple.html", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
+            "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
+            "fmap.Author", "extractedAuthor",
+            "fmap.language", "extractedLanguage",
+            "literal.id", "two",
+            "uprefix", "ignored_",
+            "fmap.content", "extractedContent",
+            "fmap.Last-Modified", "extractedDate"
+    );
+    assertQ(req("title:Welcome"), "//*[@numFound='0']");
+    assertU(commit());
+    assertQ(req("title:Welcome"), "//*[@numFound='1']");
+
+    assertQ(req("extractedContent:distinctwords"),      "//*[@numFound='0']");
+    assertQ(req("extractedContent:distinct"),           "//*[@numFound='1']");
+    assertQ(req("extractedContent:words"),              "//*[@numFound='2']");
+    assertQ(req("extractedContent:\"distinct words\""), "//*[@numFound='1']");
+
+    loadLocal("extraction/simple.html",
+      "literal.id","simple2",
+      "uprefix", "t_",
+      "lowernames", "true",
+      "captureAttr", "true",
+      "fmap.a","t_href",
+      "fmap.content_type", "abcxyz",  // test that lowernames is applied before mapping, and uprefix is applied after mapping
+      "commit", "true"  // test immediate commit
+    );
+
+    // test that purposely causes a failure to print out the doc for test debugging
+    // assertQ(req("q","id:simple2","indent","true"), "//*[@numFound='0']");
+
+    // test both lowernames and unknown field mapping
+    //assertQ(req("+id:simple2 +t_content_type:[* TO *]"), "//*[@numFound='1']");
+    assertQ(req("+id:simple2 +t_href:[* TO *]"), "//*[@numFound='1']");
+    assertQ(req("+id:simple2 +t_abcxyz:[* TO *]"), "//*[@numFound='1']");
+    assertQ(req("+id:simple2 +t_content:serif"), "//*[@numFound='0']"); // make sure <style> content is excluded
+    assertQ(req("+id:simple2 +t_content:blur"), "//*[@numFound='0']"); // make sure <script> content is excluded
+
+    // make sure the fact there is an index-time boost does not fail the parsing
+    loadLocal("extraction/simple.html",
+      "literal.id","simple3",
+      "uprefix", "t_",
+      "lowernames", "true",
+      "captureAttr", "true",  "fmap.a","t_href",
+      "commit", "true"
+
+      ,"boost.t_href", "100.0"
+    );
+
+    assertQ(req("t_href:http"), "//*[@numFound='2']");
+    assertQ(req("t_href:http"), "//doc[2]/str[.='simple3']");
+    assertQ(req("+id:simple3 +t_content_type:[* TO *]"), "//*[@numFound='1']");//test lowercase and then uprefix
+
+    loadLocal("extraction/version_control.xml", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
+            "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
+            "fmap.Author", "extractedAuthor",
+            "literal.id", "three",
+            "uprefix", "ignored_",
+            "fmap.content", "extractedContent",
+            "fmap.language", "extractedLanguage",
+            "fmap.Last-Modified", "extractedDate"
+    );
+    assertQ(req("stream_name:version_control.xml"), "//*[@numFound='0']");
+    assertU(commit());
+    assertQ(req("stream_name:version_control.xml"), "//*[@numFound='1']");
+
+    loadLocal("extraction/word2003.doc", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
+            "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
+            "fmap.Author", "extractedAuthor",
+            "literal.id", "four",
+            "uprefix", "ignored_",
+            "fmap.content", "extractedContent",
+            "fmap.language", "extractedLanguage",
+            "fmap.Last-Modified", "extractedDate"
+    );
+    assertQ(req("title:\"Word 2003 Title\""), "//*[@numFound='0']");
+    // There is already a PDF file with this content:
+    assertQ(req("extractedContent:\"This is a test of PDF and Word extraction in Solr, it is only a test\""), "//*[@numFound='1']");
+    assertU(commit());
+    assertQ(req("title:\"Word 2003 Title\""), "//*[@numFound='1']");
+    // now 2 of them:
+    assertQ(req("extractedContent:\"This is a test of PDF and Word extraction in Solr, it is only a test\""), "//*[@numFound='2']");
+
+    // compressed file
+    loadLocal("extraction/tiny.txt.gz", 
+              "fmap.created", "extractedDate", 
+              "fmap.producer", "extractedProducer",
+              "fmap.creator", "extractedCreator", 
+              "fmap.Keywords", "extractedKeywords",
+              "fmap.Author", "extractedAuthor",
+              "uprefix", "ignored_",
+              "fmap.content", "extractedContent",
+              "fmap.language", "extractedLanguage",
+              "fmap.Last-Modified", "extractedDate",
+              "literal.id", "tiny.txt.gz");
+    assertU(commit());
+    assertQ(req("id:tiny.txt.gz")
+            , "//*[@numFound='1']"
+            , "//*/arr[@name='stream_name']/str[.='tiny.txt.gz']"
+            );
+
+    // compressed file
+    loadLocal("extraction/open-document.odt", 
+              "uprefix", "ignored_",
+              "fmap.content", "extractedContent",
+              "literal.id", "open-document");
+    assertU(commit());
+    assertQ(req("extractedContent:\"Práctica sobre GnuPG\"")
+            , "//*[@numFound='1']"
+            , "//*/arr[@name='stream_name']/str[.='open-document.odt']"
+            );
+  }
+
+  @Test
+  public void testCapture() throws Exception {
+    loadLocal("extraction/simple.html",
+        "literal.id","capture1",
+        "uprefix","t_",
+        "capture","div",
+        "fmap.div", "foo_t",
+        "commit", "true"
+    );
+    assertQ(req("+id:capture1 +t_content:Solr"), "//*[@numFound='1']");
+    assertQ(req("+id:capture1 +foo_t:\"here is some text in a div\""), "//*[@numFound='1']");
+
+    loadLocal("extraction/simple.html",
+        "literal.id", "capture2",
+        "captureAttr", "true",
+        "defaultField", "text",
+        "fmap.div", "div_t",
+        "fmap.a", "anchor_t",
+        "capture", "div",
+        "capture", "a",
+        "commit", "true"
+    );
+    assertQ(req("+id:capture2 +text:Solr"), "//*[@numFound='1']");
+    assertQ(req("+id:capture2 +div_t:\"here is some text in a div\""), "//*[@numFound='1']");
+    assertQ(req("+id:capture2 +anchor_t:http\\://www.apache.org"), "//*[@numFound='1']");
+    assertQ(req("+id:capture2 +anchor_t:link"), "//*[@numFound='1']");
+  }
+
+  @Test
+  public void testDefaultField() throws Exception {
+    ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
+    assertTrue("handler is null and it shouldn't be", handler != null);
+    try {
+      ignoreException("unknown field 'a'");
+      ignoreException("unknown field 'meta'");  // TODO: should this exception be happening?
+      loadLocal("extraction/simple.html",
+      "literal.id","simple2",
+      "lowernames", "true",
+        "captureAttr", "true",
+        //"fmap.content_type", "abcxyz",
+        "commit", "true"  // test immediate commit
+      );
+      fail("Should throw SolrException");
+    } catch (SolrException e) {
+      //do nothing
+    } finally {
+      resetExceptionIgnores();
+    }
+    
+
+    loadLocal("extraction/simple.html",
+      "literal.id","simple2",
+      ExtractingParams.DEFAULT_FIELD, "defaultExtr",//test that unmapped fields go to the text field when no uprefix is specified
+      "lowernames", "true",
+      "captureAttr", "true",
+      //"fmap.content_type", "abcxyz",
+      "commit", "true"  // test immediate commit
+    );
+    assertQ(req("id:simple2"), "//*[@numFound='1']");
+    assertQ(req("defaultExtr:http\\:\\/\\/www.apache.org"), "//*[@numFound='1']");
+
+    //Test when both uprefix and default are specified.
+    loadLocal("extraction/simple.html",
+      "literal.id","simple2",
+      ExtractingParams.DEFAULT_FIELD, "defaultExtr",//test that unmapped fields go to the text field when no uprefix is specified
+            ExtractingParams.UNKNOWN_FIELD_PREFIX, "t_",
+      "lowernames", "true",
+      "captureAttr", "true",
+      "fmap.a","t_href",
+      //"fmap.content_type", "abcxyz",
+      "commit", "true"  // test immediate commit
+    );
+    assertQ(req("+id:simple2 +t_href:[* TO *]"), "//*[@numFound='1']");
+  }
+
+  @Test
+  public void testLiterals() throws Exception {
+    ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
+    assertTrue("handler is null and it shouldn't be", handler != null);
+    //test literal
+    loadLocal("extraction/version_control.xml", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
+            "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
+            "fmap.Author", "extractedAuthor",
+            "fmap.content", "extractedContent",
+            "literal.id", "one",
+            "uprefix", "ignored_",
+            "fmap.language", "extractedLanguage",
+            "literal.extractionLiteralMV", "one",
+            "literal.extractionLiteralMV", "two",
+            "fmap.Last-Modified", "extractedDate"
+
+    );
+    assertQ(req("stream_name:version_control.xml"), "//*[@numFound='0']");
+    assertU(commit());
+    assertQ(req("stream_name:version_control.xml"), "//*[@numFound='1']");
+
+    assertQ(req("extractionLiteralMV:one"), "//*[@numFound='1']");
+    assertQ(req("extractionLiteralMV:two"), "//*[@numFound='1']");
+
+    try {
+      loadLocal("extraction/version_control.xml", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
+              "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
+              "fmap.Author", "extractedAuthor",
+              "fmap.content", "extractedContent",
+              "literal.id", "two",
+              "fmap.language", "extractedLanguage",
+              "literal.extractionLiteral", "one",
+              "literal.extractionLiteral", "two",
+              "fmap.X-Parsed-By", "ignored_parser",
+              "fmap.Last-Modified", "extractedDate"
+      );
+      // TODO: original author did not specify why an exception should be thrown... how to fix?
+      // assertTrue("Exception should have been thrown", false);
+    } catch (SolrException e) {
+      //nothing to see here, move along
+    }
+
+    loadLocal("extraction/version_control.xml", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
+            "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
+            "fmap.Author", "extractedAuthor",
+            "fmap.content", "extractedContent",
+            "literal.id", "three",
+            "fmap.language", "extractedLanguage",
+            "literal.extractionLiteral", "one",
+            "fmap.X-Parsed-By", "ignored_parser",
+            "fmap.Last-Modified", "extractedDate"
+    );
+    assertU(commit());
+    assertQ(req("extractionLiteral:one"), "//*[@numFound='1']");
+
+  }
+
+  public void testLiteralDefaults() throws Exception {
+
+    // sanity check config
+    loadLocalFromHandler("/update/extract/lit-def",
+                         "extraction/simple.html",
+                         "literal.id", "lit-def-simple");
+    assertU(commit());
+    assertQ(req("q", "id:lit-def-simple")
+            , "//*[@numFound='1']"
+            , "count(//arr[@name='foo_s']/str)=1"
+            , "//arr[@name='foo_s']/str[.='x']"
+            , "count(//arr[@name='bar_s']/str)=1"
+            , "//arr[@name='bar_s']/str[.='y']"
+            , "count(//arr[@name='zot_s']/str)=1"
+            , "//arr[@name='zot_s']/str[.='z']"
+            ); 
+    
+    // override the default foo_s
+    loadLocalFromHandler("/update/extract/lit-def",
+                         "extraction/simple.html",
+                         "literal.foo_s", "1111",
+                         "literal.id", "lit-def-simple");
+    assertU(commit());
+    assertQ(req("q", "id:lit-def-simple")
+            , "//*[@numFound='1']"
+            , "count(//arr[@name='foo_s']/str)=1"
+            , "//arr[@name='foo_s']/str[.='1111']"
+            , "count(//arr[@name='bar_s']/str)=1"
+            , "//arr[@name='bar_s']/str[.='y']"
+            , "count(//arr[@name='zot_s']/str)=1"
+            , "//arr[@name='zot_s']/str[.='z']"
+            ); 
+
+    // pre-pend the bar_s
+    loadLocalFromHandler("/update/extract/lit-def",
+                         "extraction/simple.html",
+                         "literal.bar_s", "2222",
+                         "literal.id", "lit-def-simple");
+    assertU(commit());
+    assertQ(req("q", "id:lit-def-simple")
+            , "//*[@numFound='1']"
+            , "count(//arr[@name='foo_s']/str)=1"
+            , "//arr[@name='foo_s']/str[.='x']"
+            , "count(//arr[@name='bar_s']/str)=2"
+            , "//arr[@name='bar_s']/str[.='2222']"
+            , "//arr[@name='bar_s']/str[.='y']"
+            , "count(//arr[@name='zot_s']/str)=1"
+            , "//arr[@name='zot_s']/str[.='z']"
+            ); 
+
+    // invariant zot_s cannot be changed
+    loadLocalFromHandler("/update/extract/lit-def",
+                         "extraction/simple.html",
+                         "literal.zot_s", "3333",
+                         "literal.id", "lit-def-simple");
+    assertU(commit());
+    assertQ(req("q", "id:lit-def-simple")
+            , "//*[@numFound='1']"
+            , "count(//arr[@name='foo_s']/str)=1"
+            , "//arr[@name='foo_s']/str[.='x']"
+            , "count(//arr[@name='bar_s']/str)=1"
+            , "//arr[@name='bar_s']/str[.='y']"
+            , "count(//arr[@name='zot_s']/str)=1"
+            , "//arr[@name='zot_s']/str[.='z']"
+            ); 
+    
+  }
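+
+  // For reference, testLiteralDefaults above assumes a "/update/extract/lit-def" handler registered
+  // with literal defaults/appends/invariants, roughly like the following sketch (the actual test
+  // solrconfig may differ in detail):
+  //   <requestHandler name="/update/extract/lit-def"
+  //                   class="org.apache.solr.handler.extraction.ExtractingRequestHandler">
+  //     <lst name="defaults">   <str name="literal.foo_s">x</str> </lst>
+  //     <lst name="appends">    <str name="literal.bar_s">y</str> </lst>
+  //     <lst name="invariants"> <str name="literal.zot_s">z</str> </lst>
+  //   </requestHandler>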
+
+  @Test
+  public void testPlainTextSpecifyingMimeType() throws Exception {
+    ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
+    assertTrue("handler is null and it shouldn't be", handler != null);
+
+    // Load plain text specifying MIME type:
+    loadLocal("extraction/version_control.txt", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
+            "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
+            "fmap.Author", "extractedAuthor",
+            "literal.id", "one",
+            "fmap.language", "extractedLanguage",
+            "fmap.X-Parsed-By", "ignored_parser",
+            "fmap.content", "extractedContent",
+            ExtractingParams.STREAM_TYPE, "text/plain"
+    );
+    assertQ(req("extractedContent:Apache"), "//*[@numFound='0']");
+    assertU(commit());
+    assertQ(req("extractedContent:Apache"), "//*[@numFound='1']");
+  }
+
+  @Test
+  public void testPlainTextSpecifyingResourceName() throws Exception {
+    ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
+    assertTrue("handler is null and it shouldn't be", handler != null);
+
+    // Load plain text specifying filename
+    loadLocal("extraction/version_control.txt", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
+            "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
+            "fmap.Author", "extractedAuthor",
+            "literal.id", "one",
+            "fmap.language", "extractedLanguage",
+            "fmap.X-Parsed-By", "ignored_parser",
+            "fmap.content", "extractedContent",
+            ExtractingParams.RESOURCE_NAME, "extraction/version_control.txt"
+    );
+    assertQ(req("extractedContent:Apache"), "//*[@numFound='0']");
+    assertU(commit());
+    assertQ(req("extractedContent:Apache"), "//*[@numFound='1']");
+  }
+
+  @Test
+  public void testCommitWithin() throws Exception {
+    ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
+    assertTrue("handler is null and it shouldn't be", handler != null);
+    
+    SolrQueryRequest req = req("literal.id", "one",
+                               ExtractingParams.RESOURCE_NAME, "extraction/version_control.txt",
+                               "commitWithin", "200"
+                               );
+    SolrQueryResponse rsp = new SolrQueryResponse();
+    BufferingRequestProcessor p = new BufferingRequestProcessor(null);
+
+    ExtractingDocumentLoader loader = (ExtractingDocumentLoader) handler.newLoader(req, p);
+    loader.load(req, rsp, new ContentStreamBase.FileStream(getFile("extraction/version_control.txt")),p);
+
+    AddUpdateCommand add = p.addCommands.get(0);
+    assertEquals(200, add.commitWithin);
+
+    req.close();
+  }
+
+  // Note: if you load a plain text file specifying neither a MIME type nor a filename, extraction will silently
+  // fail: Tika's automatic MIME type detection fails, and the fallback parser returns an empty string.
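+  //
+  // A minimal sketch (not part of the original tests) of supplying both hints explicitly,
+  // reusing the loadLocal helper and ExtractingParams constants from the tests above:
+  //   loadLocal("extraction/version_control.txt",
+  //       "literal.id", "one",
+  //       ExtractingParams.STREAM_TYPE, "text/plain",                        // MIME type hint
+  //       ExtractingParams.RESOURCE_NAME, "extraction/version_control.txt"); // filename hint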
+
+  @Test
+  public void testExtractOnly() throws Exception {
+    ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
+    assertTrue("handler is null and it shouldn't be", handler != null);
+    SolrQueryResponse rsp = loadLocal("extraction/solr-word.pdf", ExtractingParams.EXTRACT_ONLY, "true");
+    assertTrue("rsp is null and it shouldn't be", rsp != null);
+    NamedList list = rsp.getValues();
+
+    String extraction = (String) list.get("solr-word.pdf");
+    assertTrue("extraction is null and it shouldn't be", extraction != null);
+    assertTrue(extraction + " does not contain " + "solr-word", extraction.indexOf("solr-word") != -1);
+
+    NamedList nl = (NamedList) list.get("solr-word.pdf_metadata");
+    assertTrue("nl is null and it shouldn't be", nl != null);
+    Object title = nl.get("title");
+    assertTrue("title is null and it shouldn't be", title != null);
+    assertTrue(extraction.indexOf("<?xml") != -1);
+
+    rsp = loadLocal("extraction/solr-word.pdf", ExtractingParams.EXTRACT_ONLY, "true",
+            ExtractingParams.EXTRACT_FORMAT, ExtractingDocumentLoader.TEXT_FORMAT);
+    assertTrue("rsp is null and it shouldn't be", rsp != null);
+    list = rsp.getValues();
+
+    extraction = (String) list.get("solr-word.pdf");
+    assertTrue("extraction is null and it shouldn't be", extraction != null);
+    assertTrue(extraction + " does not contain " + "solr-word", extraction.indexOf("solr-word") != -1);
+    assertTrue(extraction.indexOf("<?xml") == -1);
+
+    nl = (NamedList) list.get("solr-word.pdf_metadata");
+    assertTrue("nl is null and it shouldn't be", nl != null);
+    title = nl.get("title");
+    assertTrue("title is null and it shouldn't be", title != null);
+  }
+
+  @Test
+  public void testXPath() throws Exception {
+    ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
+    assertTrue("handler is null and it shouldn't be", handler != null);
+    SolrQueryResponse rsp = loadLocal("extraction/example.html",
+            ExtractingParams.XPATH_EXPRESSION, "/xhtml:html/xhtml:body/xhtml:a/descendant::node()",
+            ExtractingParams.EXTRACT_ONLY, "true"
+    );
+    assertTrue("rsp is null and it shouldn't be", rsp != null);
+    NamedList list = rsp.getValues();
+    String val = (String) list.get("example.html");
+    assertEquals("News", val.trim()); //there is only one matching <a> tag
+
+    loadLocal("extraction/example.html",
+        "literal.id", "example1",
+        "captureAttr", "true",
+        "defaultField", "text",
+        "capture", "div",
+        "fmap.div", "foo_t",
+        "boost.foo_t", "3",
+        "xpath", "/xhtml:html/xhtml:body/xhtml:div//node()",
+        "commit", "true"
+    );
+    assertQ(req("+id:example1 +foo_t:\"here is some text in a div\""), "//*[@numFound='1']");
+  }
+
+  /** Test that Arabic PDF extraction is functional. */
+  @Test
+  public void testArabicPDF() throws Exception {
+    ExtractingRequestHandler handler = (ExtractingRequestHandler) 
+      h.getCore().getRequestHandler("/update/extract");
+    assertTrue("handler is null and it shouldn't be", handler != null);
+
+    loadLocal("extraction/arabic.pdf", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
+        "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
+        "fmap.Creation-Date", "extractedDate",
+        "fmap.Author", "extractedAuthor",
+        "uprefix", "ignored_",
+        "fmap.content", "wdf_nocase",
+       "literal.id", "one",
+        "fmap.Last-Modified", "extractedDate");
+    assertQ(req("wdf_nocase:السلم"), "//result[@numFound=0]");
+    assertU(commit());
+    assertQ(req("wdf_nocase:السلم"), "//result[@numFound=1]");
+  }
+
+  @Test
+  public void testTikaExceptionHandling() throws Exception {
+    ExtractingRequestHandler handler = (ExtractingRequestHandler) 
+      h.getCore().getRequestHandler("/update/extract");
+    assertTrue("handler is null and it shouldn't be", handler != null);
+
+    try {
+      loadLocal("extraction/password-is-solrcell.docx",
+          "literal.id", "one");
+      fail("TikaException is expected when extracting text from a password-protected Word file without supplying a password.");
+    } catch (Exception expected) {}
+    assertU(commit());
+    assertQ(req("*:*"), "//result[@numFound=0]");
+
+    try {
+      loadLocal("extraction/password-is-solrcell.docx", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
+          "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
+          "fmap.Creation-Date", "extractedDate",
+          "uprefix", "ignored_",
+          "fmap.Author", "extractedAuthor",
+          "fmap.content", "wdf_nocase",
+          "literal.id", "one",
+          "ignoreTikaException", "true",  // set ignore flag
+          "fmap.Last-Modified", "extractedDate");
+    } catch (Exception e) {
+      fail("TikaException should be ignored.");
+    }
+    assertU(commit());
+    assertQ(req("*:*"), "//result[@numFound=1]");
+  }
+  
+  @Test
+  public void testWrongStreamType() throws Exception {
+    ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
+    assertTrue("handler is null and it shouldn't be", handler != null);
+
+    try {
+      // Load plain text specifying a different MIME type; this should fail
+      loadLocal("extraction/version_control.txt",
+              "literal.id", "one",
+              ExtractingParams.STREAM_TYPE, "application/pdf"
+      );
+      fail("SolrException is expected because the wrong parser is specified for the file type");
+    } catch (Exception expected) {}
+
+    try {
+      // Load plain text specifying a nonexistent MIME type; this should fail
+      loadLocal("extraction/version_control.txt",
+              "literal.id", "one",
+              ExtractingParams.STREAM_TYPE, "foo/bar"
+      );
+      fail("SolrException is expected because a nonexistent parser is specified");
+    } catch (Exception expected) {}
+  }
+
+  public void testLiteralsOverride() throws Exception {
+    ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
+    assertTrue("handler is null and it shouldn't be", handler != null);
+ 
+    assertQ(req("*:*"), "//*[@numFound='0']");
+
+    // Here Tika should parse out a title for this document:
+    loadLocal("extraction/solr-word.pdf", 
+            "fmap.created", "extractedDate", 
+            "fmap.producer", "extractedProducer",
+            "fmap.creator", "extractedCreator", 
+            "fmap.Keywords", "extractedKeywords",
+            "fmap.Author", "extractedAuthor",
+            "literal.id", "three",
+            "fmap.content", "extractedContent",
+            "fmap.language", "extractedLanguage",
+            "fmap.Creation-Date", "extractedDate",
+            "uprefix", "ignored_",
+            "fmap.Last-Modified", "extractedDate");
+
+    // Here the literal value should override the Tika-parsed title:
+    loadLocal("extraction/solr-word.pdf",
+            "literal.title", "wolf-man",
+            "fmap.created", "extractedDate",
+            "fmap.producer", "extractedProducer",
+            "fmap.creator", "extractedCreator",
+            "fmap.Keywords", "extractedKeywords",
+            "fmap.Author", "extractedAuthor",
+            "literal.id", "four",
+            "fmap.content", "extractedContent",
+            "fmap.language", "extractedLanguage",
+            "fmap.Creation-Date", "extractedDate",
+            "uprefix", "ignored_",
+            "fmap.Last-Modified", "extractedDate");
+
+    // Here we mimic the old behaviour where literals are added, not overridden
+    loadLocal("extraction/solr-word.pdf",
+            "literalsOverride", "false",
+            // Trick: first map the metadata title to an ignored field, then supply the literal title
+            "fmap.title", "ignored_a",
+            "literal.title", "old-behaviour",
+            "literal.extractedKeywords", "literalkeyword",
+            "fmap.created", "extractedDate",
+            "fmap.producer", "extractedProducer",
+            "fmap.creator", "extractedCreator",
+            "fmap.Keywords", "extractedKeywords",
+            "fmap.Author", "extractedAuthor",
+            "literal.id", "five",
+            "fmap.content", "extractedContent",
+            "fmap.language", "extractedLanguage",
+            "fmap.Creation-Date", "extractedDate",
+            "uprefix", "ignored_",
+            "fmap.Last-Modified", "extractedDate");
+
+    assertU(commit());
+
+    assertQ(req("title:solr-word"), "//*[@numFound='1']");
+    assertQ(req("title:wolf-man"), "//*[@numFound='1']");
+    assertQ(req("extractedKeywords:(solr AND word AND pdf AND literalkeyword)"), "//*[@numFound='1']");
+  }
+
+  @Test
+  public void testPdfWithImages() throws Exception {
+    // Tests the ability to configure the ParseContext (for example, to extract embedded images from a PDF)
+    loadLocal("extraction/pdf-with-image.pdf",
+        "fmap.created", "extractedDate",
+        "fmap.producer", "extractedProducer",
+        "fmap.creator", "extractedCreator",
+        "fmap.Keywords", "extractedKeywords",
+        "fmap.Creation-Date", "extractedDate",
+        "uprefix", "ignored_",
+        "fmap.Author", "extractedAuthor",
+        "fmap.content", "wdf_nocase",
+        "literal.id", "pdfWithImage",
+        "resource.name", "pdf-with-image.pdf",
+        "resource.password", "solrRules",
+        "fmap.Last-Modified", "extractedDate");
+
+    assertQ(req("wdf_nocase:\"embedded:image0.jpg\""), "//*[@numFound='0']");
+    assertU(commit());
+    assertQ(req("wdf_nocase:\"embedded:image0.jpg\""), "//*[@numFound='1']");
+  }
+
+  @Test
+  public void testPasswordProtected() throws Exception {
+    // PDF, Passwords from resource.password
+    loadLocal("extraction/encrypted-password-is-solrRules.pdf",
+        "fmap.created", "extractedDate", 
+        "fmap.producer", "extractedProducer",
+        "fmap.creator", "extractedCreator", 
+        "fmap.Keywords", "extractedKeywords",
+        "fmap.Creation-Date", "extractedDate",
+        "uprefix", "ignored_",
+        "fmap.Author", "extractedAuthor",
+        "fmap.content", "wdf_nocase",
+        "literal.id", "pdfpwliteral",
+        "resource.name", "encrypted-password-is-solrRules.pdf",
+        "resource.password", "solrRules",
+        "fmap.Last-Modified", "extractedDate");
+
+    // PDF, Passwords from passwords property file
+    loadLocal("extraction/encrypted-password-is-solrRules.pdf",
+        "fmap.created", "extractedDate", 
+        "fmap.producer", "extractedProducer",
+        "fmap.creator", "extractedCreator", 
+        "fmap.Keywords", "extractedKeywords",
+        "fmap.Creation-Date", "extractedDate",
+        "uprefix", "ignored_",
+        "fmap.Author", "extractedAuthor",
+        "fmap.content", "wdf_nocase",
+        "literal.id", "pdfpwfile",
+        "resource.name", "encrypted-password-is-solrRules.pdf",
+        "passwordsFile", "passwordRegex.properties", // Passwords-file
+        "fmap.Last-Modified", "extractedDate");
+
+    // DOCX, Explicit password
+    loadLocal("extraction/password-is-Word2010.docx", 
+        "fmap.created", "extractedDate", 
+        "fmap.producer", "extractedProducer",
+        "fmap.creator", "extractedCreator", 
+        "fmap.Keywords", "extractedKeywords",
+        "fmap.Creation-Date", "extractedDate",
+        "fmap.Author", "extractedAuthor",
+        "fmap.content", "wdf_nocase",
+        "uprefix", "ignored_",
+        "literal.id", "docxpwliteral",
+        "resource.name", "password-is-Word2010.docx",
+        "resource.password", "Word2010", // Explicit password
+        "fmap.Last-Modified", "extractedDate");
+
+    // DOCX, Passwords from file
+    loadLocal("extraction/password-is-Word2010.docx", 
+        "fmap.created", "extractedDate",
+        "fmap.producer", "extractedProducer",
+        "fmap.creator", "extractedCreator", 
+        "fmap.Keywords", "extractedKeywords",
+        "fmap.Creation-Date", "extractedDate",
+        "uprefix", "ignored_",
+        "fmap.Author", "extractedAuthor",
+        "fmap.content", "wdf_nocase",
+        "literal.id", "docxpwfile",
+        "resource.name", "password-is-Word2010.docx",
+        "passwordsFile", "passwordRegex.properties", // Passwords-file
+        "fmap.Last-Modified", "extractedDate");
+    
+    assertU(commit());
+    Thread.sleep(100);
+    assertQ(req("wdf_nocase:\"This is a test of PDF\""), "//*[@numFound='2']");
+    assertQ(req("wdf_nocase:\"Test password protected word doc\""), "//*[@numFound='2']");
+  }
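+
+  // For reference, the passwordsFile referenced above maps file-name regexes to passwords, one
+  // mapping per line; a sketch of what passwordRegex.properties might contain (see the actual
+  // test-files resource for the real contents):
+  //   .*\.pdf$=solrRules
+  //   .*\.docx$=Word2010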
+  
+  SolrQueryResponse loadLocalFromHandler(String handler, String filename, 
+                                         String... args) throws Exception {
+                              
+    LocalSolrQueryRequest req = (LocalSolrQueryRequest) req(args);
+    try {
+      // TODO: stop using locally defined streams once stream.file and
+      // stream.body work everywhere
+      List<ContentStream> cs = new ArrayList<>();
+      cs.add(new ContentStreamBase.FileStream(getFile(filename)));
+      req.setContentStreams(cs);
+      return h.queryAndResponse(handler, req);
+    } finally {
+      req.close();
+    }
+  }
+
+  SolrQueryResponse loadLocal(String filename, String... args) throws Exception {
+    return loadLocalFromHandler("/update/extract", filename, args);
+  }
+
+
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test/java/org/apache/solr/handler/extraction/ParseContextConfigTest.java
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test/java/org/apache/solr/handler/extraction/ParseContextConfigTest.java b/solr/contrib/extraction/src/test/java/org/apache/solr/handler/extraction/ParseContextConfigTest.java
new file mode 100644
index 0000000..8aeeaad
--- /dev/null
+++ b/solr/contrib/extraction/src/test/java/org/apache/solr/handler/extraction/ParseContextConfigTest.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.handler.extraction;
+
+import javax.xml.parsers.DocumentBuilderFactory;
+import java.nio.file.Paths;
+
+import org.apache.solr.SolrTestCaseJ4;
+import org.apache.solr.core.SolrResourceLoader;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.pdf.PDFParserConfig;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+
+public class ParseContextConfigTest extends SolrTestCaseJ4 {
+
+  public void testAll() throws Exception {
+    Document document = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
+    Element entries = document.createElement("entries");
+    Element entry = document.createElement("entry");
+
+    entry.setAttribute("class", "org.apache.tika.parser.pdf.PDFParserConfig");
+    entry.setAttribute("impl", "org.apache.tika.parser.pdf.PDFParserConfig");
+
+    Element property = document.createElement("property");
+
+    property.setAttribute("name", "extractInlineImages");
+    property.setAttribute("value", "true");
+    entry.appendChild(property);
+    entries.appendChild(entry);
+
+    ParseContext parseContext = new ParseContextConfig(new SolrResourceLoader(Paths.get(".")), entries).create();
+
+    PDFParserConfig pdfParserConfig = parseContext.get(PDFParserConfig.class);
+
+    assertTrue(pdfParserConfig.getExtractInlineImages());
+  }
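+
+  // The DOM built above mirrors the on-disk parseContext config format that the extraction
+  // handler can load via its "parseContext.config" init argument; a sketch of the equivalent
+  // XML file (assuming the same entry/property element names used in this test):
+  //   <entries>
+  //     <entry class="org.apache.tika.parser.pdf.PDFParserConfig"
+  //            impl="org.apache.tika.parser.pdf.PDFParserConfig">
+  //       <property name="extractInlineImages" value="true"/>
+  //     </entry>
+  //   </entries>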
+
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test/java/org/apache/solr/handler/extraction/TestXLSXResponseWriter.java
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test/java/org/apache/solr/handler/extraction/TestXLSXResponseWriter.java b/solr/contrib/extraction/src/test/java/org/apache/solr/handler/extraction/TestXLSXResponseWriter.java
new file mode 100644
index 0000000..7d37844
--- /dev/null
+++ b/solr/contrib/extraction/src/test/java/org/apache/solr/handler/extraction/TestXLSXResponseWriter.java
@@ -0,0 +1,257 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.handler.extraction;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.time.Instant;
+import java.util.Date;
+
+import org.apache.poi.ss.usermodel.Cell;
+import org.apache.poi.ss.usermodel.Row;
+import org.apache.poi.xssf.usermodel.XSSFWorkbook;
+import org.apache.poi.xssf.usermodel.XSSFSheet;
+
+import org.apache.solr.SolrTestCaseJ4;
+import org.apache.solr.common.SolrDocument;
+import org.apache.solr.common.SolrDocumentList;
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.QueryResponseWriter;
+import org.apache.solr.response.SolrQueryResponse;
+import org.apache.solr.response.RawResponseWriter;
+import org.apache.solr.search.SolrReturnFields;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+public class TestXLSXResponseWriter extends SolrTestCaseJ4 {
+
+  private static XLSXResponseWriter writerXlsx;
+
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    System.setProperty("enable.update.log", "false");
+    initCore("solrconfig.xml","schema.xml",getFile("extraction/solr").getAbsolutePath());
+    createIndex();
+    //find a reference to the default response writer so we can redirect its output later
+    SolrCore testCore = h.getCore();
+    QueryResponseWriter writer = testCore.getQueryResponseWriter("xlsx");
+    if (writer instanceof XLSXResponseWriter) {
+      writerXlsx = (XLSXResponseWriter) testCore.getQueryResponseWriter("xlsx");
+    } else {
+      throw new Exception("XLSXResponseWriter not registered with solr core");
+    }
+  }
+
+  public static void createIndex() {
+    assertU(adoc("id","1", "foo_i","-1", "foo_s","hi", "foo_l","12345678987654321", "foo_b","false", "foo_f","1.414","foo_d","-1.0E300","foo_dt1","2000-01-02T03:04:05Z"));
+    assertU(adoc("id","2", "v_ss","hi",  "v_ss","there", "v2_ss","nice", "v2_ss","output", "shouldbeunstored","foo"));
+    assertU(adoc("id","3", "shouldbeunstored","foo"));
+    assertU(adoc("id","4", "foo_s1","foo"));
+    assertU(commit());
+  }
+
+  @AfterClass
+  public static void cleanupWriter() throws Exception {
+    writerXlsx = null;
+  }
+
+  @Test
+  public void testStructuredDataViaBaseWriters() throws Exception {
+    SolrQueryResponse rsp = new SolrQueryResponse();
+    // Don't send a ContentStream back; this falls back to the configured base writer.
+    // But abuse the CONTENT key to ensure the writer is also checking the type.
+    rsp.add(RawResponseWriter.CONTENT, "test");
+    rsp.add("foo", "bar");
+
+    SolrQueryRequest r = req();
+
+    // check Content-Type
+    assertEquals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", writerXlsx.getContentType(r, rsp));
+
+    // test our basic types, and that fields come back in the requested order
+    XSSFSheet resultSheet = getWSResultForQuery(req("q","id:1", "wt","xlsx", "fl","id,foo_s,foo_i,foo_l,foo_b,foo_f,foo_d,foo_dt1"));
+
+    assertEquals("id,foo_s,foo_i,foo_l,foo_b,foo_f,foo_d,foo_dt1\n1,hi,-1,12345678987654321,F,1.414,-1.0E300,2000-01-02T03:04:05Z\n"
+        , getStringFromSheet(resultSheet));
+
+    resultSheet = getWSResultForQuery(req("q","id:1^0", "wt","xlsx", "fl","id,score,foo_s"));
+    // test retrieving score
+    assertEquals("id,score,foo_s\n1,0.0,hi\n", getStringFromSheet(resultSheet));
+
+    resultSheet = getWSResultForQuery(req("q","id:1^0", "wt","xlsx", "colname.id", "I.D.", "colwidth.id", "10",
+                                      "fl","id,score,foo_s"));
+    // test override colname/width
+    assertEquals("I.D.,score,foo_s\n1,0.0,hi\n", getStringFromSheet(resultSheet));
+    // test colwidth (value returned is in 256ths of a character as per excel standard)
+    assertEquals(10*256, resultSheet.getColumnWidth(0));
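+    // worked example: colwidth.id=10 characters -> 10 * 256 = 2560 width units, as asserted above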
+
+    resultSheet = getWSResultForQuery(req("q","id:2", "wt","xlsx", "fl","id,v_ss"));
+    // test multivalued
+    assertEquals("id,v_ss\n2,hi; there\n", getStringFromSheet(resultSheet));
+
+    // test retrieving fields from index
+    resultSheet = getWSResultForQuery(req("q","*:*", "wt","xslx", "fl","*,score"));
+    String result = getStringFromSheet(resultSheet);
+    for (String field : "id,foo_s,foo_i,foo_l,foo_b,foo_f,foo_d,foo_dt1,v_ss,v2_ss,score".split(",")) {
+      assertTrue(result.indexOf(field) >= 0);
+    }
+
+    // test null values
+    resultSheet = getWSResultForQuery(req("q","id:2", "wt","xlsx", "fl","id,foo_s,v_ss"));
+    assertEquals("id,foo_s,v_ss\n2,,hi; there\n", getStringFromSheet(resultSheet));
+
+    // now test SolrDocumentList
+    SolrDocument d = new SolrDocument();
+    SolrDocument d1 = d;
+    d.addField("id","1");
+    d.addField("foo_i",-1);
+    d.addField("foo_s","hi");
+    d.addField("foo_l","12345678987654321L");
+    d.addField("foo_b",false);
+    d.addField("foo_f",1.414f);
+    d.addField("foo_d",-1.0E300);
+    d.addField("foo_dt1", new Date(Instant.parse("2000-01-02T03:04:05Z").toEpochMilli()));
+    d.addField("score", "2.718");
+
+    d = new SolrDocument();
+    SolrDocument d2 = d;
+    d.addField("id","2");
+    d.addField("v_ss","hi");
+    d.addField("v_ss","there");
+    d.addField("v2_ss","nice");
+    d.addField("v2_ss","output");
+    d.addField("score", "89.83");
+    d.addField("shouldbeunstored","foo");
+
+    SolrDocumentList sdl = new SolrDocumentList();
+    sdl.add(d1);
+    sdl.add(d2);
+    
+    SolrQueryRequest req = req("q","*:*");
+    rsp = new SolrQueryResponse();
+    rsp.addResponse(sdl);
+
+    rsp.setReturnFields( new SolrReturnFields("id,foo_s", req) );
+
+    resultSheet = getWSResultForQuery(req, rsp);
+    assertEquals("id,foo_s\n1,hi\n2,\n", getStringFromSheet(resultSheet));
+
+    // try scores
+    rsp.setReturnFields( new SolrReturnFields("id,score,foo_s", req) );
+
+    resultSheet = getWSResultForQuery(req, rsp);
+    assertEquals("id,score,foo_s\n1,2.718,hi\n2,89.83,\n", getStringFromSheet(resultSheet));
+
+    // get field values from docs... should be ordered and not include score unless requested
+    rsp.setReturnFields( new SolrReturnFields("*", req) );
+
+    resultSheet = getWSResultForQuery(req, rsp);
+    assertEquals("id,foo_i,foo_s,foo_l,foo_b,foo_f,foo_d,foo_dt1,v_ss,v2_ss\n" +
+        "1,-1,hi,12345678987654321L,false,1.414,-1.0E300,2000-01-02T03:04:05Z,,\n" +
+        "2,,,,,,,,hi; there,nice; output\n", getStringFromSheet(resultSheet));
+
+    // get field values and scores - just check that the scores are there... we don't guarantee where
+    rsp.setReturnFields( new SolrReturnFields("*,score", req) );
+    resultSheet = getWSResultForQuery(req, rsp);
+    String s = getStringFromSheet(resultSheet);
+    assertTrue(s.indexOf("score") >=0 && s.indexOf("2.718") > 0 && s.indexOf("89.83") > 0 );
+    
+    // Test field globs
+    rsp.setReturnFields( new SolrReturnFields("id,foo*", req) );
+    resultSheet = getWSResultForQuery(req, rsp);
+    assertEquals("id,foo_i,foo_s,foo_l,foo_b,foo_f,foo_d,foo_dt1\n" +
+        "1,-1,hi,12345678987654321L,false,1.414,-1.0E300,2000-01-02T03:04:05Z\n" +
+        "2,,,,,,,\n", getStringFromSheet(resultSheet));
+
+    rsp.setReturnFields( new SolrReturnFields("id,*_d*", req) );
+    resultSheet = getWSResultForQuery(req, rsp);
+    assertEquals("id,foo_d,foo_dt1\n" +
+        "1,-1.0E300,2000-01-02T03:04:05Z\n" +
+        "2,,\n", getStringFromSheet(resultSheet));
+
+    // Test function queries
+    rsp.setReturnFields( new SolrReturnFields("sum(1,1),id,exists(foo_s1),div(9,1),foo_f", req) );
+    resultSheet = getWSResultForQuery(req, rsp);
+    assertEquals("sum(1,1),id,exists(foo_s1),div(9,1),foo_f\n" +
+        ",1,,,1.414\n" +
+        ",2,,,\n", getStringFromSheet(resultSheet));
+
+    // Test transformers
+    rsp.setReturnFields( new SolrReturnFields("mydocid:[docid],[explain]", req) );
+    resultSheet = getWSResultForQuery(req, rsp);
+    assertEquals("mydocid,[explain]\n" +
+        ",\n" +
+        ",\n", getStringFromSheet(resultSheet));
+
+    req.close();
+  }
+  
+
+  @Test
+  public void testPseudoFields() throws Exception {
+    // Use Pseudo Field
+    SolrQueryRequest req = req("q","id:1", "wt","xlsx", "fl","XXX:id,foo_s");
+    XSSFSheet resultSheet = getWSResultForQuery(req);
+    assertEquals("XXX,foo_s\n1,hi\n", getStringFromSheet(resultSheet));
+    
+    String txt = getStringFromSheet(getWSResultForQuery(req("q","id:1", "wt","xlsx", "fl","XXX:id,YYY:[docid],FOO:foo_s")));
+    String[] lines = txt.split("\n");
+    assertEquals(2, lines.length);
+    assertEquals("XXX,YYY,FOO", lines[0] );
+    assertEquals("1,0,hi", lines[1] );
+
+    // assertions specific to pseudofields backed by functions like abs, div, exists, etc. (SOLR-5423)
+    String funcText = getStringFromSheet(getWSResultForQuery(req("df", "text", "q","*", "wt","xlsx", "fl","XXX:id,YYY:exists(foo_s1)")));
+    String[] funcLines = funcText.split("\n");
+    assertEquals(5, funcLines.length);
+    assertEquals("XXX,YYY", funcLines[0] );
+    assertEquals("1,false", funcLines[1] );
+    assertEquals("3,false", funcLines[3] );
+  }
+
+  // Returns the first worksheet; XLSXResponseWriter only ever emits one sheet.
+  private XSSFSheet getWSResultForQuery(SolrQueryRequest req) throws Exception {
+    SolrQueryResponse rsp = h.queryAndResponse("", req);
+    return getWSResultForQuery(req, rsp);
+  }
+
+  private XSSFSheet getWSResultForQuery(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception {
+    ByteArrayOutputStream xmlBout = new ByteArrayOutputStream();
+    writerXlsx.write(xmlBout, req, rsp);
+    XSSFWorkbook output = new XSSFWorkbook(new ByteArrayInputStream(xmlBout.toByteArray()));
+    XSSFSheet sheet = output.getSheetAt(0);
+    req.close();
+    output.close();
+    return sheet;
+  }
+
+  private String getStringFromSheet(XSSFSheet sheet) {
+    StringBuilder output = new StringBuilder();
+    for (Row row: sheet) {
+      for (Cell cell: row) {
+        output.append(cell.getStringCellValue());
+        output.append(",");
+      }
+      output.setLength(output.length() - 1);
+      output.append("\n");
+    }
+    return output.toString();
+  }
+}