You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by da...@apache.org on 2018/11/02 15:27:30 UTC
[3/5] lucene-solr:jira/gradle: Add :solr:contrib:extraction module
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/schema.xml
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/schema.xml b/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/schema.xml
deleted file mode 100644
index 475c333..0000000
--- a/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/schema.xml
+++ /dev/null
@@ -1,484 +0,0 @@
-<?xml version="1.0" ?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<!-- The Solr schema file. This file should be named "schema.xml" and
- should be located where the classloader for the Solr webapp can find it.
-
- This schema is used for testing, and as such has everything and the
- kitchen sink thrown in. See example/solr/conf/schema.xml for a
- more concise example.
-
- -->
-
-<schema name="test" version="1.0">
-
-
- <!--
- Default numeric field types. For faster range queries, consider the tint/tfloat/tlong/tdouble types.
- -->
- <fieldType name="int" class="${solr.tests.IntegerFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="0" positionIncrementGap="0"/>
- <fieldType name="float" class="${solr.tests.FloatFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="0" positionIncrementGap="0"/>
- <fieldType name="long" class="${solr.tests.LongFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="0" positionIncrementGap="0"/>
- <fieldType name="double" class="${solr.tests.DoubleFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="0" positionIncrementGap="0"/>
-
- <!--
- Numeric field types that index each value at various levels of precision
- to accelerate range queries when the number of values between the range
- endpoints is large. See the javadoc for LegacyNumericRangeQuery for internal
- implementation details.
-
- Smaller precisionStep values (specified in bits) will lead to more tokens
- indexed per value, slightly larger index size, and faster range queries.
- A precisionStep of 0 disables indexing at different precision levels.
- -->
- <fieldType name="tint" class="${solr.tests.IntegerFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="8" positionIncrementGap="0"/>
- <fieldType name="tfloat" class="${solr.tests.FloatFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="8" positionIncrementGap="0"/>
- <fieldType name="tlong" class="${solr.tests.LongFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="8" positionIncrementGap="0"/>
- <fieldType name="tdouble" class="${solr.tests.DoubleFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="8" positionIncrementGap="0"/>
-
- <!-- Field type demonstrating an Analyzer failure -->
- <fieldType name="failtype1" class="solr.TextField">
- <analyzer type="index">
- <tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="0" catenateWords="0"
- catenateNumbers="0" catenateAll="0"/>
- <filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.FlattenGraphFilterFactory"/>
- </analyzer>
- </fieldType>
-
- <!-- Demonstrating ignoreCaseChange -->
- <fieldType name="wdf_nocase" class="solr.TextField">
- <analyzer type="index">
- <tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="0" catenateWords="0"
- catenateNumbers="0" catenateAll="0" splitOnCaseChange="0" preserveOriginal="0"/>
- <filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.FlattenGraphFilterFactory"/>
- </analyzer>
- <analyzer type="query">
- <tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="0" catenateWords="0"
- catenateNumbers="0" catenateAll="0" splitOnCaseChange="0" preserveOriginal="0"/>
- <filter class="solr.LowerCaseFilterFactory"/>
- </analyzer>
- </fieldType>
-
- <fieldType name="wdf_preserve" class="solr.TextField">
- <analyzer type="index">
- <tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="1" catenateWords="0"
- catenateNumbers="0" catenateAll="0" splitOnCaseChange="0" preserveOriginal="1"/>
- <filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.FlattenGraphFilterFactory"/>
- </analyzer>
- <analyzer type="query">
- <tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="1" catenateWords="0"
- catenateNumbers="0" catenateAll="0" splitOnCaseChange="0" preserveOriginal="1"/>
- <filter class="solr.LowerCaseFilterFactory"/>
- </analyzer>
- </fieldType>
-
-
- <!-- HighlitText optimizes storage for (long) columns which will be highlit -->
- <fieldType name="highlittext" class="solr.TextField" compressThreshold="345"/>
-
- <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true"/>
- <fieldType name="string" class="solr.StrField" sortMissingLast="true"/>
-
- <!-- format for date is 1995-12-31T23:59:59.999Z and only the fractional
- seconds part (.999) is optional.
- -->
- <fieldType name="date" class="${solr.tests.DateFieldType}" docValues="${solr.tests.numeric.dv}" sortMissingLast="true" omitNorms="true"/>
-
- <!-- solr.TextField allows the specification of custom
- text analyzers specified as a tokenizer and a list
- of token filters.
- -->
- <fieldType name="text" class="solr.TextField">
- <analyzer>
- <tokenizer class="solr.ClassicTokenizerFactory"/>
- <filter class="solr.ClassicFilterFactory"/>
- <filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.StopFilterFactory"/>
- <filter class="solr.PorterStemFilterFactory"/>
- </analyzer>
- </fieldType>
-
-
- <fieldType name="nametext" class="solr.TextField">
- <analyzer class="org.apache.lucene.analysis.core.WhitespaceAnalyzer"/>
- </fieldType>
-
- <!-- fieldTypes in this section isolate tokenizers and tokenfilters for testing -->
- <fieldType name="keywordtok" class="solr.TextField">
- <analyzer>
- <tokenizer class="solr.MockTokenizerFactory" pattern="keyword"/>
- </analyzer>
- </fieldType>
- <fieldType name="standardtok" class="solr.TextField">
- <analyzer>
- <tokenizer class="solr.StandardTokenizerFactory"/>
- </analyzer>
- </fieldType>
- <fieldType name="lettertok" class="solr.TextField">
- <analyzer>
- <tokenizer class="solr.LetterTokenizerFactory"/>
- </analyzer>
- </fieldType>
- <fieldType name="whitetok" class="solr.TextField">
- <analyzer>
- <tokenizer class="solr.MockTokenizerFactory"/>
- </analyzer>
- </fieldType>
- <fieldType name="HTMLstandardtok" class="solr.TextField">
- <analyzer>
- <charFilter class="solr.HTMLStripCharFilterFactory"/>
- <tokenizer class="solr.StandardTokenizerFactory"/>
- </analyzer>
- </fieldType>
- <fieldType name="HTMLwhitetok" class="solr.TextField">
- <analyzer>
- <charFilter class="solr.HTMLStripCharFilterFactory"/>
- <tokenizer class="solr.MockTokenizerFactory"/>
- </analyzer>
- </fieldType>
- <fieldType name="standardtokfilt" class="solr.TextField">
- <analyzer>
- <tokenizer class="solr.ClassicTokenizerFactory"/>
- <filter class="solr.ClassicFilterFactory"/>
- </analyzer>
- </fieldType>
- <fieldType name="standardfilt" class="solr.TextField">
- <analyzer>
- <tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.ClassicFilterFactory"/>
- </analyzer>
- </fieldType>
- <fieldType name="lowerfilt" class="solr.TextField">
- <analyzer>
- <tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.LowerCaseFilterFactory"/>
- </analyzer>
- </fieldType>
- <fieldType name="patternreplacefilt" class="solr.TextField">
- <analyzer type="index">
- <tokenizer class="solr.MockTokenizerFactory" pattern="keyword"/>
- <filter class="solr.PatternReplaceFilterFactory"
- pattern="([^a-zA-Z])" replacement="_" replace="all"
- />
- </analyzer>
- <analyzer type="query">
- <tokenizer class="solr.MockTokenizerFactory" pattern="keyword"/>
- </analyzer>
- </fieldType>
- <fieldType name="porterfilt" class="solr.TextField">
- <analyzer>
- <tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.PorterStemFilterFactory"/>
- </analyzer>
- </fieldType>
- <!-- fieldType name="snowballfilt" class="solr.TextField">
- <analyzer>
- <tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.SnowballPorterFilterFactory"/>
- </analyzer>
- </fieldType -->
- <fieldType name="engporterfilt" class="solr.TextField">
- <analyzer>
- <tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.PorterStemFilterFactory"/>
- </analyzer>
- </fieldType>
- <fieldType name="custengporterfilt" class="solr.TextField">
- <analyzer>
- <tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
- <filter class="solr.PorterStemFilterFactory"/>
- </analyzer>
- </fieldType>
- <fieldType name="stopfilt" class="solr.TextField">
- <analyzer>
- <tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true"/>
- </analyzer>
- </fieldType>
- <fieldType name="custstopfilt" class="solr.TextField">
- <analyzer>
- <tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.StopFilterFactory" words="stopwords.txt"/>
- </analyzer>
- </fieldType>
- <fieldType name="lengthfilt" class="solr.TextField">
- <analyzer>
- <tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.LengthFilterFactory" min="2" max="5"/>
- </analyzer>
- </fieldType>
-
- <fieldType name="subword" class="solr.TextField" multiValued="true" positionIncrementGap="100">
- <analyzer type="index">
- <tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1"
- catenateNumbers="1" catenateAll="0"/>
- <filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.StopFilterFactory"/>
- <filter class="solr.PorterStemFilterFactory"/>
- <filter class="solr.FlattenGraphFilterFactory"/>
- </analyzer>
- <analyzer type="query">
- <tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0"
- catenateNumbers="0" catenateAll="0"/>
- <filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.StopFilterFactory"/>
- <filter class="solr.PorterStemFilterFactory"/>
- </analyzer>
- </fieldType>
-
- <!-- more flexible in matching skus, but more chance of a false match -->
- <fieldType name="skutype1" class="solr.TextField">
- <analyzer type="index">
- <tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1"
- catenateNumbers="1" catenateAll="0"/>
- <filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.FlattenGraphFilterFactory"/>
- </analyzer>
- <analyzer type="query">
- <tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1"
- catenateNumbers="1" catenateAll="0"/>
- <filter class="solr.LowerCaseFilterFactory"/>
- </analyzer>
- </fieldType>
-
- <!-- less flexible in matching skus, but less chance of a false match -->
- <fieldType name="skutype2" class="solr.TextField">
- <analyzer type="index">
- <tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1"
- catenateNumbers="1" catenateAll="0"/>
- <filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.FlattenGraphFilterFactory"/>
- </analyzer>
- <analyzer type="query">
- <tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1"
- catenateNumbers="1" catenateAll="0"/>
- <filter class="solr.LowerCaseFilterFactory"/>
- </analyzer>
- </fieldType>
-
- <!-- less flexible in matching skus, but less chance of a false match -->
- <fieldType name="syn" class="solr.TextField">
- <analyzer type="index">
- <tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt"/>
- <filter class="solr.FlattenGraphFilterFactory"/>
- </analyzer>
- <analyzer type="query">
- <tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt"/>
- </analyzer>
- </fieldType>
-
- <!-- Demonstrates How RemoveDuplicatesTokenFilter makes stemmed
- synonyms "better"
- -->
- <fieldType name="dedup" class="solr.TextField">
- <analyzer type="index">
- <tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.SynonymGraphFilterFactory"
- synonyms="synonyms.txt" expand="true"/>
- <filter class="solr.PorterStemFilterFactory"/>
- <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
- <filter class="solr.FlattenGraphFilterFactory"/>
- </analyzer>
- <analyzer type="query">
- <tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.SynonymGraphFilterFactory"
- synonyms="synonyms.txt" expand="true"/>
- <filter class="solr.PorterStemFilterFactory"/>
- <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
- </analyzer>
- </fieldType>
-
- <fieldType name="unstored" class="solr.StrField" indexed="true" stored="false"/>
-
-
- <fieldType name="textgap" class="solr.TextField" multiValued="true" positionIncrementGap="100">
- <analyzer>
- <tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.LowerCaseFilterFactory"/>
- </analyzer>
- </fieldType>
-
-
- <field name="id" type="string" indexed="true" stored="true" multiValued="false" required="false"/>
- <field name="name" type="nametext" indexed="true" stored="true"/>
- <field name="text" type="text" indexed="true" stored="false"/>
- <field name="subject" type="text" indexed="true" stored="true"/>
- <field name="title" type="nametext" indexed="true" stored="true"/>
- <field name="weight" type="float" indexed="true" stored="true"/>
- <field name="bday" type="date" indexed="true" stored="true"/>
-
- <field name="title_stemmed" type="text" indexed="true" stored="false"/>
- <field name="title_lettertok" type="lettertok" indexed="true" stored="false"/>
-
- <field name="syn" type="syn" indexed="true" stored="true"/>
-
- <!-- to test property inheritance and overriding -->
- <field name="shouldbeunstored" type="unstored"/>
- <field name="shouldbestored" type="unstored" stored="true"/>
- <field name="shouldbeunindexed" type="unstored" indexed="false" stored="true"/>
-
-
- <!-- test different combinations of indexed and stored -->
- <field name="bind" type="boolean" indexed="true" stored="false"/>
- <field name="bsto" type="boolean" indexed="false" stored="true"/>
- <field name="bindsto" type="boolean" indexed="true" stored="true"/>
- <field name="isto" type="int" indexed="false" stored="true"/>
- <field name="iind" type="int" indexed="true" stored="false"/>
- <field name="ssto" type="string" indexed="false" stored="true"/>
- <field name="sind" type="string" indexed="true" stored="false"/>
- <field name="sindsto" type="string" indexed="true" stored="true"/>
-
- <!-- test combinations of term vector settings -->
- <field name="test_basictv" type="text" termVectors="true"/>
- <field name="test_notv" type="text" termVectors="false"/>
- <field name="test_postv" type="text" termVectors="true" termPositions="true"/>
- <field name="test_offtv" type="text" termVectors="true" termOffsets="true"/>
- <field name="test_posofftv" type="text" termVectors="true"
- termPositions="true" termOffsets="true"/>
-
- <!-- test highlit field settings -->
- <field name="test_hlt" type="highlittext" indexed="true"/>
- <field name="test_hlt_off" type="highlittext" indexed="true"/>
-
- <!-- fields to test individual tokenizers and tokenfilters -->
- <field name="keywordtok" type="keywordtok" indexed="true" stored="true"/>
- <field name="standardtok" type="standardtok" indexed="true" stored="true"/>
- <field name="HTMLstandardtok" type="HTMLstandardtok" indexed="true" stored="true"/>
- <field name="lettertok" type="lettertok" indexed="true" stored="true"/>
- <field name="whitetok" type="whitetok" indexed="true" stored="true"/>
- <field name="HTMLwhitetok" type="HTMLwhitetok" indexed="true" stored="true"/>
- <field name="standardtokfilt" type="standardtokfilt" indexed="true" stored="true"/>
- <field name="standardfilt" type="standardfilt" indexed="true" stored="true"/>
- <field name="lowerfilt" type="lowerfilt" indexed="true" stored="true"/>
- <field name="patternreplacefilt" type="patternreplacefilt" indexed="true" stored="true"/>
- <field name="porterfilt" type="porterfilt" indexed="true" stored="true"/>
- <field name="engporterfilt" type="engporterfilt" indexed="true" stored="true"/>
- <field name="custengporterfilt" type="custengporterfilt" indexed="true" stored="true"/>
- <field name="stopfilt" type="stopfilt" indexed="true" stored="true"/>
- <field name="custstopfilt" type="custstopfilt" indexed="true" stored="true"/>
- <field name="lengthfilt" type="lengthfilt" indexed="true" stored="true"/>
- <field name="dedup" type="dedup" indexed="true" stored="true"/>
- <field name="wdf_nocase" type="wdf_nocase" indexed="true" stored="true"/>
- <field name="wdf_preserve" type="wdf_preserve" indexed="true" stored="true"/>
-
- <field name="numberpartfail" type="failtype1" indexed="true" stored="true"/>
-
- <field name="nullfirst" type="string" indexed="true" stored="true" sortMissingFirst="true"/>
-
- <field name="subword" type="subword" indexed="true" stored="true"/>
- <field name="sku1" type="skutype1" indexed="true" stored="true"/>
- <field name="sku2" type="skutype2" indexed="true" stored="true"/>
-
- <field name="textgap" type="textgap" indexed="true" stored="true"/>
-
- <field name="timestamp" type="date" indexed="true" stored="true" default="NOW" multiValued="false"/>
- <field name="multiDefault" type="string" indexed="true" stored="true" default="muLti-Default" multiValued="true"/>
- <field name="intDefault" type="int" indexed="true" stored="true" default="42" multiValued="false"/>
-
- <field name="extractedDate" type="date" indexed="true" stored="true" multiValued="true"/>
- <field name="extractedContent" type="text" indexed="true" stored="true" multiValued="true"/>
- <field name="extractedProducer" type="text" indexed="true" stored="true" multiValued="true"/>
- <field name="extractedCreator" type="text" indexed="true" stored="true" multiValued="true"/>
- <field name="extractedKeywords" type="text" indexed="true" stored="true" multiValued="true"/>
- <field name="extractedAuthor" type="text" indexed="true" stored="true" multiValued="true"/>
- <field name="extractedLanguage" type="string" indexed="true" stored="true" multiValued="true"/>
- <field name="resourceName" type="string" indexed="true" stored="true" multiValued="true"/>
-
- <field name="extractionLiteralMV" type="string" indexed="true" stored="true" multiValued="true"/>
- <field name="extractionLiteral" type="string" indexed="true" stored="true" multiValued="false"/>
-
- <field name="defaultExtr" type="string" indexed="true" stored="false"/>
-
- <!-- Dynamic field definitions. If a field name is not found, dynamicFields
- will be used if the name matches any of the patterns.
- RESTRICTION: the glob-like pattern in the name attribute must have
- a "*" only at the start or the end.
- EXAMPLE: name="*_i" will match any field ending in _i (like myid_i, z_i)
- Longer patterns will be matched first. if equal size patterns
- both match, the first appearing in the schema will be used.
- -->
- <dynamicField name="*_i" type="int" indexed="true" stored="true"/>
- <dynamicField name="*_s" type="string" indexed="true" stored="true"/>
- <dynamicField name="*_ss" type="string" indexed="true" stored="true" multiValued="true"/>
- <dynamicField name="*_s1" type="string" indexed="true" stored="true" multiValued="false"/>
- <dynamicField name="*_l" type="long" indexed="true" stored="true"/>
- <dynamicField name="*_t" type="text" indexed="true" stored="true"/>
- <dynamicField name="*_b" type="boolean" indexed="true" stored="true"/>
- <dynamicField name="*_f" type="float" indexed="true" stored="true"/>
- <dynamicField name="*_d" type="double" indexed="true" stored="true"/>
- <dynamicField name="*_dt" type="date" indexed="true" stored="true"/>
- <dynamicField name="*_dt1" type="date" indexed="true" stored="true" multiValued="false"/>
-
- <dynamicField name="*_sI" type="string" indexed="true" stored="false"/>
- <dynamicField name="*_sS" type="string" indexed="false" stored="true"/>
- <dynamicField name="t_*" type="text" indexed="true" stored="true"/>
- <dynamicField name="tv_*" type="text" indexed="true" stored="true"
- termVectors="true" termPositions="true" termOffsets="true"/>
-
- <dynamicField name="stream_*" type="text" indexed="true" stored="true"/>
- <dynamicField name="Content*" type="text" indexed="true" stored="true"/>
-
-
- <!-- special fields for dynamic copyField test -->
- <dynamicField name="dynamic_*" type="string" indexed="true" stored="true"/>
- <dynamicField name="*_dynamic" type="string" indexed="true" stored="true"/>
-
- <!-- for testing to ensure that longer patterns are matched first -->
- <dynamicField name="*aa" type="string" indexed="true" stored="true"/>
- <dynamicField name="*aaa" type="int" indexed="false" stored="true"/>
-
- <!-- ignored because not stored or indexed -->
- <dynamicField name="ignored_*" type="text" indexed="false" stored="false"/>
-
-
- <uniqueKey>id</uniqueKey>
-
- <!-- copyField commands copy one field to another at the time a document
- is added to the index. It's used either to index the same field different
- ways, or to add multiple fields to the same field for easier/faster searching.
- -->
- <copyField source="title" dest="title_stemmed"/>
- <copyField source="title" dest="title_lettertok"/>
-
- <copyField source="title" dest="text"/>
- <copyField source="subject" dest="text"/>
-
- <copyField source="*_t" dest="text"/>
-
- <!-- dynamic destination -->
- <copyField source="*_dynamic" dest="dynamic_*"/>
-
-
-</schema>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml b/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml
deleted file mode 100644
index aef03af..0000000
--- a/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml
+++ /dev/null
@@ -1,214 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<config>
- <luceneMatchVersion>${tests.luceneMatchVersion:LATEST}</luceneMatchVersion>
- <jmx />
- <indexConfig>
- <useCompoundFile>${useCompoundFile:false}</useCompoundFile>
- </indexConfig>
-
- <!-- Used to specify an alternate directory to hold all index data.
- It defaults to "index" if not present, and should probably
- not be changed if replication is in use. -->
- <dataDir>${solr.data.dir:}</dataDir>
- <directoryFactory name="DirectoryFactory" class="${solr.directoryFactory:solr.RAMDirectoryFactory}"/>
- <schemaFactory class="ClassicIndexSchemaFactory"/>
-
- <updateHandler class="solr.DirectUpdateHandler2">
-
- <!-- autocommit pending docs if certain criteria are met
- <autoCommit>
- <maxDocs>10000</maxDocs>
- <maxTime>3600000</maxTime>
- </autoCommit>
- -->
-
- </updateHandler>
-
-
- <updateProcessor class="solr.ParseDateFieldUpdateProcessorFactory" name="parse-date">
- <arr name="format">
- <str>yyyy-MM-dd['T'[HH:mm[:ss[.SSS]][z</str>
- <str>yyyy-MM-dd['T'[HH:mm[:ss[,SSS]][z</str>
- <str>yyyy-MM-dd HH:mm[:ss[.SSS]][z</str>
- <str>yyyy-MM-dd HH:mm[:ss[,SSS]][z</str>
- <str>[EEE, ]dd MMM yyyy HH:mm[:ss] z</str>
- <str>EEEE, dd-MMM-yy HH:mm:ss z</str>
- <str>EEE MMM ppd HH:mm:ss [z ]yyyy</str>
- </arr>
- </updateProcessor>
-
- <!-- The update.autoCreateFields property can be turned to false to disable schemaless mode -->
- <updateRequestProcessorChain name="parse-date" default="true"
- processor="parse-date">
- <processor class="solr.RunUpdateProcessorFactory"/>
- </updateRequestProcessorChain>
-
-
- <query>
- <!-- Maximum number of clauses in a boolean query... can affect
- range or wildcard queries that expand to big boolean
- queries. An exception is thrown if exceeded.
- -->
- <maxBooleanClauses>1024</maxBooleanClauses>
-
-
- <!-- Cache specification for Filters or DocSets - unordered set of *all* documents
- that match a particular query.
- -->
- <filterCache
- class="solr.search.LRUCache"
- size="512"
- initialSize="512"
- autowarmCount="256"/>
-
- <queryResultCache
- class="solr.search.LRUCache"
- size="512"
- initialSize="512"
- autowarmCount="1024"/>
-
- <documentCache
- class="solr.search.LRUCache"
- size="512"
- initialSize="512"
- autowarmCount="0"/>
-
- <!-- If true, stored fields that are not requested will be loaded lazily.
- -->
- <enableLazyFieldLoading>true</enableLazyFieldLoading>
-
- <!--
-
- <cache name="myUserCache"
- class="solr.search.LRUCache"
- size="4096"
- initialSize="1024"
- autowarmCount="1024"
- regenerator="MyRegenerator"
- />
- -->
-
-
- <useFilterForSortedQuery>true</useFilterForSortedQuery>
-
- <queryResultWindowSize>10</queryResultWindowSize>
-
- <!-- set maxSize artificially low to exercise both types of sets -->
- <HashDocSet maxSize="3" loadFactor="0.75"/>
-
-
- <!-- boolToFilterOptimizer converts boolean clauses with zero boost
- into cached filters if the number of docs selected by the clause exceeds
- the threshold (represented as a fraction of the total index)
- -->
- <boolTofilterOptimizer enabled="false" cacheSize="32" threshold=".05"/>
-
-
- <!-- a newSearcher event is fired whenever a new searcher is being prepared
- and there is a current searcher handling requests (aka registered). -->
- <!-- QuerySenderListener takes an array of NamedList and executes a
- local query request for each NamedList in sequence. -->
- <!--
- <listener event="newSearcher" class="solr.QuerySenderListener">
- <arr name="queries">
- <lst> <str name="q">solr</str> <str name="start">0</str> <str name="rows">10</str> </lst>
- <lst> <str name="q">rocks</str> <str name="start">0</str> <str name="rows">10</str> </lst>
- </arr>
- </listener>
- -->
-
- <!-- a firstSearcher event is fired whenever a new searcher is being
- prepared but there is no current registered searcher to handle
- requests or to gain prewarming data from. -->
- <!--
- <listener event="firstSearcher" class="solr.QuerySenderListener">
- <arr name="queries">
- <lst> <str name="q">fast_warm</str> <str name="start">0</str> <str name="rows">10</str> </lst>
- </arr>
- </listener>
- -->
-
-
- </query>
-
-
- <requestHandler name="/select" class="solr.SearchHandler">
- <bool name="httpCaching">true</bool>
- </requestHandler>
-
- <requestHandler name="/update/extract" class="org.apache.solr.handler.extraction.ExtractingRequestHandler">
- <str name="parseContext.config">parseContext.xml</str>
- </requestHandler>
-
- <requestHandler name="/update/extract/lit-def" class="org.apache.solr.handler.extraction.ExtractingRequestHandler">
- <lst name="defaults">
- <str name="literal.foo_s">x</str>
- </lst>
- <lst name="appends">
- <str name="literal.bar_s">y</str>
- </lst>
- <lst name="invariants">
- <str name="literal.zot_s">z</str>
- <str name="uprefix">ignored_</str>
- </lst>
- </requestHandler>
-
- <highlighting>
- <!-- Configure the standard fragmenter -->
- <fragmenter name="gap" class="org.apache.solr.highlight.GapFragmenter" default="true">
- <lst name="defaults">
- <int name="hl.fragsize">100</int>
- </lst>
- </fragmenter>
-
- <fragmenter name="regex" class="org.apache.solr.highlight.RegexFragmenter">
- <lst name="defaults">
- <int name="hl.fragsize">70</int>
- </lst>
- </fragmenter>
-
- <!-- Configure the standard formatter -->
- <formatter name="html" class="org.apache.solr.highlight.HtmlFormatter" default="true">
- <lst name="defaults">
- <str name="hl.simple.pre"><![CDATA[<em>]]></str>
- <str name="hl.simple.post"><![CDATA[</em>]]></str>
- </lst>
- </formatter>
- </highlighting>
-
-
- <requestDispatcher>
- <requestParsers enableRemoteStreaming="false" multipartUploadLimitInKB="-1" />
- <httpCaching lastModifiedFrom="openTime" etagSeed="Solr" never304="false">
- <cacheControl>max-age=30, public</cacheControl>
- </httpCaching>
- </requestDispatcher>
-
- <admin>
- <defaultQuery>solr</defaultQuery>
- <gettableFiles>solrconfig.xml schema.xml</gettableFiles>
- </admin>
-
- <!-- test getting system property -->
- <propTest attr1="${solr.test.sys.prop1}-$${literal}"
- attr2="${non.existent.sys.prop:default-from-config}">prefix-${solr.test.sys.prop2}-suffix</propTest>
-
-</config>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/stopwords.txt
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/stopwords.txt b/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/stopwords.txt
deleted file mode 100644
index 688e307..0000000
--- a/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/stopwords.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-stopworda
-stopwordb
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/synonyms.txt
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/synonyms.txt b/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/synonyms.txt
deleted file mode 100644
index a7624f0..0000000
--- a/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/synonyms.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-a => aa
-b => b1 b2
-c => c1,c2
-a\=>a => b\=>b
-a\,a => b\,b
-foo,bar,baz
-
-Television,TV,Televisions
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test-files/extraction/tiny.txt.gz
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test-files/extraction/tiny.txt.gz b/solr/contrib/extraction/src/test-files/extraction/tiny.txt.gz
deleted file mode 100644
index f5df886..0000000
Binary files a/solr/contrib/extraction/src/test-files/extraction/tiny.txt.gz and /dev/null differ
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test-files/extraction/version_control.txt
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test-files/extraction/version_control.txt b/solr/contrib/extraction/src/test-files/extraction/version_control.txt
deleted file mode 100644
index 7a89c5b..0000000
--- a/solr/contrib/extraction/src/test-files/extraction/version_control.txt
+++ /dev/null
@@ -1,18 +0,0 @@
-Solr Version Control System
-
-Overview
-
-The Solr source code resides in the Apache Subversion (SVN) repository.
-The command-line SVN client can be obtained here or as an optional package
-for cygwin.
-
-The TortoiseSVN GUI client for Windows can be obtained here. There
-are also SVN plugins available for older versions of Eclipse and
-IntelliJ IDEA that don't have subversion support already included.
-
--------------------------------
-
-Note: This document is an excerpt from a document Licensed to the
-Apache Software Foundation (ASF) under one or more contributor
-license agreements. See the XML version (version_control.xml) for
-more details.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test-files/extraction/version_control.xml
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test-files/extraction/version_control.xml b/solr/contrib/extraction/src/test-files/extraction/version_control.xml
deleted file mode 100644
index 4e09960..0000000
--- a/solr/contrib/extraction/src/test-files/extraction/version_control.xml
+++ /dev/null
@@ -1,42 +0,0 @@
-<?xml version="1.0"?>
-<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN" "http://forrest.apache.org/dtd/document-v20.dtd">
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<document>
-
- <header>
- <title>Solr Version Control System</title>
- </header>
-
- <body>
-
- <section>
- <title>Overview</title>
- <p>
- The Solr source code resides in the Apache <a href="http://subversion.tigris.org/">Subversion (SVN)</a> repository.
- The command-line SVN client can be obtained <a href="http://subversion.tigris.org/project_packages.html">here</a> or as an optional package for <a href="http://www.cygwin.com/">cygwin</a>.
- The TortoiseSVN GUI client for Windows can be obtained <a href="http://tortoisesvn.tigris.org/">here</a>. There
- are also SVN plugins available for older versions of <a href="http://subclipse.tigris.org/">Eclipse</a> and
- <a href="http://svnup.tigris.org/">IntelliJ IDEA</a> that don't have subversion support already included.
- </p>
- </section>
- <p>Here is some more text. It contains <a href="http://lucene.apache.org">a link</a>. </p>
- <p>Text Here</p>
- </body>
-
-</document>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test-files/extraction/word2003.doc
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test-files/extraction/word2003.doc b/solr/contrib/extraction/src/test-files/extraction/word2003.doc
deleted file mode 100644
index e55827b..0000000
Binary files a/solr/contrib/extraction/src/test-files/extraction/word2003.doc and /dev/null differ
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test/java/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test/java/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java b/solr/contrib/extraction/src/test/java/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java
new file mode 100644
index 0000000..132b371
--- /dev/null
+++ b/solr/contrib/extraction/src/test/java/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java
@@ -0,0 +1,777 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.handler.extraction;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+import java.util.TimeZone;
+
+import org.apache.solr.SolrTestCaseJ4;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.util.ContentStream;
+import org.apache.solr.common.util.ContentStreamBase;
+import org.apache.solr.common.util.NamedList;
+import org.apache.solr.request.LocalSolrQueryRequest;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+import org.apache.solr.update.AddUpdateCommand;
+import org.apache.solr.update.processor.BufferingRequestProcessor;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+
+/**
+ *
+ *
+ **/
+public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
+
+ @BeforeClass
+ public static void beforeClass() throws Exception {
+ // Is the JDK/env affected by a known bug?
+ final String tzDisplayName = TimeZone.getDefault().getDisplayName(false, TimeZone.SHORT, Locale.US);
+ if (!tzDisplayName.matches("[A-Z]{3,}([+-]\\d\\d(:\\d\\d)?)?")) {
+ assertTrue("Is some other JVM affected? Or bad regex? TzDisplayName: " + tzDisplayName,
+ System.getProperty("java.version").startsWith("11"));
+ assumeTrue("SOLR-12759 JDK 11 (1st release) and Tika 1.x can result in extracting dates in a bad format.", false);
+ }
+
+ initCore("solrconfig.xml", "schema.xml", getFile("extraction/solr").getAbsolutePath());
+ }
+
+ @Override
+ @Before
+ public void setUp() throws Exception {
+ super.setUp();
+ clearIndex();
+ assertU(commit());
+ }
+
+ @Test
+ public void testExtraction() throws Exception {
+ ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
+ assertTrue("handler is null and it shouldn't be", handler != null);
+ loadLocal("extraction/solr-word.pdf",
+ "fmap.created", "extractedDate",
+ "fmap.producer", "extractedProducer",
+ "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
+ "fmap.Creation-Date", "extractedDate",
+ "uprefix", "ignored_",
+ "fmap.Author", "extractedAuthor",
+ "fmap.content", "extractedContent",
+ "literal.id", "one",
+ "fmap.Last-Modified", "extractedDate"
+ );
+ assertQ(req("title:solr-word"), "//*[@numFound='0']");
+ assertU(commit());
+ assertQ(req("title:solr-word"), "//*[@numFound='1']");
+
+
+ loadLocal("extraction/simple.html", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
+ "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
+ "fmap.Author", "extractedAuthor",
+ "fmap.language", "extractedLanguage",
+ "literal.id", "two",
+ "uprefix", "ignored_",
+ "fmap.content", "extractedContent",
+ "fmap.Last-Modified", "extractedDate"
+ );
+ assertQ(req("title:Welcome"), "//*[@numFound='0']");
+ assertU(commit());
+ assertQ(req("title:Welcome"), "//*[@numFound='1']");
+
+ assertQ(req("extractedContent:distinctwords"), "//*[@numFound='0']");
+ assertQ(req("extractedContent:distinct"), "//*[@numFound='1']");
+ assertQ(req("extractedContent:words"), "//*[@numFound='2']");
+ assertQ(req("extractedContent:\"distinct words\""), "//*[@numFound='1']");
+
+ loadLocal("extraction/simple.html",
+ "literal.id","simple2",
+ "uprefix", "t_",
+ "lowernames", "true",
+ "captureAttr", "true",
+ "fmap.a","t_href",
+ "fmap.content_type", "abcxyz", // test that lowernames is applied before mapping, and uprefix is applied after mapping
+ "commit", "true" // test immediate commit
+ );
+
+ // test that purposely causes a failure to print out the doc for test debugging
+ // assertQ(req("q","id:simple2","indent","true"), "//*[@numFound='0']");
+
+ // test both lowernames and unknown field mapping
+ //assertQ(req("+id:simple2 +t_content_type:[* TO *]"), "//*[@numFound='1']");
+ assertQ(req("+id:simple2 +t_href:[* TO *]"), "//*[@numFound='1']");
+ assertQ(req("+id:simple2 +t_abcxyz:[* TO *]"), "//*[@numFound='1']");
+ assertQ(req("+id:simple2 +t_content:serif"), "//*[@numFound='0']"); // make sure <style> content is excluded
+ assertQ(req("+id:simple2 +t_content:blur"), "//*[@numFound='0']"); // make sure <script> content is excluded
+
+ // make sure the fact there is an index-time boost does not fail the parsing
+ loadLocal("extraction/simple.html",
+ "literal.id","simple3",
+ "uprefix", "t_",
+ "lowernames", "true",
+ "captureAttr", "true", "fmap.a","t_href",
+ "commit", "true"
+
+ ,"boost.t_href", "100.0"
+ );
+
+ assertQ(req("t_href:http"), "//*[@numFound='2']");
+ assertQ(req("t_href:http"), "//doc[2]/str[.='simple3']");
+ assertQ(req("+id:simple3 +t_content_type:[* TO *]"), "//*[@numFound='1']");//test lowercase and then uprefix
+
+ loadLocal("extraction/version_control.xml", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
+ "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
+ "fmap.Author", "extractedAuthor",
+ "literal.id", "three",
+ "uprefix", "ignored_",
+ "fmap.content", "extractedContent",
+ "fmap.language", "extractedLanguage",
+ "fmap.Last-Modified", "extractedDate"
+ );
+ assertQ(req("stream_name:version_control.xml"), "//*[@numFound='0']");
+ assertU(commit());
+ assertQ(req("stream_name:version_control.xml"), "//*[@numFound='1']");
+
+ loadLocal("extraction/word2003.doc", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
+ "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
+ "fmap.Author", "extractedAuthor",
+ "literal.id", "four",
+ "uprefix", "ignored_",
+ "fmap.content", "extractedContent",
+ "fmap.language", "extractedLanguage",
+ "fmap.Last-Modified", "extractedDate"
+ );
+ assertQ(req("title:\"Word 2003 Title\""), "//*[@numFound='0']");
+ // There is already a PDF file with this content:
+ assertQ(req("extractedContent:\"This is a test of PDF and Word extraction in Solr, it is only a test\""), "//*[@numFound='1']");
+ assertU(commit());
+ assertQ(req("title:\"Word 2003 Title\""), "//*[@numFound='1']");
+ // now 2 of them:
+ assertQ(req("extractedContent:\"This is a test of PDF and Word extraction in Solr, it is only a test\""), "//*[@numFound='2']");
+
+ // compressed file
+ loadLocal("extraction/tiny.txt.gz",
+ "fmap.created", "extractedDate",
+ "fmap.producer", "extractedProducer",
+ "fmap.creator", "extractedCreator",
+ "fmap.Keywords", "extractedKeywords",
+ "fmap.Author", "extractedAuthor",
+ "uprefix", "ignored_",
+ "fmap.content", "extractedContent",
+ "fmap.language", "extractedLanguage",
+ "fmap.Last-Modified", "extractedDate",
+ "literal.id", "tiny.txt.gz");
+ assertU(commit());
+ assertQ(req("id:tiny.txt.gz")
+ , "//*[@numFound='1']"
+ , "//*/arr[@name='stream_name']/str[.='tiny.txt.gz']"
+ );
+
+ // compressed file
+ loadLocal("extraction/open-document.odt",
+ "uprefix", "ignored_",
+ "fmap.content", "extractedContent",
+ "literal.id", "open-document");
+ assertU(commit());
+ assertQ(req("extractedContent:\"Práctica sobre GnuPG\"")
+ , "//*[@numFound='1']"
+ , "//*/arr[@name='stream_name']/str[.='open-document.odt']"
+ );
+ }
+
+ @Test
+ public void testCapture() throws Exception {
+ loadLocal("extraction/simple.html",
+ "literal.id","capture1",
+ "uprefix","t_",
+ "capture","div",
+ "fmap.div", "foo_t",
+ "commit", "true"
+ );
+ assertQ(req("+id:capture1 +t_content:Solr"), "//*[@numFound='1']");
+ assertQ(req("+id:capture1 +foo_t:\"here is some text in a div\""), "//*[@numFound='1']");
+
+ loadLocal("extraction/simple.html",
+ "literal.id", "capture2",
+ "captureAttr", "true",
+ "defaultField", "text",
+ "fmap.div", "div_t",
+ "fmap.a", "anchor_t",
+ "capture", "div",
+ "capture", "a",
+ "commit", "true"
+ );
+ assertQ(req("+id:capture2 +text:Solr"), "//*[@numFound='1']");
+ assertQ(req("+id:capture2 +div_t:\"here is some text in a div\""), "//*[@numFound='1']");
+ assertQ(req("+id:capture2 +anchor_t:http\\://www.apache.org"), "//*[@numFound='1']");
+ assertQ(req("+id:capture2 +anchor_t:link"), "//*[@numFound='1']");
+ }
+
+ @Test
+ public void testDefaultField() throws Exception {
+ ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
+ assertTrue("handler is null and it shouldn't be", handler != null);
+ try {
+ ignoreException("unknown field 'a'");
+ ignoreException("unknown field 'meta'"); // TODO: should this exception be happening?
+ loadLocal("extraction/simple.html",
+ "literal.id","simple2",
+ "lowernames", "true",
+ "captureAttr", "true",
+ //"fmap.content_type", "abcxyz",
+ "commit", "true" // test immediate commit
+ );
+ fail("Should throw SolrException");
+ } catch (SolrException e) {
+ //do nothing
+ } finally {
+ resetExceptionIgnores();
+ }
+
+
+ loadLocal("extraction/simple.html",
+ "literal.id","simple2",
+ ExtractingParams.DEFAULT_FIELD, "defaultExtr",//test that unmapped fields go to the text field when no uprefix is specified
+ "lowernames", "true",
+ "captureAttr", "true",
+ //"fmap.content_type", "abcxyz",
+ "commit", "true" // test immediate commit
+ );
+ assertQ(req("id:simple2"), "//*[@numFound='1']");
+ assertQ(req("defaultExtr:http\\:\\/\\/www.apache.org"), "//*[@numFound='1']");
+
+ //Test when both uprefix and default are specified.
+ loadLocal("extraction/simple.html",
+ "literal.id","simple2",
+ ExtractingParams.DEFAULT_FIELD, "defaultExtr",//test that unmapped fields go to the text field when no uprefix is specified
+ ExtractingParams.UNKNOWN_FIELD_PREFIX, "t_",
+ "lowernames", "true",
+ "captureAttr", "true",
+ "fmap.a","t_href",
+ //"fmap.content_type", "abcxyz",
+ "commit", "true" // test immediate commit
+ );
+ assertQ(req("+id:simple2 +t_href:[* TO *]"), "//*[@numFound='1']");
+ }
+
+ @Test
+ public void testLiterals() throws Exception {
+ ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
+ assertTrue("handler is null and it shouldn't be", handler != null);
+ //test literal
+ loadLocal("extraction/version_control.xml", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
+ "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
+ "fmap.Author", "extractedAuthor",
+ "fmap.content", "extractedContent",
+ "literal.id", "one",
+ "uprefix", "ignored_",
+ "fmap.language", "extractedLanguage",
+ "literal.extractionLiteralMV", "one",
+ "literal.extractionLiteralMV", "two",
+ "fmap.Last-Modified", "extractedDate"
+
+ );
+ assertQ(req("stream_name:version_control.xml"), "//*[@numFound='0']");
+ assertU(commit());
+ assertQ(req("stream_name:version_control.xml"), "//*[@numFound='1']");
+
+ assertQ(req("extractionLiteralMV:one"), "//*[@numFound='1']");
+ assertQ(req("extractionLiteralMV:two"), "//*[@numFound='1']");
+
+ try {
+ loadLocal("extraction/version_control.xml", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
+ "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
+ "fmap.Author", "extractedAuthor",
+ "fmap.content", "extractedContent",
+ "literal.id", "two",
+ "fmap.language", "extractedLanguage",
+ "literal.extractionLiteral", "one",
+ "literal.extractionLiteral", "two",
+ "fmap.X-Parsed-By", "ignored_parser",
+ "fmap.Last-Modified", "extractedDate"
+ );
+ // TODO: original author did not specify why an exception should be thrown... how to fix?
+ // assertTrue("Exception should have been thrown", false);
+ } catch (SolrException e) {
+ //nothing to see here, move along
+ }
+
+ loadLocal("extraction/version_control.xml", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
+ "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
+ "fmap.Author", "extractedAuthor",
+ "fmap.content", "extractedContent",
+ "literal.id", "three",
+ "fmap.language", "extractedLanguage",
+ "literal.extractionLiteral", "one",
+ "fmap.X-Parsed-By", "ignored_parser",
+ "fmap.Last-Modified", "extractedDate"
+ );
+ assertU(commit());
+ assertQ(req("extractionLiteral:one"), "//*[@numFound='1']");
+
+ }
+
+ public void testLiteralDefaults() throws Exception {
+
+ // sanity check config
+ loadLocalFromHandler("/update/extract/lit-def",
+ "extraction/simple.html",
+ "literal.id", "lit-def-simple");
+ assertU(commit());
+ assertQ(req("q", "id:lit-def-simple")
+ , "//*[@numFound='1']"
+ , "count(//arr[@name='foo_s']/str)=1"
+ , "//arr[@name='foo_s']/str[.='x']"
+ , "count(//arr[@name='bar_s']/str)=1"
+ , "//arr[@name='bar_s']/str[.='y']"
+ , "count(//arr[@name='zot_s']/str)=1"
+ , "//arr[@name='zot_s']/str[.='z']"
+ );
+
+ // override the default foo_s
+ loadLocalFromHandler("/update/extract/lit-def",
+ "extraction/simple.html",
+ "literal.foo_s", "1111",
+ "literal.id", "lit-def-simple");
+ assertU(commit());
+ assertQ(req("q", "id:lit-def-simple")
+ , "//*[@numFound='1']"
+ , "count(//arr[@name='foo_s']/str)=1"
+ , "//arr[@name='foo_s']/str[.='1111']"
+ , "count(//arr[@name='bar_s']/str)=1"
+ , "//arr[@name='bar_s']/str[.='y']"
+ , "count(//arr[@name='zot_s']/str)=1"
+ , "//arr[@name='zot_s']/str[.='z']"
+ );
+
+ // pre-pend the bar_s
+ loadLocalFromHandler("/update/extract/lit-def",
+ "extraction/simple.html",
+ "literal.bar_s", "2222",
+ "literal.id", "lit-def-simple");
+ assertU(commit());
+ assertQ(req("q", "id:lit-def-simple")
+ , "//*[@numFound='1']"
+ , "count(//arr[@name='foo_s']/str)=1"
+ , "//arr[@name='foo_s']/str[.='x']"
+ , "count(//arr[@name='bar_s']/str)=2"
+ , "//arr[@name='bar_s']/str[.='2222']"
+ , "//arr[@name='bar_s']/str[.='y']"
+ , "count(//arr[@name='zot_s']/str)=1"
+ , "//arr[@name='zot_s']/str[.='z']"
+ );
+
+ // invariant zot_s can not be changed
+ loadLocalFromHandler("/update/extract/lit-def",
+ "extraction/simple.html",
+ "literal.zot_s", "3333",
+ "literal.id", "lit-def-simple");
+ assertU(commit());
+ assertQ(req("q", "id:lit-def-simple")
+ , "//*[@numFound='1']"
+ , "count(//arr[@name='foo_s']/str)=1"
+ , "//arr[@name='foo_s']/str[.='x']"
+ , "count(//arr[@name='bar_s']/str)=1"
+ , "//arr[@name='bar_s']/str[.='y']"
+ , "count(//arr[@name='zot_s']/str)=1"
+ , "//arr[@name='zot_s']/str[.='z']"
+ );
+
+ }
+
+ @Test
+ public void testPlainTextSpecifyingMimeType() throws Exception {
+ ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
+ assertTrue("handler is null and it shouldn't be", handler != null);
+
+ // Load plain text specifying MIME type:
+ loadLocal("extraction/version_control.txt", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
+ "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
+ "fmap.Author", "extractedAuthor",
+ "literal.id", "one",
+ "fmap.language", "extractedLanguage",
+ "fmap.X-Parsed-By", "ignored_parser",
+ "fmap.content", "extractedContent",
+ ExtractingParams.STREAM_TYPE, "text/plain"
+ );
+ assertQ(req("extractedContent:Apache"), "//*[@numFound='0']");
+ assertU(commit());
+ assertQ(req("extractedContent:Apache"), "//*[@numFound='1']");
+ }
+
+ @Test
+ public void testPlainTextSpecifyingResourceName() throws Exception {
+ ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
+ assertTrue("handler is null and it shouldn't be", handler != null);
+
+ // Load plain text specifying filename
+ loadLocal("extraction/version_control.txt", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
+ "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
+ "fmap.Author", "extractedAuthor",
+ "literal.id", "one",
+ "fmap.language", "extractedLanguage",
+ "fmap.X-Parsed-By", "ignored_parser",
+ "fmap.content", "extractedContent",
+ ExtractingParams.RESOURCE_NAME, "extraction/version_control.txt"
+ );
+ assertQ(req("extractedContent:Apache"), "//*[@numFound='0']");
+ assertU(commit());
+ assertQ(req("extractedContent:Apache"), "//*[@numFound='1']");
+ }
+
+ @Test
+ public void testCommitWithin() throws Exception {
+ ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
+ assertTrue("handler is null and it shouldn't be", handler != null);
+
+ SolrQueryRequest req = req("literal.id", "one",
+ ExtractingParams.RESOURCE_NAME, "extraction/version_control.txt",
+ "commitWithin", "200"
+ );
+ SolrQueryResponse rsp = new SolrQueryResponse();
+ BufferingRequestProcessor p = new BufferingRequestProcessor(null);
+
+ ExtractingDocumentLoader loader = (ExtractingDocumentLoader) handler.newLoader(req, p);
+ loader.load(req, rsp, new ContentStreamBase.FileStream(getFile("extraction/version_control.txt")),p);
+
+ AddUpdateCommand add = p.addCommands.get(0);
+ assertEquals(200, add.commitWithin);
+
+ req.close();
+ }
+
+ // Note: If you load a plain text file specifying neither MIME type nor filename, extraction will silently fail. This is because Tika's
+ // automatic MIME type detection will fail, and it will default to using an empty-string-returning default parser
+
+ @Test
+ public void testExtractOnly() throws Exception {
+ ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
+ assertTrue("handler is null and it shouldn't be", handler != null);
+ SolrQueryResponse rsp = loadLocal("extraction/solr-word.pdf", ExtractingParams.EXTRACT_ONLY, "true");
+ assertTrue("rsp is null and it shouldn't be", rsp != null);
+ NamedList list = rsp.getValues();
+
+ String extraction = (String) list.get("solr-word.pdf");
+ assertTrue("extraction is null and it shouldn't be", extraction != null);
+ assertTrue(extraction + " does not contain " + "solr-word", extraction.indexOf("solr-word") != -1);
+
+ NamedList nl = (NamedList) list.get("solr-word.pdf_metadata");
+ assertTrue("nl is null and it shouldn't be", nl != null);
+ Object title = nl.get("title");
+ assertTrue("title is null and it shouldn't be", title != null);
+ assertTrue(extraction.indexOf("<?xml") != -1);
+
+ rsp = loadLocal("extraction/solr-word.pdf", ExtractingParams.EXTRACT_ONLY, "true",
+ ExtractingParams.EXTRACT_FORMAT, ExtractingDocumentLoader.TEXT_FORMAT);
+ assertTrue("rsp is null and it shouldn't be", rsp != null);
+ list = rsp.getValues();
+
+ extraction = (String) list.get("solr-word.pdf");
+ assertTrue("extraction is null and it shouldn't be", extraction != null);
+ assertTrue(extraction + " does not contain " + "solr-word", extraction.indexOf("solr-word") != -1);
+ assertTrue(extraction.indexOf("<?xml") == -1);
+
+ nl = (NamedList) list.get("solr-word.pdf_metadata");
+ assertTrue("nl is null and it shouldn't be", nl != null);
+ title = nl.get("title");
+ assertTrue("title is null and it shouldn't be", title != null);
+
+
+
+ }
+
+ @Test
+ public void testXPath() throws Exception {
+ ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
+ assertTrue("handler is null and it shouldn't be", handler != null);
+ SolrQueryResponse rsp = loadLocal("extraction/example.html",
+ ExtractingParams.XPATH_EXPRESSION, "/xhtml:html/xhtml:body/xhtml:a/descendant::node()",
+ ExtractingParams.EXTRACT_ONLY, "true"
+ );
+ assertTrue("rsp is null and it shouldn't be", rsp != null);
+ NamedList list = rsp.getValues();
+ String val = (String) list.get("example.html");
+ assertEquals("News", val.trim()); //there is only one matching <a> tag
+
+ loadLocal("extraction/example.html",
+ "literal.id", "example1",
+ "captureAttr", "true",
+ "defaultField", "text",
+ "capture", "div",
+ "fmap.div", "foo_t",
+ "boost.foo_t", "3",
+ "xpath", "/xhtml:html/xhtml:body/xhtml:div//node()",
+ "commit", "true"
+ );
+ assertQ(req("+id:example1 +foo_t:\"here is some text in a div\""), "//*[@numFound='1']");
+ }
+
+ /** test arabic PDF extraction is functional */
+ @Test
+ public void testArabicPDF() throws Exception {
+ ExtractingRequestHandler handler = (ExtractingRequestHandler)
+ h.getCore().getRequestHandler("/update/extract");
+ assertTrue("handler is null and it shouldn't be", handler != null);
+
+ loadLocal("extraction/arabic.pdf", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
+ "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
+ "fmap.Creation-Date", "extractedDate",
+ "fmap.Author", "extractedAuthor",
+ "uprefix", "ignored_",
+ "fmap.content", "wdf_nocase",
+ "literal.id", "one",
+ "fmap.Last-Modified", "extractedDate");
+ assertQ(req("wdf_nocase:السلم"), "//result[@numFound=0]");
+ assertU(commit());
+ assertQ(req("wdf_nocase:السلم"), "//result[@numFound=1]");
+ }
+
+ @Test
+ public void testTikaExceptionHandling() throws Exception {
+ ExtractingRequestHandler handler = (ExtractingRequestHandler)
+ h.getCore().getRequestHandler("/update/extract");
+ assertTrue("handler is null and it shouldn't be", handler != null);
+
+ try{
+ loadLocal("extraction/password-is-solrcell.docx",
+ "literal.id", "one");
+ fail("TikaException is expected because of trying to extract text from password protected word file without supplying a password.");
+ }
+ catch(Exception expected){}
+ assertU(commit());
+ assertQ(req("*:*"), "//result[@numFound=0]");
+
+ try{
+ loadLocal("extraction/password-is-solrcell.docx", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
+ "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
+ "fmap.Creation-Date", "extractedDate",
+ "uprefix", "ignored_",
+ "fmap.Author", "extractedAuthor",
+ "fmap.content", "wdf_nocase",
+ "literal.id", "one",
+ "ignoreTikaException", "true", // set ignore flag
+ "fmap.Last-Modified", "extractedDate");
+ }
+ catch(Exception e){
+ fail("TikaException should be ignored.");
+ }
+ assertU(commit());
+ assertQ(req("*:*"), "//result[@numFound=1]");
+ }
+
+ @Test
+ public void testWrongStreamType() throws Exception {
+ ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
+ assertTrue("handler is null and it shouldn't be", handler != null);
+
+ try{
+ // Load plain text specifying another mime type, should fail
+ loadLocal("extraction/version_control.txt",
+ "literal.id", "one",
+ ExtractingParams.STREAM_TYPE, "application/pdf"
+ );
+ fail("SolrException is expected because wrong parser specified for the file type");
+ }
+ catch(Exception expected){}
+
+ try{
+ // Load plain text specifying non existing mimetype, should fail
+ loadLocal("extraction/version_control.txt",
+ "literal.id", "one",
+ ExtractingParams.STREAM_TYPE, "foo/bar"
+ );
+ fail("SolrException is expected because nonexsisting parser specified");
+ }
+ catch(Exception expected){}
+ }
+
+ // Verifies literal.* request-parameter handling: by default a literal value
+ // overrides the Tika-extracted value for the same field, while
+ // literalsOverride=false restores the old additive behaviour where the
+ // literal is kept alongside the extracted value.
+ public void testLiteralsOverride() throws Exception {
+ ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
+ assertTrue("handler is null and it shouldn't be", handler != null);
+
+ // Index must start empty so the numFound assertions below are meaningful.
+ assertQ(req("*:*"), "//*[@numFound='0']");
+
+ // Here Tika should parse out a title for this document:
+ loadLocal("extraction/solr-word.pdf",
+ "fmap.created", "extractedDate",
+ "fmap.producer", "extractedProducer",
+ "fmap.creator", "extractedCreator",
+ "fmap.Keywords", "extractedKeywords",
+ "fmap.Author", "extractedAuthor",
+ "literal.id", "three",
+ "fmap.content", "extractedContent",
+ "fmap.language", "extractedLanguage",
+ "fmap.Creation-Date", "extractedDate",
+ "uprefix", "ignored_",
+ "fmap.Last-Modified", "extractedDate");
+
+ // Here the literal value should override the Tika-parsed title:
+ loadLocal("extraction/solr-word.pdf",
+ "literal.title", "wolf-man",
+ "fmap.created", "extractedDate",
+ "fmap.producer", "extractedProducer",
+ "fmap.creator", "extractedCreator",
+ "fmap.Keywords", "extractedKeywords",
+ "fmap.Author", "extractedAuthor",
+ "literal.id", "four",
+ "fmap.content", "extractedContent",
+ "fmap.language", "extractedLanguage",
+ "fmap.Creation-Date", "extractedDate",
+ "uprefix", "ignored_",
+ "fmap.Last-Modified", "extractedDate");
+
+ // Here we mimic the old behaviour where literals are added, not overridden
+ loadLocal("extraction/solr-word.pdf",
+ "literalsOverride", "false",
+ // Trick - we first map the metadata-title to an ignored field before we replace with literal title
+ "fmap.title", "ignored_a",
+ "literal.title", "old-behaviour",
+ "literal.extractedKeywords", "literalkeyword",
+ "fmap.created", "extractedDate",
+ "fmap.producer", "extractedProducer",
+ "fmap.creator", "extractedCreator",
+ "fmap.Keywords", "extractedKeywords",
+ "fmap.Author", "extractedAuthor",
+ "literal.id", "five",
+ "fmap.content", "extractedContent",
+ "fmap.language", "extractedLanguage",
+ "fmap.Creation-Date", "extractedDate",
+ "uprefix", "ignored_",
+ "fmap.Last-Modified", "extractedDate");
+
+ assertU(commit());
+
+ // Doc "three" kept the extracted title, doc "four" got the literal
+ // override, and doc "five" kept both extracted and literal keywords.
+ assertQ(req("title:solr-word"), "//*[@numFound='1']");
+ assertQ(req("title:wolf-man"), "//*[@numFound='1']");
+ assertQ(req("extractedKeywords:(solr AND word AND pdf AND literalkeyword)"), "//*[@numFound='1']");
+ }
+
+ @Test
+ public void testPdfWithImages() throws Exception {
+ //Tests possibility to configure ParseContext (by example to extract embedded images from pdf)
+ // The fixture PDF carries an embedded image; with inline-image extraction
+ // configured, its name is expected to show up in the mapped content field.
+ loadLocal("extraction/pdf-with-image.pdf",
+ "fmap.created", "extractedDate",
+ "fmap.producer", "extractedProducer",
+ "fmap.creator", "extractedCreator",
+ "fmap.Keywords", "extractedKeywords",
+ "fmap.Creation-Date", "extractedDate",
+ "uprefix", "ignored_",
+ "fmap.Author", "extractedAuthor",
+ "fmap.content", "wdf_nocase",
+ "literal.id", "pdfWithImage",
+ "resource.name", "pdf-with-image.pdf",
+ "resource.password", "solrRules",
+ "fmap.Last-Modified", "extractedDate");
+
+ // Not searchable before commit, searchable after — proves the image token
+ // came from this indexing pass.
+ assertQ(req("wdf_nocase:\"embedded:image0.jpg\""), "//*[@numFound='0']");
+ assertU(commit());
+ assertQ(req("wdf_nocase:\"embedded:image0.jpg\""), "//*[@numFound='1']");
+ }
+
+ @Test
+ public void testPasswordProtected() throws Exception {
+ // Indexes the same encrypted PDF and DOCX twice each: once with an
+ // explicit resource.password, once resolving the password from a
+ // passwordsFile regex mapping. Each pair must yield two matching docs.
+ // PDF, Passwords from resource.password
+ loadLocal("extraction/encrypted-password-is-solrRules.pdf",
+ "fmap.created", "extractedDate",
+ "fmap.producer", "extractedProducer",
+ "fmap.creator", "extractedCreator",
+ "fmap.Keywords", "extractedKeywords",
+ "fmap.Creation-Date", "extractedDate",
+ "uprefix", "ignored_",
+ "fmap.Author", "extractedAuthor",
+ "fmap.content", "wdf_nocase",
+ "literal.id", "pdfpwliteral",
+ "resource.name", "encrypted-password-is-solrRules.pdf",
+ "resource.password", "solrRules",
+ "fmap.Last-Modified", "extractedDate");
+
+ // PDF, Passwords from passwords property file
+ loadLocal("extraction/encrypted-password-is-solrRules.pdf",
+ "fmap.created", "extractedDate",
+ "fmap.producer", "extractedProducer",
+ "fmap.creator", "extractedCreator",
+ "fmap.Keywords", "extractedKeywords",
+ "fmap.Creation-Date", "extractedDate",
+ "uprefix", "ignored_",
+ "fmap.Author", "extractedAuthor",
+ "fmap.content", "wdf_nocase",
+ "literal.id", "pdfpwfile",
+ "resource.name", "encrypted-password-is-solrRules.pdf",
+ "passwordsFile", "passwordRegex.properties", // Passwords-file
+ "fmap.Last-Modified", "extractedDate");
+
+ // DOCX, Explicit password
+ loadLocal("extraction/password-is-Word2010.docx",
+ "fmap.created", "extractedDate",
+ "fmap.producer", "extractedProducer",
+ "fmap.creator", "extractedCreator",
+ "fmap.Keywords", "extractedKeywords",
+ "fmap.Creation-Date", "extractedDate",
+ "fmap.Author", "extractedAuthor",
+ "fmap.content", "wdf_nocase",
+ "uprefix", "ignored_",
+ "literal.id", "docxpwliteral",
+ "resource.name", "password-is-Word2010.docx",
+ "resource.password", "Word2010", // Explicit password
+ "fmap.Last-Modified", "extractedDate");
+
+ // DOCX, Passwords from file
+ loadLocal("extraction/password-is-Word2010.docx",
+ "fmap.created", "extractedDate",
+ "fmap.producer", "extractedProducer",
+ "fmap.creator", "extractedCreator",
+ "fmap.Keywords", "extractedKeywords",
+ "fmap.Creation-Date", "extractedDate",
+ "uprefix", "ignored_",
+ "fmap.Author", "extractedAuthor",
+ "fmap.content", "wdf_nocase",
+ "literal.id", "docxpwfile",
+ "resource.name", "password-is-Word2010.docx",
+ "passwordsFile", "passwordRegex.properties", // Passwords-file
+ "fmap.Last-Modified", "extractedDate");
+
+ assertU(commit());
+ // NOTE(review): this sleep looks unnecessary — the new searcher should be
+ // visible once commit() returns; confirm before removing (possible flakiness
+ // workaround).
+ Thread.sleep(100);
+ assertQ(req("wdf_nocase:\"This is a test of PDF\""), "//*[@numFound='2']");
+ assertQ(req("wdf_nocase:\"Test password protected word doc\""), "//*[@numFound='2']");
+ }
+
+ // Sends the given test-resource file to the named request handler as a
+ // local content stream, passing args through as request parameters.
+ // Always closes the request, even when the handler throws.
+ SolrQueryResponse loadLocalFromHandler(String handler, String filename,
+ String... args) throws Exception {
+
+ LocalSolrQueryRequest req = (LocalSolrQueryRequest) req(args);
+ try {
+ // TODO: stop using locally defined streams once stream.file and
+ // stream.body work everywhere
+ List<ContentStream> cs = new ArrayList<>();
+ cs.add(new ContentStreamBase.FileStream(getFile(filename)));
+ req.setContentStreams(cs);
+ return h.queryAndResponse(handler, req);
+ } finally {
+ req.close();
+ }
+ }
+
+ // Convenience overload: extract the file through the standard
+ // /update/extract handler.
+ SolrQueryResponse loadLocal(String filename, String... args) throws Exception {
+ return loadLocalFromHandler("/update/extract", filename, args);
+ }
+
+
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test/java/org/apache/solr/handler/extraction/ParseContextConfigTest.java
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test/java/org/apache/solr/handler/extraction/ParseContextConfigTest.java b/solr/contrib/extraction/src/test/java/org/apache/solr/handler/extraction/ParseContextConfigTest.java
new file mode 100644
index 0000000..8aeeaad
--- /dev/null
+++ b/solr/contrib/extraction/src/test/java/org/apache/solr/handler/extraction/ParseContextConfigTest.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.handler.extraction;
+
+import javax.xml.parsers.DocumentBuilderFactory;
+import java.nio.file.Paths;
+
+import org.apache.solr.SolrTestCaseJ4;
+import org.apache.solr.core.SolrResourceLoader;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.pdf.PDFParserConfig;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+
+public class ParseContextConfigTest extends SolrTestCaseJ4 {
+
+ // Builds a minimal <entries><entry><property/></entry></entries> DOM by
+ // hand and checks that ParseContextConfig materializes it into a Tika
+ // ParseContext whose PDFParserConfig has the configured property applied.
+ public void testAll() throws Exception {
+ Document document = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
+ Element entries = document.createElement("entries");
+ Element entry = document.createElement("entry");
+
+
+ // NOTE(review): "class" (the ParseContext key) and "impl" (the concrete
+ // type to instantiate) are intentionally the same here; they may differ
+ // in general — confirm against ParseContextConfig's parsing code.
+ entry.setAttribute("class", "org.apache.tika.parser.pdf.PDFParserConfig");
+ entry.setAttribute("impl", "org.apache.tika.parser.pdf.PDFParserConfig");
+
+ Element property = document.createElement("property");
+
+ // The property must end up reflected by getExtractInlineImages() below.
+ property.setAttribute("name", "extractInlineImages");
+ property.setAttribute("value", "true");
+ entry.appendChild(property);
+ entries.appendChild(entry);
+
+ ParseContext parseContext = new ParseContextConfig(new SolrResourceLoader(Paths.get(".")), entries).create();
+
+ PDFParserConfig pdfParserConfig = parseContext.get(PDFParserConfig.class);
+
+ assertEquals(true, pdfParserConfig.getExtractInlineImages());
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test/java/org/apache/solr/handler/extraction/TestXLSXResponseWriter.java
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test/java/org/apache/solr/handler/extraction/TestXLSXResponseWriter.java b/solr/contrib/extraction/src/test/java/org/apache/solr/handler/extraction/TestXLSXResponseWriter.java
new file mode 100644
index 0000000..7d37844
--- /dev/null
+++ b/solr/contrib/extraction/src/test/java/org/apache/solr/handler/extraction/TestXLSXResponseWriter.java
@@ -0,0 +1,257 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.handler.extraction;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.time.Instant;
+import java.util.Date;
+
+import org.apache.poi.ss.usermodel.Cell;
+import org.apache.poi.ss.usermodel.Row;
+import org.apache.poi.xssf.usermodel.XSSFWorkbook;
+import org.apache.poi.xssf.usermodel.XSSFSheet;
+
+import org.apache.solr.SolrTestCaseJ4;
+import org.apache.solr.common.SolrDocument;
+import org.apache.solr.common.SolrDocumentList;
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.QueryResponseWriter;
+import org.apache.solr.response.SolrQueryResponse;
+import org.apache.solr.response.RawResponseWriter;
+import org.apache.solr.search.SolrReturnFields;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+public class TestXLSXResponseWriter extends SolrTestCaseJ4 {
+
+ // Writer under test, resolved once from the core's "xlsx" registration.
+ private static XLSXResponseWriter writerXlsx;
+
+ @BeforeClass
+ public static void beforeClass() throws Exception {
+ System.setProperty("enable.update.log", "false");
+ initCore("solrconfig.xml","schema.xml",getFile("extraction/solr").getAbsolutePath());
+ createIndex();
+ //find a reference to the default response writer so we can redirect its output later
+ SolrCore testCore = h.getCore();
+ QueryResponseWriter writer = testCore.getQueryResponseWriter("xlsx");
+ if (writer instanceof XLSXResponseWriter) {
+ // Reuse the instance we already looked up instead of querying the core again.
+ writerXlsx = (XLSXResponseWriter) writer;
+ } else {
+ throw new Exception("XLSXResponseWriter not registered with solr core");
+ }
+ }
+
+ // Seeds four docs covering single-valued, multi-valued and unstored fields.
+ public static void createIndex() {
+ assertU(adoc("id","1", "foo_i","-1", "foo_s","hi", "foo_l","12345678987654321", "foo_b","false", "foo_f","1.414","foo_d","-1.0E300","foo_dt1","2000-01-02T03:04:05Z"));
+ assertU(adoc("id","2", "v_ss","hi", "v_ss","there", "v2_ss","nice", "v2_ss","output", "shouldbeunstored","foo"));
+ assertU(adoc("id","3", "shouldbeunstored","foo"));
+ assertU(adoc("id","4", "foo_s1","foo"));
+ assertU(commit());
+ }
+
+ @AfterClass
+ public static void cleanupWriter() throws Exception {
+ // Drop the static reference so the core (and its writer) can be GC'd.
+ writerXlsx = null;
+ }
+
+ @Test
+ public void testStructuredDataViaBaseWriters() throws Exception {
+ SolrQueryResponse rsp = new SolrQueryResponse();
+ // Don't send a ContentStream back, this will fall back to the configured base writer.
+ // But abuse the CONTENT key to ensure writer is also checking type
+ rsp.add(RawResponseWriter.CONTENT, "test");
+ rsp.add("foo", "bar");
+
+ SolrQueryRequest r = req();
+
+ // check Content-Type
+ assertEquals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", writerXlsx.getContentType(r, rsp));
+
+ // test our basic types,and that fields come back in the requested order
+ XSSFSheet resultSheet = getWSResultForQuery(req("q","id:1", "wt","xlsx", "fl","id,foo_s,foo_i,foo_l,foo_b,foo_f,foo_d,foo_dt1"));
+
+ assertEquals("id,foo_s,foo_i,foo_l,foo_b,foo_f,foo_d,foo_dt1\n1,hi,-1,12345678987654321,F,1.414,-1.0E300,2000-01-02T03:04:05Z\n"
+ , getStringFromSheet(resultSheet));
+
+ resultSheet = getWSResultForQuery(req("q","id:1^0", "wt","xlsx", "fl","id,score,foo_s"));
+ // test retrieving score
+ assertEquals("id,score,foo_s\n1,0.0,hi\n", getStringFromSheet(resultSheet));
+
+ resultSheet = getWSResultForQuery(req("q","id:1^0", "wt","xlsx", "colname.id", "I.D.", "colwidth.id", "10",
+ "fl","id,score,foo_s"));
+ // test override colname/width
+ assertEquals("I.D.,score,foo_s\n1,0.0,hi\n", getStringFromSheet(resultSheet));
+ // test colwidth (value returned is in 256ths of a character as per excel standard)
+ assertEquals(10*256, resultSheet.getColumnWidth(0));
+
+ resultSheet = getWSResultForQuery(req("q","id:2", "wt","xlsx", "fl","id,v_ss"));
+ // test multivalued
+ assertEquals("id,v_ss\n2,hi; there\n", getStringFromSheet(resultSheet));
+
+ // test retrieving fields from index
+ // (fixed: "wt" was misspelled "xslx" here, inconsistent with every other call)
+ resultSheet = getWSResultForQuery(req("q","*:*", "wt","xlsx", "fl","*,score"));
+ String result = getStringFromSheet(resultSheet);
+ for (String field : "id,foo_s,foo_i,foo_l,foo_b,foo_f,foo_d,foo_dt1,v_ss,v2_ss,score".split(",")) {
+ assertTrue(result.indexOf(field) >= 0);
+ }
+
+ // test null values
+ resultSheet = getWSResultForQuery(req("q","id:2", "wt","xlsx", "fl","id,foo_s,v_ss"));
+ assertEquals("id,foo_s,v_ss\n2,,hi; there\n", getStringFromSheet(resultSheet));
+
+ // now test SolrDocumentList
+ SolrDocument d = new SolrDocument();
+ SolrDocument d1 = d;
+ d.addField("id","1");
+ d.addField("foo_i",-1);
+ d.addField("foo_s","hi");
+ d.addField("foo_l","12345678987654321L");
+ d.addField("foo_b",false);
+ d.addField("foo_f",1.414f);
+ d.addField("foo_d",-1.0E300);
+ d.addField("foo_dt1", new Date(Instant.parse("2000-01-02T03:04:05Z").toEpochMilli()));
+ d.addField("score", "2.718");
+
+ d = new SolrDocument();
+ SolrDocument d2 = d;
+ d.addField("id","2");
+ d.addField("v_ss","hi");
+ d.addField("v_ss","there");
+ d.addField("v2_ss","nice");
+ d.addField("v2_ss","output");
+ d.addField("score", "89.83");
+ d.addField("shouldbeunstored","foo");
+
+ SolrDocumentList sdl = new SolrDocumentList();
+ sdl.add(d1);
+ sdl.add(d2);
+
+ SolrQueryRequest req = req("q","*:*");
+ rsp = new SolrQueryResponse();
+ rsp.addResponse(sdl);
+
+ rsp.setReturnFields( new SolrReturnFields("id,foo_s", req) );
+
+ resultSheet = getWSResultForQuery(req, rsp);
+ assertEquals("id,foo_s\n1,hi\n2,\n", getStringFromSheet(resultSheet));
+
+ // try scores
+ rsp.setReturnFields( new SolrReturnFields("id,score,foo_s", req) );
+
+ resultSheet = getWSResultForQuery(req, rsp);
+ assertEquals("id,score,foo_s\n1,2.718,hi\n2,89.83,\n", getStringFromSheet(resultSheet));
+
+ // get field values from docs... should be ordered and not include score unless requested
+ rsp.setReturnFields( new SolrReturnFields("*", req) );
+
+ resultSheet = getWSResultForQuery(req, rsp);
+ assertEquals("id,foo_i,foo_s,foo_l,foo_b,foo_f,foo_d,foo_dt1,v_ss,v2_ss\n" +
+ "1,-1,hi,12345678987654321L,false,1.414,-1.0E300,2000-01-02T03:04:05Z,,\n" +
+ "2,,,,,,,,hi; there,nice; output\n", getStringFromSheet(resultSheet));
+
+ // get field values and scores - just check that the scores are there... we don't guarantee where
+ rsp.setReturnFields( new SolrReturnFields("*,score", req) );
+ resultSheet = getWSResultForQuery(req, rsp);
+ String s = getStringFromSheet(resultSheet);
+ assertTrue(s.indexOf("score") >=0 && s.indexOf("2.718") > 0 && s.indexOf("89.83") > 0 );
+
+ // Test field globs
+ rsp.setReturnFields( new SolrReturnFields("id,foo*", req) );
+ resultSheet = getWSResultForQuery(req, rsp);
+ assertEquals("id,foo_i,foo_s,foo_l,foo_b,foo_f,foo_d,foo_dt1\n" +
+ "1,-1,hi,12345678987654321L,false,1.414,-1.0E300,2000-01-02T03:04:05Z\n" +
+ "2,,,,,,,\n", getStringFromSheet(resultSheet));
+
+ rsp.setReturnFields( new SolrReturnFields("id,*_d*", req) );
+ resultSheet = getWSResultForQuery(req, rsp);
+ assertEquals("id,foo_d,foo_dt1\n" +
+ "1,-1.0E300,2000-01-02T03:04:05Z\n" +
+ "2,,\n", getStringFromSheet(resultSheet));
+
+ // Test function queries
+ rsp.setReturnFields( new SolrReturnFields("sum(1,1),id,exists(foo_s1),div(9,1),foo_f", req) );
+ resultSheet = getWSResultForQuery(req, rsp);
+ assertEquals("sum(1,1),id,exists(foo_s1),div(9,1),foo_f\n" +
+ ",1,,,1.414\n" +
+ ",2,,,\n", getStringFromSheet(resultSheet));
+
+ // Test transformers
+ rsp.setReturnFields( new SolrReturnFields("mydocid:[docid],[explain]", req) );
+ resultSheet = getWSResultForQuery(req, rsp);
+ assertEquals("mydocid,[explain]\n" +
+ ",\n" +
+ ",\n", getStringFromSheet(resultSheet));
+
+ req.close();
+ }
+
+
+ @Test
+ public void testPseudoFields() throws Exception {
+ // Use Pseudo Field
+ SolrQueryRequest req = req("q","id:1", "wt","xlsx", "fl","XXX:id,foo_s");
+ XSSFSheet resultSheet = getWSResultForQuery(req);
+ assertEquals("XXX,foo_s\n1,hi\n", getStringFromSheet(resultSheet));
+
+ String txt = getStringFromSheet(getWSResultForQuery(req("q","id:1", "wt","xlsx", "fl","XXX:id,YYY:[docid],FOO:foo_s")));
+ String[] lines = txt.split("\n");
+ assertEquals(2, lines.length);
+ assertEquals("XXX,YYY,FOO", lines[0] );
+ assertEquals("1,0,hi", lines[1] );
+
+ //assertions specific to multiple pseudofields functions like abs, div, exists, etc.. (SOLR-5423)
+ String funcText = getStringFromSheet(getWSResultForQuery(req("df", "text", "q","*", "wt","xlsx", "fl","XXX:id,YYY:exists(foo_s1)")));
+ String[] funcLines = funcText.split("\n");
+ assertEquals(5, funcLines.length);
+ assertEquals("XXX,YYY", funcLines[0] );
+ assertEquals("1,false", funcLines[1] );
+ assertEquals("3,false", funcLines[3] );
+ }
+
+ // returns first worksheet as XLSXResponseWriter only returns one sheet
+ private XSSFSheet getWSResultForQuery(SolrQueryRequest req) throws Exception {
+ SolrQueryResponse rsp = h.queryAndResponse("", req);
+ return getWSResultForQuery(req, rsp);
+ }
+
+ // Serializes the response through the writer, then parses the produced
+ // workbook back with POI and returns its first (only) sheet.
+ private XSSFSheet getWSResultForQuery(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception {
+ ByteArrayOutputStream xmlBout = new ByteArrayOutputStream();
+ writerXlsx.write(xmlBout, req, rsp);
+ XSSFWorkbook output = new XSSFWorkbook(new ByteArrayInputStream(xmlBout.toByteArray()));
+ XSSFSheet sheet = output.getSheetAt(0);
+ req.close();
+ output.close();
+ return sheet;
+ }
+
+ // Flattens a sheet to CSV-ish text: cells joined by ',', rows by '\n'.
+ private String getStringFromSheet(XSSFSheet sheet) {
+ StringBuilder output = new StringBuilder();
+ for (Row row: sheet) {
+ for (Cell cell: row) {
+ output.append(cell.getStringCellValue());
+ output.append(",");
+ }
+ output.setLength(output.length() - 1);
+ output.append("\n");
+ }
+ return output.toString();
+ }
+}