You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by da...@apache.org on 2018/11/02 15:27:29 UTC
[2/5] lucene-solr:jira/gradle: Add :solr:contrib:extraction module
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java b/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java
deleted file mode 100644
index 132b371..0000000
--- a/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java
+++ /dev/null
@@ -1,777 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.solr.handler.extraction;
-
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Locale;
-import java.util.TimeZone;
-
-import org.apache.solr.SolrTestCaseJ4;
-import org.apache.solr.common.SolrException;
-import org.apache.solr.common.util.ContentStream;
-import org.apache.solr.common.util.ContentStreamBase;
-import org.apache.solr.common.util.NamedList;
-import org.apache.solr.request.LocalSolrQueryRequest;
-import org.apache.solr.request.SolrQueryRequest;
-import org.apache.solr.response.SolrQueryResponse;
-import org.apache.solr.update.AddUpdateCommand;
-import org.apache.solr.update.processor.BufferingRequestProcessor;
-import org.junit.Before;
-import org.junit.BeforeClass;
-import org.junit.Test;
-
-
-/**
- *
- *
- **/
-public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
-
- @BeforeClass
- public static void beforeClass() throws Exception {
- // Is the JDK/env affected by a known bug?
- final String tzDisplayName = TimeZone.getDefault().getDisplayName(false, TimeZone.SHORT, Locale.US);
- if (!tzDisplayName.matches("[A-Z]{3,}([+-]\\d\\d(:\\d\\d)?)?")) {
- assertTrue("Is some other JVM affected? Or bad regex? TzDisplayName: " + tzDisplayName,
- System.getProperty("java.version").startsWith("11"));
- assumeTrue("SOLR-12759 JDK 11 (1st release) and Tika 1.x can result in extracting dates in a bad format.", false);
- }
-
- initCore("solrconfig.xml", "schema.xml", getFile("extraction/solr").getAbsolutePath());
- }
-
- @Override
- @Before
- public void setUp() throws Exception {
- super.setUp();
- clearIndex();
- assertU(commit());
- }
-
- @Test
- public void testExtraction() throws Exception {
- ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
- assertTrue("handler is null and it shouldn't be", handler != null);
- loadLocal("extraction/solr-word.pdf",
- "fmap.created", "extractedDate",
- "fmap.producer", "extractedProducer",
- "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
- "fmap.Creation-Date", "extractedDate",
- "uprefix", "ignored_",
- "fmap.Author", "extractedAuthor",
- "fmap.content", "extractedContent",
- "literal.id", "one",
- "fmap.Last-Modified", "extractedDate"
- );
- assertQ(req("title:solr-word"), "//*[@numFound='0']");
- assertU(commit());
- assertQ(req("title:solr-word"), "//*[@numFound='1']");
-
-
- loadLocal("extraction/simple.html", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
- "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
- "fmap.Author", "extractedAuthor",
- "fmap.language", "extractedLanguage",
- "literal.id", "two",
- "uprefix", "ignored_",
- "fmap.content", "extractedContent",
- "fmap.Last-Modified", "extractedDate"
- );
- assertQ(req("title:Welcome"), "//*[@numFound='0']");
- assertU(commit());
- assertQ(req("title:Welcome"), "//*[@numFound='1']");
-
- assertQ(req("extractedContent:distinctwords"), "//*[@numFound='0']");
- assertQ(req("extractedContent:distinct"), "//*[@numFound='1']");
- assertQ(req("extractedContent:words"), "//*[@numFound='2']");
- assertQ(req("extractedContent:\"distinct words\""), "//*[@numFound='1']");
-
- loadLocal("extraction/simple.html",
- "literal.id","simple2",
- "uprefix", "t_",
- "lowernames", "true",
- "captureAttr", "true",
- "fmap.a","t_href",
- "fmap.content_type", "abcxyz", // test that lowernames is applied before mapping, and uprefix is applied after mapping
- "commit", "true" // test immediate commit
- );
-
- // test that purposely causes a failure to print out the doc for test debugging
- // assertQ(req("q","id:simple2","indent","true"), "//*[@numFound='0']");
-
- // test both lowernames and unknown field mapping
- //assertQ(req("+id:simple2 +t_content_type:[* TO *]"), "//*[@numFound='1']");
- assertQ(req("+id:simple2 +t_href:[* TO *]"), "//*[@numFound='1']");
- assertQ(req("+id:simple2 +t_abcxyz:[* TO *]"), "//*[@numFound='1']");
- assertQ(req("+id:simple2 +t_content:serif"), "//*[@numFound='0']"); // make sure <style> content is excluded
- assertQ(req("+id:simple2 +t_content:blur"), "//*[@numFound='0']"); // make sure <script> content is excluded
-
- // make sure the fact there is an index-time boost does not fail the parsing
- loadLocal("extraction/simple.html",
- "literal.id","simple3",
- "uprefix", "t_",
- "lowernames", "true",
- "captureAttr", "true", "fmap.a","t_href",
- "commit", "true"
-
- ,"boost.t_href", "100.0"
- );
-
- assertQ(req("t_href:http"), "//*[@numFound='2']");
- assertQ(req("t_href:http"), "//doc[2]/str[.='simple3']");
- assertQ(req("+id:simple3 +t_content_type:[* TO *]"), "//*[@numFound='1']");//test lowercase and then uprefix
-
- loadLocal("extraction/version_control.xml", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
- "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
- "fmap.Author", "extractedAuthor",
- "literal.id", "three",
- "uprefix", "ignored_",
- "fmap.content", "extractedContent",
- "fmap.language", "extractedLanguage",
- "fmap.Last-Modified", "extractedDate"
- );
- assertQ(req("stream_name:version_control.xml"), "//*[@numFound='0']");
- assertU(commit());
- assertQ(req("stream_name:version_control.xml"), "//*[@numFound='1']");
-
- loadLocal("extraction/word2003.doc", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
- "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
- "fmap.Author", "extractedAuthor",
- "literal.id", "four",
- "uprefix", "ignored_",
- "fmap.content", "extractedContent",
- "fmap.language", "extractedLanguage",
- "fmap.Last-Modified", "extractedDate"
- );
- assertQ(req("title:\"Word 2003 Title\""), "//*[@numFound='0']");
- // There is already a PDF file with this content:
- assertQ(req("extractedContent:\"This is a test of PDF and Word extraction in Solr, it is only a test\""), "//*[@numFound='1']");
- assertU(commit());
- assertQ(req("title:\"Word 2003 Title\""), "//*[@numFound='1']");
- // now 2 of them:
- assertQ(req("extractedContent:\"This is a test of PDF and Word extraction in Solr, it is only a test\""), "//*[@numFound='2']");
-
- // compressed file
- loadLocal("extraction/tiny.txt.gz",
- "fmap.created", "extractedDate",
- "fmap.producer", "extractedProducer",
- "fmap.creator", "extractedCreator",
- "fmap.Keywords", "extractedKeywords",
- "fmap.Author", "extractedAuthor",
- "uprefix", "ignored_",
- "fmap.content", "extractedContent",
- "fmap.language", "extractedLanguage",
- "fmap.Last-Modified", "extractedDate",
- "literal.id", "tiny.txt.gz");
- assertU(commit());
- assertQ(req("id:tiny.txt.gz")
- , "//*[@numFound='1']"
- , "//*/arr[@name='stream_name']/str[.='tiny.txt.gz']"
- );
-
- // compressed file
- loadLocal("extraction/open-document.odt",
- "uprefix", "ignored_",
- "fmap.content", "extractedContent",
- "literal.id", "open-document");
- assertU(commit());
- assertQ(req("extractedContent:\"Práctica sobre GnuPG\"")
- , "//*[@numFound='1']"
- , "//*/arr[@name='stream_name']/str[.='open-document.odt']"
- );
- }
-
- @Test
- public void testCapture() throws Exception {
- loadLocal("extraction/simple.html",
- "literal.id","capture1",
- "uprefix","t_",
- "capture","div",
- "fmap.div", "foo_t",
- "commit", "true"
- );
- assertQ(req("+id:capture1 +t_content:Solr"), "//*[@numFound='1']");
- assertQ(req("+id:capture1 +foo_t:\"here is some text in a div\""), "//*[@numFound='1']");
-
- loadLocal("extraction/simple.html",
- "literal.id", "capture2",
- "captureAttr", "true",
- "defaultField", "text",
- "fmap.div", "div_t",
- "fmap.a", "anchor_t",
- "capture", "div",
- "capture", "a",
- "commit", "true"
- );
- assertQ(req("+id:capture2 +text:Solr"), "//*[@numFound='1']");
- assertQ(req("+id:capture2 +div_t:\"here is some text in a div\""), "//*[@numFound='1']");
- assertQ(req("+id:capture2 +anchor_t:http\\://www.apache.org"), "//*[@numFound='1']");
- assertQ(req("+id:capture2 +anchor_t:link"), "//*[@numFound='1']");
- }
-
- @Test
- public void testDefaultField() throws Exception {
- ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
- assertTrue("handler is null and it shouldn't be", handler != null);
- try {
- ignoreException("unknown field 'a'");
- ignoreException("unknown field 'meta'"); // TODO: should this exception be happening?
- loadLocal("extraction/simple.html",
- "literal.id","simple2",
- "lowernames", "true",
- "captureAttr", "true",
- //"fmap.content_type", "abcxyz",
- "commit", "true" // test immediate commit
- );
- fail("Should throw SolrException");
- } catch (SolrException e) {
- //do nothing
- } finally {
- resetExceptionIgnores();
- }
-
-
- loadLocal("extraction/simple.html",
- "literal.id","simple2",
- ExtractingParams.DEFAULT_FIELD, "defaultExtr",//test that unmapped fields go to the text field when no uprefix is specified
- "lowernames", "true",
- "captureAttr", "true",
- //"fmap.content_type", "abcxyz",
- "commit", "true" // test immediate commit
- );
- assertQ(req("id:simple2"), "//*[@numFound='1']");
- assertQ(req("defaultExtr:http\\:\\/\\/www.apache.org"), "//*[@numFound='1']");
-
- //Test when both uprefix and default are specified.
- loadLocal("extraction/simple.html",
- "literal.id","simple2",
- ExtractingParams.DEFAULT_FIELD, "defaultExtr",//test that unmapped fields go to the text field when no uprefix is specified
- ExtractingParams.UNKNOWN_FIELD_PREFIX, "t_",
- "lowernames", "true",
- "captureAttr", "true",
- "fmap.a","t_href",
- //"fmap.content_type", "abcxyz",
- "commit", "true" // test immediate commit
- );
- assertQ(req("+id:simple2 +t_href:[* TO *]"), "//*[@numFound='1']");
- }
-
- @Test
- public void testLiterals() throws Exception {
- ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
- assertTrue("handler is null and it shouldn't be", handler != null);
- //test literal
- loadLocal("extraction/version_control.xml", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
- "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
- "fmap.Author", "extractedAuthor",
- "fmap.content", "extractedContent",
- "literal.id", "one",
- "uprefix", "ignored_",
- "fmap.language", "extractedLanguage",
- "literal.extractionLiteralMV", "one",
- "literal.extractionLiteralMV", "two",
- "fmap.Last-Modified", "extractedDate"
-
- );
- assertQ(req("stream_name:version_control.xml"), "//*[@numFound='0']");
- assertU(commit());
- assertQ(req("stream_name:version_control.xml"), "//*[@numFound='1']");
-
- assertQ(req("extractionLiteralMV:one"), "//*[@numFound='1']");
- assertQ(req("extractionLiteralMV:two"), "//*[@numFound='1']");
-
- try {
- loadLocal("extraction/version_control.xml", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
- "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
- "fmap.Author", "extractedAuthor",
- "fmap.content", "extractedContent",
- "literal.id", "two",
- "fmap.language", "extractedLanguage",
- "literal.extractionLiteral", "one",
- "literal.extractionLiteral", "two",
- "fmap.X-Parsed-By", "ignored_parser",
- "fmap.Last-Modified", "extractedDate"
- );
- // TODO: original author did not specify why an exception should be thrown... how to fix?
- // assertTrue("Exception should have been thrown", false);
- } catch (SolrException e) {
- //nothing to see here, move along
- }
-
- loadLocal("extraction/version_control.xml", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
- "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
- "fmap.Author", "extractedAuthor",
- "fmap.content", "extractedContent",
- "literal.id", "three",
- "fmap.language", "extractedLanguage",
- "literal.extractionLiteral", "one",
- "fmap.X-Parsed-By", "ignored_parser",
- "fmap.Last-Modified", "extractedDate"
- );
- assertU(commit());
- assertQ(req("extractionLiteral:one"), "//*[@numFound='1']");
-
- }
-
- public void testLiteralDefaults() throws Exception {
-
- // sanity check config
- loadLocalFromHandler("/update/extract/lit-def",
- "extraction/simple.html",
- "literal.id", "lit-def-simple");
- assertU(commit());
- assertQ(req("q", "id:lit-def-simple")
- , "//*[@numFound='1']"
- , "count(//arr[@name='foo_s']/str)=1"
- , "//arr[@name='foo_s']/str[.='x']"
- , "count(//arr[@name='bar_s']/str)=1"
- , "//arr[@name='bar_s']/str[.='y']"
- , "count(//arr[@name='zot_s']/str)=1"
- , "//arr[@name='zot_s']/str[.='z']"
- );
-
- // override the default foo_s
- loadLocalFromHandler("/update/extract/lit-def",
- "extraction/simple.html",
- "literal.foo_s", "1111",
- "literal.id", "lit-def-simple");
- assertU(commit());
- assertQ(req("q", "id:lit-def-simple")
- , "//*[@numFound='1']"
- , "count(//arr[@name='foo_s']/str)=1"
- , "//arr[@name='foo_s']/str[.='1111']"
- , "count(//arr[@name='bar_s']/str)=1"
- , "//arr[@name='bar_s']/str[.='y']"
- , "count(//arr[@name='zot_s']/str)=1"
- , "//arr[@name='zot_s']/str[.='z']"
- );
-
- // pre-pend the bar_s
- loadLocalFromHandler("/update/extract/lit-def",
- "extraction/simple.html",
- "literal.bar_s", "2222",
- "literal.id", "lit-def-simple");
- assertU(commit());
- assertQ(req("q", "id:lit-def-simple")
- , "//*[@numFound='1']"
- , "count(//arr[@name='foo_s']/str)=1"
- , "//arr[@name='foo_s']/str[.='x']"
- , "count(//arr[@name='bar_s']/str)=2"
- , "//arr[@name='bar_s']/str[.='2222']"
- , "//arr[@name='bar_s']/str[.='y']"
- , "count(//arr[@name='zot_s']/str)=1"
- , "//arr[@name='zot_s']/str[.='z']"
- );
-
- // invariant zot_s can not be changed
- loadLocalFromHandler("/update/extract/lit-def",
- "extraction/simple.html",
- "literal.zot_s", "3333",
- "literal.id", "lit-def-simple");
- assertU(commit());
- assertQ(req("q", "id:lit-def-simple")
- , "//*[@numFound='1']"
- , "count(//arr[@name='foo_s']/str)=1"
- , "//arr[@name='foo_s']/str[.='x']"
- , "count(//arr[@name='bar_s']/str)=1"
- , "//arr[@name='bar_s']/str[.='y']"
- , "count(//arr[@name='zot_s']/str)=1"
- , "//arr[@name='zot_s']/str[.='z']"
- );
-
- }
-
- @Test
- public void testPlainTextSpecifyingMimeType() throws Exception {
- ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
- assertTrue("handler is null and it shouldn't be", handler != null);
-
- // Load plain text specifying MIME type:
- loadLocal("extraction/version_control.txt", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
- "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
- "fmap.Author", "extractedAuthor",
- "literal.id", "one",
- "fmap.language", "extractedLanguage",
- "fmap.X-Parsed-By", "ignored_parser",
- "fmap.content", "extractedContent",
- ExtractingParams.STREAM_TYPE, "text/plain"
- );
- assertQ(req("extractedContent:Apache"), "//*[@numFound='0']");
- assertU(commit());
- assertQ(req("extractedContent:Apache"), "//*[@numFound='1']");
- }
-
- @Test
- public void testPlainTextSpecifyingResourceName() throws Exception {
- ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
- assertTrue("handler is null and it shouldn't be", handler != null);
-
- // Load plain text specifying filename
- loadLocal("extraction/version_control.txt", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
- "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
- "fmap.Author", "extractedAuthor",
- "literal.id", "one",
- "fmap.language", "extractedLanguage",
- "fmap.X-Parsed-By", "ignored_parser",
- "fmap.content", "extractedContent",
- ExtractingParams.RESOURCE_NAME, "extraction/version_control.txt"
- );
- assertQ(req("extractedContent:Apache"), "//*[@numFound='0']");
- assertU(commit());
- assertQ(req("extractedContent:Apache"), "//*[@numFound='1']");
- }
-
- @Test
- public void testCommitWithin() throws Exception {
- ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
- assertTrue("handler is null and it shouldn't be", handler != null);
-
- SolrQueryRequest req = req("literal.id", "one",
- ExtractingParams.RESOURCE_NAME, "extraction/version_control.txt",
- "commitWithin", "200"
- );
- SolrQueryResponse rsp = new SolrQueryResponse();
- BufferingRequestProcessor p = new BufferingRequestProcessor(null);
-
- ExtractingDocumentLoader loader = (ExtractingDocumentLoader) handler.newLoader(req, p);
- loader.load(req, rsp, new ContentStreamBase.FileStream(getFile("extraction/version_control.txt")),p);
-
- AddUpdateCommand add = p.addCommands.get(0);
- assertEquals(200, add.commitWithin);
-
- req.close();
- }
-
- // Note: If you load a plain text file specifying neither MIME type nor filename, extraction will silently fail. This is because Tika's
- // automatic MIME type detection will fail, and it will default to using an empty-string-returning default parser
-
- @Test
- public void testExtractOnly() throws Exception {
- ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
- assertTrue("handler is null and it shouldn't be", handler != null);
- SolrQueryResponse rsp = loadLocal("extraction/solr-word.pdf", ExtractingParams.EXTRACT_ONLY, "true");
- assertTrue("rsp is null and it shouldn't be", rsp != null);
- NamedList list = rsp.getValues();
-
- String extraction = (String) list.get("solr-word.pdf");
- assertTrue("extraction is null and it shouldn't be", extraction != null);
- assertTrue(extraction + " does not contain " + "solr-word", extraction.indexOf("solr-word") != -1);
-
- NamedList nl = (NamedList) list.get("solr-word.pdf_metadata");
- assertTrue("nl is null and it shouldn't be", nl != null);
- Object title = nl.get("title");
- assertTrue("title is null and it shouldn't be", title != null);
- assertTrue(extraction.indexOf("<?xml") != -1);
-
- rsp = loadLocal("extraction/solr-word.pdf", ExtractingParams.EXTRACT_ONLY, "true",
- ExtractingParams.EXTRACT_FORMAT, ExtractingDocumentLoader.TEXT_FORMAT);
- assertTrue("rsp is null and it shouldn't be", rsp != null);
- list = rsp.getValues();
-
- extraction = (String) list.get("solr-word.pdf");
- assertTrue("extraction is null and it shouldn't be", extraction != null);
- assertTrue(extraction + " does not contain " + "solr-word", extraction.indexOf("solr-word") != -1);
- assertTrue(extraction.indexOf("<?xml") == -1);
-
- nl = (NamedList) list.get("solr-word.pdf_metadata");
- assertTrue("nl is null and it shouldn't be", nl != null);
- title = nl.get("title");
- assertTrue("title is null and it shouldn't be", title != null);
-
-
-
- }
-
- @Test
- public void testXPath() throws Exception {
- ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
- assertTrue("handler is null and it shouldn't be", handler != null);
- SolrQueryResponse rsp = loadLocal("extraction/example.html",
- ExtractingParams.XPATH_EXPRESSION, "/xhtml:html/xhtml:body/xhtml:a/descendant::node()",
- ExtractingParams.EXTRACT_ONLY, "true"
- );
- assertTrue("rsp is null and it shouldn't be", rsp != null);
- NamedList list = rsp.getValues();
- String val = (String) list.get("example.html");
- assertEquals("News", val.trim()); //there is only one matching <a> tag
-
- loadLocal("extraction/example.html",
- "literal.id", "example1",
- "captureAttr", "true",
- "defaultField", "text",
- "capture", "div",
- "fmap.div", "foo_t",
- "boost.foo_t", "3",
- "xpath", "/xhtml:html/xhtml:body/xhtml:div//node()",
- "commit", "true"
- );
- assertQ(req("+id:example1 +foo_t:\"here is some text in a div\""), "//*[@numFound='1']");
- }
-
- /** test arabic PDF extraction is functional */
- @Test
- public void testArabicPDF() throws Exception {
- ExtractingRequestHandler handler = (ExtractingRequestHandler)
- h.getCore().getRequestHandler("/update/extract");
- assertTrue("handler is null and it shouldn't be", handler != null);
-
- loadLocal("extraction/arabic.pdf", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
- "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
- "fmap.Creation-Date", "extractedDate",
- "fmap.Author", "extractedAuthor",
- "uprefix", "ignored_",
- "fmap.content", "wdf_nocase",
- "literal.id", "one",
- "fmap.Last-Modified", "extractedDate");
- assertQ(req("wdf_nocase:السلم"), "//result[@numFound=0]");
- assertU(commit());
- assertQ(req("wdf_nocase:السلم"), "//result[@numFound=1]");
- }
-
- @Test
- public void testTikaExceptionHandling() throws Exception {
- ExtractingRequestHandler handler = (ExtractingRequestHandler)
- h.getCore().getRequestHandler("/update/extract");
- assertTrue("handler is null and it shouldn't be", handler != null);
-
- try{
- loadLocal("extraction/password-is-solrcell.docx",
- "literal.id", "one");
- fail("TikaException is expected because of trying to extract text from password protected word file without supplying a password.");
- }
- catch(Exception expected){}
- assertU(commit());
- assertQ(req("*:*"), "//result[@numFound=0]");
-
- try{
- loadLocal("extraction/password-is-solrcell.docx", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
- "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
- "fmap.Creation-Date", "extractedDate",
- "uprefix", "ignored_",
- "fmap.Author", "extractedAuthor",
- "fmap.content", "wdf_nocase",
- "literal.id", "one",
- "ignoreTikaException", "true", // set ignore flag
- "fmap.Last-Modified", "extractedDate");
- }
- catch(Exception e){
- fail("TikaException should be ignored.");
- }
- assertU(commit());
- assertQ(req("*:*"), "//result[@numFound=1]");
- }
-
- @Test
- public void testWrongStreamType() throws Exception {
- ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
- assertTrue("handler is null and it shouldn't be", handler != null);
-
- try{
- // Load plain text specifying another mime type, should fail
- loadLocal("extraction/version_control.txt",
- "literal.id", "one",
- ExtractingParams.STREAM_TYPE, "application/pdf"
- );
- fail("SolrException is expected because wrong parser specified for the file type");
- }
- catch(Exception expected){}
-
- try{
- // Load plain text specifying non existing mimetype, should fail
- loadLocal("extraction/version_control.txt",
- "literal.id", "one",
- ExtractingParams.STREAM_TYPE, "foo/bar"
- );
- fail("SolrException is expected because nonexsisting parser specified");
- }
- catch(Exception expected){}
- }
-
- public void testLiteralsOverride() throws Exception {
- ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
- assertTrue("handler is null and it shouldn't be", handler != null);
-
- assertQ(req("*:*"), "//*[@numFound='0']");
-
- // Here Tika should parse out a title for this document:
- loadLocal("extraction/solr-word.pdf",
- "fmap.created", "extractedDate",
- "fmap.producer", "extractedProducer",
- "fmap.creator", "extractedCreator",
- "fmap.Keywords", "extractedKeywords",
- "fmap.Author", "extractedAuthor",
- "literal.id", "three",
- "fmap.content", "extractedContent",
- "fmap.language", "extractedLanguage",
- "fmap.Creation-Date", "extractedDate",
- "uprefix", "ignored_",
- "fmap.Last-Modified", "extractedDate");
-
- // Here the literal value should override the Tika-parsed title:
- loadLocal("extraction/solr-word.pdf",
- "literal.title", "wolf-man",
- "fmap.created", "extractedDate",
- "fmap.producer", "extractedProducer",
- "fmap.creator", "extractedCreator",
- "fmap.Keywords", "extractedKeywords",
- "fmap.Author", "extractedAuthor",
- "literal.id", "four",
- "fmap.content", "extractedContent",
- "fmap.language", "extractedLanguage",
- "fmap.Creation-Date", "extractedDate",
- "uprefix", "ignored_",
- "fmap.Last-Modified", "extractedDate");
-
- // Here we mimic the old behaviour where literals are added, not overridden
- loadLocal("extraction/solr-word.pdf",
- "literalsOverride", "false",
- // Trick - we first map the metadata-title to an ignored field before we replace with literal title
- "fmap.title", "ignored_a",
- "literal.title", "old-behaviour",
- "literal.extractedKeywords", "literalkeyword",
- "fmap.created", "extractedDate",
- "fmap.producer", "extractedProducer",
- "fmap.creator", "extractedCreator",
- "fmap.Keywords", "extractedKeywords",
- "fmap.Author", "extractedAuthor",
- "literal.id", "five",
- "fmap.content", "extractedContent",
- "fmap.language", "extractedLanguage",
- "fmap.Creation-Date", "extractedDate",
- "uprefix", "ignored_",
- "fmap.Last-Modified", "extractedDate");
-
- assertU(commit());
-
- assertQ(req("title:solr-word"), "//*[@numFound='1']");
- assertQ(req("title:wolf-man"), "//*[@numFound='1']");
- assertQ(req("extractedKeywords:(solr AND word AND pdf AND literalkeyword)"), "//*[@numFound='1']");
- }
-
- @Test
- public void testPdfWithImages() throws Exception {
- //Tests possibility to configure ParseContext (by example to extract embedded images from pdf)
- loadLocal("extraction/pdf-with-image.pdf",
- "fmap.created", "extractedDate",
- "fmap.producer", "extractedProducer",
- "fmap.creator", "extractedCreator",
- "fmap.Keywords", "extractedKeywords",
- "fmap.Creation-Date", "extractedDate",
- "uprefix", "ignored_",
- "fmap.Author", "extractedAuthor",
- "fmap.content", "wdf_nocase",
- "literal.id", "pdfWithImage",
- "resource.name", "pdf-with-image.pdf",
- "resource.password", "solrRules",
- "fmap.Last-Modified", "extractedDate");
-
- assertQ(req("wdf_nocase:\"embedded:image0.jpg\""), "//*[@numFound='0']");
- assertU(commit());
- assertQ(req("wdf_nocase:\"embedded:image0.jpg\""), "//*[@numFound='1']");
- }
-
- @Test
- public void testPasswordProtected() throws Exception {
- // PDF, Passwords from resource.password
- loadLocal("extraction/encrypted-password-is-solrRules.pdf",
- "fmap.created", "extractedDate",
- "fmap.producer", "extractedProducer",
- "fmap.creator", "extractedCreator",
- "fmap.Keywords", "extractedKeywords",
- "fmap.Creation-Date", "extractedDate",
- "uprefix", "ignored_",
- "fmap.Author", "extractedAuthor",
- "fmap.content", "wdf_nocase",
- "literal.id", "pdfpwliteral",
- "resource.name", "encrypted-password-is-solrRules.pdf",
- "resource.password", "solrRules",
- "fmap.Last-Modified", "extractedDate");
-
- // PDF, Passwords from passwords property file
- loadLocal("extraction/encrypted-password-is-solrRules.pdf",
- "fmap.created", "extractedDate",
- "fmap.producer", "extractedProducer",
- "fmap.creator", "extractedCreator",
- "fmap.Keywords", "extractedKeywords",
- "fmap.Creation-Date", "extractedDate",
- "uprefix", "ignored_",
- "fmap.Author", "extractedAuthor",
- "fmap.content", "wdf_nocase",
- "literal.id", "pdfpwfile",
- "resource.name", "encrypted-password-is-solrRules.pdf",
- "passwordsFile", "passwordRegex.properties", // Passwords-file
- "fmap.Last-Modified", "extractedDate");
-
- // DOCX, Explicit password
- loadLocal("extraction/password-is-Word2010.docx",
- "fmap.created", "extractedDate",
- "fmap.producer", "extractedProducer",
- "fmap.creator", "extractedCreator",
- "fmap.Keywords", "extractedKeywords",
- "fmap.Creation-Date", "extractedDate",
- "fmap.Author", "extractedAuthor",
- "fmap.content", "wdf_nocase",
- "uprefix", "ignored_",
- "literal.id", "docxpwliteral",
- "resource.name", "password-is-Word2010.docx",
- "resource.password", "Word2010", // Explicit password
- "fmap.Last-Modified", "extractedDate");
-
- // DOCX, Passwords from file
- loadLocal("extraction/password-is-Word2010.docx",
- "fmap.created", "extractedDate",
- "fmap.producer", "extractedProducer",
- "fmap.creator", "extractedCreator",
- "fmap.Keywords", "extractedKeywords",
- "fmap.Creation-Date", "extractedDate",
- "uprefix", "ignored_",
- "fmap.Author", "extractedAuthor",
- "fmap.content", "wdf_nocase",
- "literal.id", "docxpwfile",
- "resource.name", "password-is-Word2010.docx",
- "passwordsFile", "passwordRegex.properties", // Passwords-file
- "fmap.Last-Modified", "extractedDate");
-
- assertU(commit());
- Thread.sleep(100);
- assertQ(req("wdf_nocase:\"This is a test of PDF\""), "//*[@numFound='2']");
- assertQ(req("wdf_nocase:\"Test password protected word doc\""), "//*[@numFound='2']");
- }
-
- SolrQueryResponse loadLocalFromHandler(String handler, String filename,
- String... args) throws Exception {
-
- LocalSolrQueryRequest req = (LocalSolrQueryRequest) req(args);
- try {
- // TODO: stop using locally defined streams once stream.file and
- // stream.body work everywhere
- List<ContentStream> cs = new ArrayList<>();
- cs.add(new ContentStreamBase.FileStream(getFile(filename)));
- req.setContentStreams(cs);
- return h.queryAndResponse(handler, req);
- } finally {
- req.close();
- }
- }
-
- SolrQueryResponse loadLocal(String filename, String... args) throws Exception {
- return loadLocalFromHandler("/update/extract", filename, args);
- }
-
-
-}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ParseContextConfigTest.java
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ParseContextConfigTest.java b/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ParseContextConfigTest.java
deleted file mode 100644
index 8aeeaad..0000000
--- a/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ParseContextConfigTest.java
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.solr.handler.extraction;
-
-import javax.xml.parsers.DocumentBuilderFactory;
-import java.nio.file.Paths;
-
-import org.apache.solr.SolrTestCaseJ4;
-import org.apache.solr.core.SolrResourceLoader;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.pdf.PDFParserConfig;
-import org.w3c.dom.Document;
-import org.w3c.dom.Element;
-
-public class ParseContextConfigTest extends SolrTestCaseJ4 {
-
- public void testAll() throws Exception {
- Document document = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
- Element entries = document.createElement("entries");
- Element entry = document.createElement("entry");
-
-
- entry.setAttribute("class", "org.apache.tika.parser.pdf.PDFParserConfig");
- entry.setAttribute("impl", "org.apache.tika.parser.pdf.PDFParserConfig");
-
- Element property = document.createElement("property");
-
- property.setAttribute("name", "extractInlineImages");
- property.setAttribute("value", "true");
- entry.appendChild(property);
- entries.appendChild(entry);
-
- ParseContext parseContext = new ParseContextConfig(new SolrResourceLoader(Paths.get(".")), entries).create();
-
- PDFParserConfig pdfParserConfig = parseContext.get(PDFParserConfig.class);
-
- assertEquals(true, pdfParserConfig.getExtractInlineImages());
- }
-
-}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/TestXLSXResponseWriter.java
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/TestXLSXResponseWriter.java b/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/TestXLSXResponseWriter.java
deleted file mode 100644
index 7d37844..0000000
--- a/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/TestXLSXResponseWriter.java
+++ /dev/null
@@ -1,257 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.solr.handler.extraction;
-
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.time.Instant;
-import java.util.Date;
-
-import org.apache.poi.ss.usermodel.Cell;
-import org.apache.poi.ss.usermodel.Row;
-import org.apache.poi.xssf.usermodel.XSSFWorkbook;
-import org.apache.poi.xssf.usermodel.XSSFSheet;
-
-import org.apache.solr.SolrTestCaseJ4;
-import org.apache.solr.common.SolrDocument;
-import org.apache.solr.common.SolrDocumentList;
-import org.apache.solr.core.SolrCore;
-import org.apache.solr.request.SolrQueryRequest;
-import org.apache.solr.response.QueryResponseWriter;
-import org.apache.solr.response.SolrQueryResponse;
-import org.apache.solr.response.RawResponseWriter;
-import org.apache.solr.search.SolrReturnFields;
-import org.junit.AfterClass;
-import org.junit.BeforeClass;
-import org.junit.Test;
-
-public class TestXLSXResponseWriter extends SolrTestCaseJ4 {
-
- private static XLSXResponseWriter writerXlsx;
-
- @BeforeClass
- public static void beforeClass() throws Exception {
- System.setProperty("enable.update.log", "false");
- initCore("solrconfig.xml","schema.xml",getFile("extraction/solr").getAbsolutePath());
- createIndex();
- //find a reference to the default response writer so we can redirect its output later
- SolrCore testCore = h.getCore();
- QueryResponseWriter writer = testCore.getQueryResponseWriter("xlsx");
- if (writer instanceof XLSXResponseWriter) {
- writerXlsx = (XLSXResponseWriter) testCore.getQueryResponseWriter("xlsx");
- } else {
- throw new Exception("XLSXResponseWriter not registered with solr core");
- }
- }
-
- public static void createIndex() {
- assertU(adoc("id","1", "foo_i","-1", "foo_s","hi", "foo_l","12345678987654321", "foo_b","false", "foo_f","1.414","foo_d","-1.0E300","foo_dt1","2000-01-02T03:04:05Z"));
- assertU(adoc("id","2", "v_ss","hi", "v_ss","there", "v2_ss","nice", "v2_ss","output", "shouldbeunstored","foo"));
- assertU(adoc("id","3", "shouldbeunstored","foo"));
- assertU(adoc("id","4", "foo_s1","foo"));
- assertU(commit());
- }
-
- @AfterClass
- public static void cleanupWriter() throws Exception {
- writerXlsx = null;
- }
-
- @Test
- public void testStructuredDataViaBaseWriters() throws IOException, Exception {
- SolrQueryResponse rsp = new SolrQueryResponse();
- // Don't send a ContentStream back, this will fall back to the configured base writer.
- // But abuse the CONTENT key to ensure writer is also checking type
- rsp.add(RawResponseWriter.CONTENT, "test");
- rsp.add("foo", "bar");
-
- SolrQueryRequest r = req();
-
- // check Content-Type
- assertEquals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", writerXlsx.getContentType(r, rsp));
-
- // test our basic types,and that fields come back in the requested order
- XSSFSheet resultSheet = getWSResultForQuery(req("q","id:1", "wt","xlsx", "fl","id,foo_s,foo_i,foo_l,foo_b,foo_f,foo_d,foo_dt1"));
-
- assertEquals("id,foo_s,foo_i,foo_l,foo_b,foo_f,foo_d,foo_dt1\n1,hi,-1,12345678987654321,F,1.414,-1.0E300,2000-01-02T03:04:05Z\n"
- , getStringFromSheet(resultSheet));
-
- resultSheet = getWSResultForQuery(req("q","id:1^0", "wt","xlsx", "fl","id,score,foo_s"));
- // test retrieving score
- assertEquals("id,score,foo_s\n1,0.0,hi\n", getStringFromSheet(resultSheet));
-
- resultSheet = getWSResultForQuery(req("q","id:1^0", "wt","xlsx", "colname.id", "I.D.", "colwidth.id", "10",
- "fl","id,score,foo_s"));
- // test override colname/width
- assertEquals("I.D.,score,foo_s\n1,0.0,hi\n", getStringFromSheet(resultSheet));
- // test colwidth (value returned is in 256ths of a character as per excel standard)
- assertEquals(10*256, resultSheet.getColumnWidth(0));
-
- resultSheet = getWSResultForQuery(req("q","id:2", "wt","xlsx", "fl","id,v_ss"));
- // test multivalued
- assertEquals("id,v_ss\n2,hi; there\n", getStringFromSheet(resultSheet));
-
- // test retrieving fields from index
- resultSheet = getWSResultForQuery(req("q","*:*", "wt","xslx", "fl","*,score"));
- String result = getStringFromSheet(resultSheet);
- for (String field : "id,foo_s,foo_i,foo_l,foo_b,foo_f,foo_d,foo_dt1,v_ss,v2_ss,score".split(",")) {
- assertTrue(result.indexOf(field) >= 0);
- }
-
- // test null values
- resultSheet = getWSResultForQuery(req("q","id:2", "wt","xlsx", "fl","id,foo_s,v_ss"));
- assertEquals("id,foo_s,v_ss\n2,,hi; there\n", getStringFromSheet(resultSheet));
-
- // now test SolrDocumentList
- SolrDocument d = new SolrDocument();
- SolrDocument d1 = d;
- d.addField("id","1");
- d.addField("foo_i",-1);
- d.addField("foo_s","hi");
- d.addField("foo_l","12345678987654321L");
- d.addField("foo_b",false);
- d.addField("foo_f",1.414f);
- d.addField("foo_d",-1.0E300);
- d.addField("foo_dt1", new Date(Instant.parse("2000-01-02T03:04:05Z").toEpochMilli()));
- d.addField("score", "2.718");
-
- d = new SolrDocument();
- SolrDocument d2 = d;
- d.addField("id","2");
- d.addField("v_ss","hi");
- d.addField("v_ss","there");
- d.addField("v2_ss","nice");
- d.addField("v2_ss","output");
- d.addField("score", "89.83");
- d.addField("shouldbeunstored","foo");
-
- SolrDocumentList sdl = new SolrDocumentList();
- sdl.add(d1);
- sdl.add(d2);
-
- SolrQueryRequest req = req("q","*:*");
- rsp = new SolrQueryResponse();
- rsp.addResponse(sdl);
-
- rsp.setReturnFields( new SolrReturnFields("id,foo_s", req) );
-
- resultSheet = getWSResultForQuery(req, rsp);
- assertEquals("id,foo_s\n1,hi\n2,\n", getStringFromSheet(resultSheet));
-
- // try scores
- rsp.setReturnFields( new SolrReturnFields("id,score,foo_s", req) );
-
- resultSheet = getWSResultForQuery(req, rsp);
- assertEquals("id,score,foo_s\n1,2.718,hi\n2,89.83,\n", getStringFromSheet(resultSheet));
-
- // get field values from docs... should be ordered and not include score unless requested
- rsp.setReturnFields( new SolrReturnFields("*", req) );
-
- resultSheet = getWSResultForQuery(req, rsp);
- assertEquals("id,foo_i,foo_s,foo_l,foo_b,foo_f,foo_d,foo_dt1,v_ss,v2_ss\n" +
- "1,-1,hi,12345678987654321L,false,1.414,-1.0E300,2000-01-02T03:04:05Z,,\n" +
- "2,,,,,,,,hi; there,nice; output\n", getStringFromSheet(resultSheet));
-
- // get field values and scores - just check that the scores are there... we don't guarantee where
- rsp.setReturnFields( new SolrReturnFields("*,score", req) );
- resultSheet = getWSResultForQuery(req, rsp);
- String s = getStringFromSheet(resultSheet);
- assertTrue(s.indexOf("score") >=0 && s.indexOf("2.718") > 0 && s.indexOf("89.83") > 0 );
-
- // Test field globs
- rsp.setReturnFields( new SolrReturnFields("id,foo*", req) );
- resultSheet = getWSResultForQuery(req, rsp);
- assertEquals("id,foo_i,foo_s,foo_l,foo_b,foo_f,foo_d,foo_dt1\n" +
- "1,-1,hi,12345678987654321L,false,1.414,-1.0E300,2000-01-02T03:04:05Z\n" +
- "2,,,,,,,\n", getStringFromSheet(resultSheet));
-
- rsp.setReturnFields( new SolrReturnFields("id,*_d*", req) );
- resultSheet = getWSResultForQuery(req, rsp);
- assertEquals("id,foo_d,foo_dt1\n" +
- "1,-1.0E300,2000-01-02T03:04:05Z\n" +
- "2,,\n", getStringFromSheet(resultSheet));
-
- // Test function queries
- rsp.setReturnFields( new SolrReturnFields("sum(1,1),id,exists(foo_s1),div(9,1),foo_f", req) );
- resultSheet = getWSResultForQuery(req, rsp);
- assertEquals("sum(1,1),id,exists(foo_s1),div(9,1),foo_f\n" +
- ",1,,,1.414\n" +
- ",2,,,\n", getStringFromSheet(resultSheet));
-
- // Test transformers
- rsp.setReturnFields( new SolrReturnFields("mydocid:[docid],[explain]", req) );
- resultSheet = getWSResultForQuery(req, rsp);
- assertEquals("mydocid,[explain]\n" +
- ",\n" +
- ",\n", getStringFromSheet(resultSheet));
-
- req.close();
- }
-
-
- @Test
- public void testPseudoFields() throws Exception {
- // Use Pseudo Field
- SolrQueryRequest req = req("q","id:1", "wt","xlsx", "fl","XXX:id,foo_s");
- XSSFSheet resultSheet = getWSResultForQuery(req);
- assertEquals("XXX,foo_s\n1,hi\n", getStringFromSheet(resultSheet));
-
- String txt = getStringFromSheet(getWSResultForQuery(req("q","id:1", "wt","xlsx", "fl","XXX:id,YYY:[docid],FOO:foo_s")));
- String[] lines = txt.split("\n");
- assertEquals(2, lines.length);
- assertEquals("XXX,YYY,FOO", lines[0] );
- assertEquals("1,0,hi", lines[1] );
-
- //assertions specific to multiple pseudofields functions like abs, div, exists, etc.. (SOLR-5423)
- String funcText = getStringFromSheet(getWSResultForQuery(req("df", "text", "q","*", "wt","xlsx", "fl","XXX:id,YYY:exists(foo_s1)")));
- String[] funcLines = funcText.split("\n");
- assertEquals(5, funcLines.length);
- assertEquals("XXX,YYY", funcLines[0] );
- assertEquals("1,false", funcLines[1] );
- assertEquals("3,false", funcLines[3] );
- }
-
- // returns first worksheet as XLSXResponseWriter only returns one sheet
- private XSSFSheet getWSResultForQuery(SolrQueryRequest req) throws IOException, Exception {
- SolrQueryResponse rsp = h.queryAndResponse("", req);
- return getWSResultForQuery(req, rsp);
- }
-
- private XSSFSheet getWSResultForQuery(SolrQueryRequest req, SolrQueryResponse rsp) throws IOException, Exception {
- ByteArrayOutputStream xmlBout = new ByteArrayOutputStream();
- writerXlsx.write(xmlBout, req, rsp);
- XSSFWorkbook output = new XSSFWorkbook(new ByteArrayInputStream(xmlBout.toByteArray()));
- XSSFSheet sheet = output.getSheetAt(0);
- req.close();
- output.close();
- return sheet;
- }
-
- private String getStringFromSheet(XSSFSheet sheet) {
- StringBuilder output = new StringBuilder();
- for (Row row: sheet) {
- for (Cell cell: row) {
- output.append(cell.getStringCellValue());
- output.append(",");
- }
- output.setLength(output.length() - 1);
- output.append("\n");
- }
- return output.toString();
- }
-}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test/resources/extraction/arabic.pdf
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test/resources/extraction/arabic.pdf b/solr/contrib/extraction/src/test/resources/extraction/arabic.pdf
new file mode 100644
index 0000000..3d47b99
Binary files /dev/null and b/solr/contrib/extraction/src/test/resources/extraction/arabic.pdf differ
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test/resources/extraction/encrypted-password-is-solrRules.pdf
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test/resources/extraction/encrypted-password-is-solrRules.pdf b/solr/contrib/extraction/src/test/resources/extraction/encrypted-password-is-solrRules.pdf
new file mode 100644
index 0000000..300a476
Binary files /dev/null and b/solr/contrib/extraction/src/test/resources/extraction/encrypted-password-is-solrRules.pdf differ
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test/resources/extraction/example.html
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test/resources/extraction/example.html b/solr/contrib/extraction/src/test/resources/extraction/example.html
new file mode 100644
index 0000000..5732f62
--- /dev/null
+++ b/solr/contrib/extraction/src/test/resources/extraction/example.html
@@ -0,0 +1,49 @@
+<html>
+<head>
+ <title>Welcome to Solr</title>
+</head>
+<body>
+<p>
+ Here is some text
+</p>
+<div>Here is some text in a div</div>
+<div>This has a <a href="http://www.apache.org">link</a>.</div>
+<a href="#news">News</a>
+<ul class="minitoc">
+<li>
+<a href="#03+October+2008+-+Solr+Logo+Contest">03 October 2008 - Solr Logo Contest</a>
+</li>
+<li>
+<a href="#15+September+2008+-+Solr+1.3.0+Available">15 September 2008 - Solr 1.3.0 Available</a>
+</li>
+<li>
+<a href="#28+August+2008+-+Lucene%2FSolr+at+ApacheCon+New+Orleans">28 August 2008 - Lucene/Solr at ApacheCon New Orleans</a>
+</li>
+<li>
+<a href="#03+September+2007+-+Lucene+at+ApacheCon+Atlanta">03 September 2007 - Lucene at ApacheCon Atlanta</a>
+</li>
+<li>
+<a href="#06+June+2007%3A+Release+1.2+available">06 June 2007: Release 1.2 available</a>
+</li>
+<li>
+<a href="#17+January+2007%3A+Solr+graduates+from+Incubator">17 January 2007: Solr graduates from Incubator</a>
+</li>
+<li>
+<a href="#22+December+2006%3A+Release+1.1.0+available">22 December 2006: Release 1.1.0 available</a>
+</li>
+<li>
+<a href="#15+August+2006%3A+Solr+at+ApacheCon+US">15 August 2006: Solr at ApacheCon US</a>
+</li>
+<li>
+<a href="#21+April+2006%3A+Solr+at+ApacheCon">21 April 2006: Solr at ApacheCon</a>
+</li>
+<li>
+<a href="#21+February+2006%3A+nightly+builds">21 February 2006: nightly builds</a>
+</li>
+<li>
+<a href="#17+January+2006%3A+Solr+Joins+Apache+Incubator">17 January 2006: Solr Joins Apache Incubator</a>
+</li>
+</ul>
+
+</body>
+</html>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test/resources/extraction/open-document.odt
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test/resources/extraction/open-document.odt b/solr/contrib/extraction/src/test/resources/extraction/open-document.odt
new file mode 100644
index 0000000..57f4369
Binary files /dev/null and b/solr/contrib/extraction/src/test/resources/extraction/open-document.odt differ
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test/resources/extraction/password-is-Word2010.docx
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test/resources/extraction/password-is-Word2010.docx b/solr/contrib/extraction/src/test/resources/extraction/password-is-Word2010.docx
new file mode 100644
index 0000000..24010d3
Binary files /dev/null and b/solr/contrib/extraction/src/test/resources/extraction/password-is-Word2010.docx differ
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test/resources/extraction/password-is-solrcell.docx
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test/resources/extraction/password-is-solrcell.docx b/solr/contrib/extraction/src/test/resources/extraction/password-is-solrcell.docx
new file mode 100644
index 0000000..2723d56
Binary files /dev/null and b/solr/contrib/extraction/src/test/resources/extraction/password-is-solrcell.docx differ
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test/resources/extraction/pdf-with-image.pdf
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test/resources/extraction/pdf-with-image.pdf b/solr/contrib/extraction/src/test/resources/extraction/pdf-with-image.pdf
new file mode 100644
index 0000000..b168951
Binary files /dev/null and b/solr/contrib/extraction/src/test/resources/extraction/pdf-with-image.pdf differ
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test/resources/extraction/simple.html
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test/resources/extraction/simple.html b/solr/contrib/extraction/src/test/resources/extraction/simple.html
new file mode 100644
index 0000000..3c807fb
--- /dev/null
+++ b/solr/contrib/extraction/src/test/resources/extraction/simple.html
@@ -0,0 +1,19 @@
+<html>
+<head>
+ <title>Welcome to Solr</title>
+ <style type="text/css">
+ body { font-family: serif; }
+ </style>
+</head>
+<body>
+<p>
+ Here is some text
+</p>
+<p>distinct<br/>words</p>
+<div>Here is some text in a div</div>
+<div>This has a <a href="http://www.apache.org">link</a>.</div>
+</body>
+<script>
+ document.getElementById("div").blur();
+</script>
+</html>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test/resources/extraction/solr-word.pdf
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test/resources/extraction/solr-word.pdf b/solr/contrib/extraction/src/test/resources/extraction/solr-word.pdf
new file mode 100644
index 0000000..bd8b865
Binary files /dev/null and b/solr/contrib/extraction/src/test/resources/extraction/solr-word.pdf differ
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test/resources/extraction/solr/collection1/conf/parseContext.xml
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test/resources/extraction/solr/collection1/conf/parseContext.xml b/solr/contrib/extraction/src/test/resources/extraction/solr/collection1/conf/parseContext.xml
new file mode 100644
index 0000000..574c808
--- /dev/null
+++ b/solr/contrib/extraction/src/test/resources/extraction/solr/collection1/conf/parseContext.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<entries>
+ <entry class="org.apache.tika.parser.pdf.PDFParserConfig" impl="org.apache.tika.parser.pdf.PDFParserConfig">
+ <property name="extractInlineImages" value="true"/>
+ </entry>
+</entries>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test/resources/extraction/solr/collection1/conf/passwordRegex.properties
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test/resources/extraction/solr/collection1/conf/passwordRegex.properties b/solr/contrib/extraction/src/test/resources/extraction/solr/collection1/conf/passwordRegex.properties
new file mode 100644
index 0000000..8a4eff1
--- /dev/null
+++ b/solr/contrib/extraction/src/test/resources/extraction/solr/collection1/conf/passwordRegex.properties
@@ -0,0 +1,7 @@
+# Filename regex -> password map
+# Example any file ending in .doc should use password foobar:
+# .*\.doc = fooBar
+#
+# Note: Apache Tika 1.1 supports password for .pdf and .docx only, not .doc or other formats
+.*\.pdf$ = solrRules
+.*\.docx$ = Word2010
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test/resources/extraction/solr/collection1/conf/protwords.txt
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test/resources/extraction/solr/collection1/conf/protwords.txt b/solr/contrib/extraction/src/test/resources/extraction/solr/collection1/conf/protwords.txt
new file mode 100644
index 0000000..7878147
--- /dev/null
+++ b/solr/contrib/extraction/src/test/resources/extraction/solr/collection1/conf/protwords.txt
@@ -0,0 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#use a protected word file to avoid stemming two
+#unrelated words to the same base word.
+#to test, we will use words that would normally obviously be stemmed.
+cats
+ridding
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test/resources/extraction/solr/collection1/conf/schema.xml
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test/resources/extraction/solr/collection1/conf/schema.xml b/solr/contrib/extraction/src/test/resources/extraction/solr/collection1/conf/schema.xml
new file mode 100644
index 0000000..475c333
--- /dev/null
+++ b/solr/contrib/extraction/src/test/resources/extraction/solr/collection1/conf/schema.xml
@@ -0,0 +1,484 @@
+<?xml version="1.0" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!-- The Solr schema file. This file should be named "schema.xml" and
+ should be located where the classloader for the Solr webapp can find it.
+
+ This schema is used for testing, and as such has everything and the
+ kitchen sink thrown in. See example/solr/conf/schema.xml for a
+ more concise example.
+
+ -->
+
+<schema name="test" version="1.0">
+
+
+ <!--
+ Default numeric field types. For faster range queries, consider the tint/tfloat/tlong/tdouble types.
+ -->
+ <fieldType name="int" class="${solr.tests.IntegerFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="0" positionIncrementGap="0"/>
+ <fieldType name="float" class="${solr.tests.FloatFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="0" positionIncrementGap="0"/>
+ <fieldType name="long" class="${solr.tests.LongFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="0" positionIncrementGap="0"/>
+ <fieldType name="double" class="${solr.tests.DoubleFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="0" positionIncrementGap="0"/>
+
+ <!--
+ Numeric field types that index each value at various levels of precision
+ to accelerate range queries when the number of values between the range
+ endpoints is large. See the javadoc for LegacyNumericRangeQuery for internal
+ implementation details.
+
+ Smaller precisionStep values (specified in bits) will lead to more tokens
+ indexed per value, slightly larger index size, and faster range queries.
+ A precisionStep of 0 disables indexing at different precision levels.
+ -->
+ <fieldType name="tint" class="${solr.tests.IntegerFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="8" positionIncrementGap="0"/>
+ <fieldType name="tfloat" class="${solr.tests.FloatFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="8" positionIncrementGap="0"/>
+ <fieldType name="tlong" class="${solr.tests.LongFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="8" positionIncrementGap="0"/>
+ <fieldType name="tdouble" class="${solr.tests.DoubleFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="8" positionIncrementGap="0"/>
+
+ <!-- Field type demonstrating an Analyzer failure -->
+ <fieldType name="failtype1" class="solr.TextField">
+ <analyzer type="index">
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="0" catenateWords="0"
+ catenateNumbers="0" catenateAll="0"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- Demonstrating ignoreCaseChange -->
+ <fieldType name="wdf_nocase" class="solr.TextField">
+ <analyzer type="index">
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="0" catenateWords="0"
+ catenateNumbers="0" catenateAll="0" splitOnCaseChange="0" preserveOriginal="0"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="0" catenateWords="0"
+ catenateNumbers="0" catenateAll="0" splitOnCaseChange="0" preserveOriginal="0"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <fieldType name="wdf_preserve" class="solr.TextField">
+ <analyzer type="index">
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="1" catenateWords="0"
+ catenateNumbers="0" catenateAll="0" splitOnCaseChange="0" preserveOriginal="1"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="1" catenateWords="0"
+ catenateNumbers="0" catenateAll="0" splitOnCaseChange="0" preserveOriginal="1"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+
+ <!-- HighlitText optimizes storage for (long) columns which will be highlit -->
+ <fieldType name="highlittext" class="solr.TextField" compressThreshold="345"/>
+
+ <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true"/>
+ <fieldType name="string" class="solr.StrField" sortMissingLast="true"/>
+
+ <!-- format for date is 1995-12-31T23:59:59.999Z and only the fractional
+ seconds part (.999) is optional.
+ -->
+ <fieldType name="date" class="${solr.tests.DateFieldType}" docValues="${solr.tests.numeric.dv}" sortMissingLast="true" omitNorms="true"/>
+
+ <!-- solr.TextField allows the specification of custom
+ text analyzers specified as a tokenizer and a list
+ of token filters.
+ -->
+ <fieldType name="text" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.ClassicTokenizerFactory"/>
+ <filter class="solr.ClassicFilterFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.StopFilterFactory"/>
+ <filter class="solr.PorterStemFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+
+ <fieldType name="nametext" class="solr.TextField">
+ <analyzer class="org.apache.lucene.analysis.core.WhitespaceAnalyzer"/>
+ </fieldType>
+
+ <!-- fieldTypes in this section isolate tokenizers and tokenfilters for testing -->
+ <fieldType name="keywordtok" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.MockTokenizerFactory" pattern="keyword"/>
+ </analyzer>
+ </fieldType>
+ <fieldType name="standardtok" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ </analyzer>
+ </fieldType>
+ <fieldType name="lettertok" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.LetterTokenizerFactory"/>
+ </analyzer>
+ </fieldType>
+ <fieldType name="whitetok" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ </analyzer>
+ </fieldType>
+ <fieldType name="HTMLstandardtok" class="solr.TextField">
+ <analyzer>
+ <charFilter class="solr.HTMLStripCharFilterFactory"/>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ </analyzer>
+ </fieldType>
+ <fieldType name="HTMLwhitetok" class="solr.TextField">
+ <analyzer>
+ <charFilter class="solr.HTMLStripCharFilterFactory"/>
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ </analyzer>
+ </fieldType>
+ <fieldType name="standardtokfilt" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.ClassicTokenizerFactory"/>
+ <filter class="solr.ClassicFilterFactory"/>
+ </analyzer>
+ </fieldType>
+ <fieldType name="standardfilt" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.ClassicFilterFactory"/>
+ </analyzer>
+ </fieldType>
+ <fieldType name="lowerfilt" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ </fieldType>
+ <fieldType name="patternreplacefilt" class="solr.TextField">
+ <analyzer type="index">
+ <tokenizer class="solr.MockTokenizerFactory" pattern="keyword"/>
+ <filter class="solr.PatternReplaceFilterFactory"
+ pattern="([^a-zA-Z])" replacement="_" replace="all"
+ />
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.MockTokenizerFactory" pattern="keyword"/>
+ </analyzer>
+ </fieldType>
+ <fieldType name="porterfilt" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.PorterStemFilterFactory"/>
+ </analyzer>
+ </fieldType>
+ <!-- fieldType name="snowballfilt" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.SnowballPorterFilterFactory"/>
+ </analyzer>
+ </fieldType -->
+ <fieldType name="engporterfilt" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.PorterStemFilterFactory"/>
+ </analyzer>
+ </fieldType>
+ <fieldType name="custengporterfilt" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+ <filter class="solr.PorterStemFilterFactory"/>
+ </analyzer>
+ </fieldType>
+ <fieldType name="stopfilt" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true"/>
+ </analyzer>
+ </fieldType>
+ <fieldType name="custstopfilt" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.StopFilterFactory" words="stopwords.txt"/>
+ </analyzer>
+ </fieldType>
+ <fieldType name="lengthfilt" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.LengthFilterFactory" min="2" max="5"/>
+ </analyzer>
+ </fieldType>
+
+ <fieldType name="subword" class="solr.TextField" multiValued="true" positionIncrementGap="100">
+ <analyzer type="index">
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1"
+ catenateNumbers="1" catenateAll="0"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.StopFilterFactory"/>
+ <filter class="solr.PorterStemFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0"
+ catenateNumbers="0" catenateAll="0"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.StopFilterFactory"/>
+ <filter class="solr.PorterStemFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- more flexible in matching skus, but more chance of a false match -->
+ <fieldType name="skutype1" class="solr.TextField">
+ <analyzer type="index">
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1"
+ catenateNumbers="1" catenateAll="0"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1"
+ catenateNumbers="1" catenateAll="0"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- less flexible in matching skus, but less chance of a false match -->
+ <fieldType name="skutype2" class="solr.TextField">
+ <analyzer type="index">
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1"
+ catenateNumbers="1" catenateAll="0"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1"
+ catenateNumbers="1" catenateAll="0"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- synonym expansion applied at both index and query time -->
+ <fieldType name="syn" class="solr.TextField">
+ <analyzer type="index">
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt"/>
+ <filter class="solr.FlattenGraphFilterFactory"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- Demonstrates how RemoveDuplicatesTokenFilter makes stemmed
+ synonyms "better"
+ -->
+ <fieldType name="dedup" class="solr.TextField">
+ <analyzer type="index">
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.SynonymGraphFilterFactory"
+ synonyms="synonyms.txt" expand="true"/>
+ <filter class="solr.PorterStemFilterFactory"/>
+ <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.SynonymGraphFilterFactory"
+ synonyms="synonyms.txt" expand="true"/>
+ <filter class="solr.PorterStemFilterFactory"/>
+ <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <fieldType name="unstored" class="solr.StrField" indexed="true" stored="false"/>
+
+
+ <fieldType name="textgap" class="solr.TextField" multiValued="true" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+
+ <field name="id" type="string" indexed="true" stored="true" multiValued="false" required="false"/>
+ <field name="name" type="nametext" indexed="true" stored="true"/>
+ <field name="text" type="text" indexed="true" stored="false"/>
+ <field name="subject" type="text" indexed="true" stored="true"/>
+ <field name="title" type="nametext" indexed="true" stored="true"/>
+ <field name="weight" type="float" indexed="true" stored="true"/>
+ <field name="bday" type="date" indexed="true" stored="true"/>
+
+ <field name="title_stemmed" type="text" indexed="true" stored="false"/>
+ <field name="title_lettertok" type="lettertok" indexed="true" stored="false"/>
+
+ <field name="syn" type="syn" indexed="true" stored="true"/>
+
+ <!-- to test property inheritance and overriding -->
+ <field name="shouldbeunstored" type="unstored"/>
+ <field name="shouldbestored" type="unstored" stored="true"/>
+ <field name="shouldbeunindexed" type="unstored" indexed="false" stored="true"/>
+
+
+ <!-- test different combinations of indexed and stored -->
+ <field name="bind" type="boolean" indexed="true" stored="false"/>
+ <field name="bsto" type="boolean" indexed="false" stored="true"/>
+ <field name="bindsto" type="boolean" indexed="true" stored="true"/>
+ <field name="isto" type="int" indexed="false" stored="true"/>
+ <field name="iind" type="int" indexed="true" stored="false"/>
+ <field name="ssto" type="string" indexed="false" stored="true"/>
+ <field name="sind" type="string" indexed="true" stored="false"/>
+ <field name="sindsto" type="string" indexed="true" stored="true"/>
+
+ <!-- test combinations of term vector settings -->
+ <field name="test_basictv" type="text" termVectors="true"/>
+ <field name="test_notv" type="text" termVectors="false"/>
+ <field name="test_postv" type="text" termVectors="true" termPositions="true"/>
+ <field name="test_offtv" type="text" termVectors="true" termOffsets="true"/>
+ <field name="test_posofftv" type="text" termVectors="true"
+ termPositions="true" termOffsets="true"/>
+
+ <!-- test highlit field settings -->
+ <field name="test_hlt" type="highlittext" indexed="true"/>
+ <field name="test_hlt_off" type="highlittext" indexed="true"/>
+
+ <!-- fields to test individual tokenizers and tokenfilters -->
+ <field name="keywordtok" type="keywordtok" indexed="true" stored="true"/>
+ <field name="standardtok" type="standardtok" indexed="true" stored="true"/>
+ <field name="HTMLstandardtok" type="HTMLstandardtok" indexed="true" stored="true"/>
+ <field name="lettertok" type="lettertok" indexed="true" stored="true"/>
+ <field name="whitetok" type="whitetok" indexed="true" stored="true"/>
+ <field name="HTMLwhitetok" type="HTMLwhitetok" indexed="true" stored="true"/>
+ <field name="standardtokfilt" type="standardtokfilt" indexed="true" stored="true"/>
+ <field name="standardfilt" type="standardfilt" indexed="true" stored="true"/>
+ <field name="lowerfilt" type="lowerfilt" indexed="true" stored="true"/>
+ <field name="patternreplacefilt" type="patternreplacefilt" indexed="true" stored="true"/>
+ <field name="porterfilt" type="porterfilt" indexed="true" stored="true"/>
+ <field name="engporterfilt" type="engporterfilt" indexed="true" stored="true"/>
+ <field name="custengporterfilt" type="custengporterfilt" indexed="true" stored="true"/>
+ <field name="stopfilt" type="stopfilt" indexed="true" stored="true"/>
+ <field name="custstopfilt" type="custstopfilt" indexed="true" stored="true"/>
+ <field name="lengthfilt" type="lengthfilt" indexed="true" stored="true"/>
+ <field name="dedup" type="dedup" indexed="true" stored="true"/>
+ <field name="wdf_nocase" type="wdf_nocase" indexed="true" stored="true"/>
+ <field name="wdf_preserve" type="wdf_preserve" indexed="true" stored="true"/>
+
+ <field name="numberpartfail" type="failtype1" indexed="true" stored="true"/>
+
+ <field name="nullfirst" type="string" indexed="true" stored="true" sortMissingFirst="true"/>
+
+ <field name="subword" type="subword" indexed="true" stored="true"/>
+ <field name="sku1" type="skutype1" indexed="true" stored="true"/>
+ <field name="sku2" type="skutype2" indexed="true" stored="true"/>
+
+ <field name="textgap" type="textgap" indexed="true" stored="true"/>
+
+ <field name="timestamp" type="date" indexed="true" stored="true" default="NOW" multiValued="false"/>
+ <field name="multiDefault" type="string" indexed="true" stored="true" default="muLti-Default" multiValued="true"/>
+ <field name="intDefault" type="int" indexed="true" stored="true" default="42" multiValued="false"/>
+
+ <field name="extractedDate" type="date" indexed="true" stored="true" multiValued="true"/>
+ <field name="extractedContent" type="text" indexed="true" stored="true" multiValued="true"/>
+ <field name="extractedProducer" type="text" indexed="true" stored="true" multiValued="true"/>
+ <field name="extractedCreator" type="text" indexed="true" stored="true" multiValued="true"/>
+ <field name="extractedKeywords" type="text" indexed="true" stored="true" multiValued="true"/>
+ <field name="extractedAuthor" type="text" indexed="true" stored="true" multiValued="true"/>
+ <field name="extractedLanguage" type="string" indexed="true" stored="true" multiValued="true"/>
+ <field name="resourceName" type="string" indexed="true" stored="true" multiValued="true"/>
+
+ <field name="extractionLiteralMV" type="string" indexed="true" stored="true" multiValued="true"/>
+ <field name="extractionLiteral" type="string" indexed="true" stored="true" multiValued="false"/>
+
+ <field name="defaultExtr" type="string" indexed="true" stored="false"/>
+
+ <!-- Dynamic field definitions. If a field name is not found, dynamicFields
+ will be used if the name matches any of the patterns.
+ RESTRICTION: the glob-like pattern in the name attribute must have
+ a "*" only at the start or the end.
+ EXAMPLE: name="*_i" will match any field ending in _i (like myid_i, z_i)
+ Longer patterns will be matched first. If equal size patterns
+ both match, the first appearing in the schema will be used.
+ -->
+ <dynamicField name="*_i" type="int" indexed="true" stored="true"/>
+ <dynamicField name="*_s" type="string" indexed="true" stored="true"/>
+ <dynamicField name="*_ss" type="string" indexed="true" stored="true" multiValued="true"/>
+ <dynamicField name="*_s1" type="string" indexed="true" stored="true" multiValued="false"/>
+ <dynamicField name="*_l" type="long" indexed="true" stored="true"/>
+ <dynamicField name="*_t" type="text" indexed="true" stored="true"/>
+ <dynamicField name="*_b" type="boolean" indexed="true" stored="true"/>
+ <dynamicField name="*_f" type="float" indexed="true" stored="true"/>
+ <dynamicField name="*_d" type="double" indexed="true" stored="true"/>
+ <dynamicField name="*_dt" type="date" indexed="true" stored="true"/>
+ <dynamicField name="*_dt1" type="date" indexed="true" stored="true" multiValued="false"/>
+
+ <dynamicField name="*_sI" type="string" indexed="true" stored="false"/>
+ <dynamicField name="*_sS" type="string" indexed="false" stored="true"/>
+ <dynamicField name="t_*" type="text" indexed="true" stored="true"/>
+ <dynamicField name="tv_*" type="text" indexed="true" stored="true"
+ termVectors="true" termPositions="true" termOffsets="true"/>
+
+ <dynamicField name="stream_*" type="text" indexed="true" stored="true"/>
+ <dynamicField name="Content*" type="text" indexed="true" stored="true"/>
+
+
+ <!-- special fields for dynamic copyField test -->
+ <dynamicField name="dynamic_*" type="string" indexed="true" stored="true"/>
+ <dynamicField name="*_dynamic" type="string" indexed="true" stored="true"/>
+
+ <!-- for testing to ensure that longer patterns are matched first -->
+ <dynamicField name="*aa" type="string" indexed="true" stored="true"/>
+ <dynamicField name="*aaa" type="int" indexed="false" stored="true"/>
+
+ <!-- ignored because not stored or indexed -->
+ <dynamicField name="ignored_*" type="text" indexed="false" stored="false"/>
+
+
+ <uniqueKey>id</uniqueKey>
+
+ <!-- copyField commands copy one field to another at the time a document
+ is added to the index. It's used either to index the same field different
+ ways, or to add multiple fields to the same field for easier/faster searching.
+ -->
+ <copyField source="title" dest="title_stemmed"/>
+ <copyField source="title" dest="title_lettertok"/>
+
+ <copyField source="title" dest="text"/>
+ <copyField source="subject" dest="text"/>
+
+ <copyField source="*_t" dest="text"/>
+
+ <!-- dynamic destination -->
+ <copyField source="*_dynamic" dest="dynamic_*"/>
+
+
+</schema>