You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by da...@apache.org on 2018/11/02 15:27:29 UTC

[2/5] lucene-solr:jira/gradle: Add :solr:contrib:extraction module

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java b/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java
deleted file mode 100644
index 132b371..0000000
--- a/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java
+++ /dev/null
@@ -1,777 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.solr.handler.extraction;
-
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Locale;
-import java.util.TimeZone;
-
-import org.apache.solr.SolrTestCaseJ4;
-import org.apache.solr.common.SolrException;
-import org.apache.solr.common.util.ContentStream;
-import org.apache.solr.common.util.ContentStreamBase;
-import org.apache.solr.common.util.NamedList;
-import org.apache.solr.request.LocalSolrQueryRequest;
-import org.apache.solr.request.SolrQueryRequest;
-import org.apache.solr.response.SolrQueryResponse;
-import org.apache.solr.update.AddUpdateCommand;
-import org.apache.solr.update.processor.BufferingRequestProcessor;
-import org.junit.Before;
-import org.junit.BeforeClass;
-import org.junit.Test;
-
-
-/**
- *
- *
- **/
-public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
-
-  @BeforeClass
-  public static void beforeClass() throws Exception {
-    // Is the JDK/env affected by a known bug?
-    final String tzDisplayName = TimeZone.getDefault().getDisplayName(false, TimeZone.SHORT, Locale.US);
-    if (!tzDisplayName.matches("[A-Z]{3,}([+-]\\d\\d(:\\d\\d)?)?")) {
-      assertTrue("Is some other JVM affected?  Or bad regex? TzDisplayName: " + tzDisplayName,
-          System.getProperty("java.version").startsWith("11"));
-      assumeTrue("SOLR-12759 JDK 11 (1st release) and Tika 1.x can result in extracting dates in a bad format.", false);
-    }
-
-    initCore("solrconfig.xml", "schema.xml", getFile("extraction/solr").getAbsolutePath());
-  }
-
-  @Override
-  @Before
-  public void setUp() throws Exception {
-    super.setUp();
-    clearIndex();
-    assertU(commit());
-  }
-
-  @Test
-  public void testExtraction() throws Exception {
-    ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
-    assertTrue("handler is null and it shouldn't be", handler != null);
-    loadLocal("extraction/solr-word.pdf",
-            "fmap.created", "extractedDate",
-            "fmap.producer", "extractedProducer",
-            "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
-            "fmap.Creation-Date", "extractedDate",
-            "uprefix", "ignored_",
-            "fmap.Author", "extractedAuthor",
-            "fmap.content", "extractedContent",
-           "literal.id", "one",
-            "fmap.Last-Modified", "extractedDate"
-    );
-    assertQ(req("title:solr-word"), "//*[@numFound='0']");
-    assertU(commit());
-    assertQ(req("title:solr-word"), "//*[@numFound='1']");
-
-
-    loadLocal("extraction/simple.html", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
-            "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
-            "fmap.Author", "extractedAuthor",
-            "fmap.language", "extractedLanguage",
-            "literal.id", "two",
-            "uprefix", "ignored_",
-            "fmap.content", "extractedContent",
-            "fmap.Last-Modified", "extractedDate"
-    );
-    assertQ(req("title:Welcome"), "//*[@numFound='0']");
-    assertU(commit());
-    assertQ(req("title:Welcome"), "//*[@numFound='1']");
-
-    assertQ(req("extractedContent:distinctwords"),      "//*[@numFound='0']");
-    assertQ(req("extractedContent:distinct"),           "//*[@numFound='1']");
-    assertQ(req("extractedContent:words"),              "//*[@numFound='2']");
-    assertQ(req("extractedContent:\"distinct words\""), "//*[@numFound='1']");
-
-    loadLocal("extraction/simple.html",
-      "literal.id","simple2",
-      "uprefix", "t_",
-      "lowernames", "true",
-      "captureAttr", "true",
-      "fmap.a","t_href",
-      "fmap.content_type", "abcxyz",  // test that lowernames is applied before mapping, and uprefix is applied after mapping
-      "commit", "true"  // test immediate commit
-    );
-
-    // test that purposely causes a failure to print out the doc for test debugging
-    // assertQ(req("q","id:simple2","indent","true"), "//*[@numFound='0']");
-
-    // test both lowernames and unknown field mapping
-    //assertQ(req("+id:simple2 +t_content_type:[* TO *]"), "//*[@numFound='1']");
-    assertQ(req("+id:simple2 +t_href:[* TO *]"), "//*[@numFound='1']");
-    assertQ(req("+id:simple2 +t_abcxyz:[* TO *]"), "//*[@numFound='1']");
-    assertQ(req("+id:simple2 +t_content:serif"), "//*[@numFound='0']"); // make sure <style> content is excluded
-    assertQ(req("+id:simple2 +t_content:blur"), "//*[@numFound='0']"); // make sure <script> content is excluded
-
-    // make sure the fact there is an index-time boost does not fail the parsing
-    loadLocal("extraction/simple.html",
-      "literal.id","simple3",
-      "uprefix", "t_",
-      "lowernames", "true",
-      "captureAttr", "true",  "fmap.a","t_href",
-      "commit", "true"
-
-      ,"boost.t_href", "100.0"
-    );
-
-    assertQ(req("t_href:http"), "//*[@numFound='2']");
-    assertQ(req("t_href:http"), "//doc[2]/str[.='simple3']");
-    assertQ(req("+id:simple3 +t_content_type:[* TO *]"), "//*[@numFound='1']");//test lowercase and then uprefix
-
-    loadLocal("extraction/version_control.xml", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
-            "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
-            "fmap.Author", "extractedAuthor",
-            "literal.id", "three",
-            "uprefix", "ignored_",
-            "fmap.content", "extractedContent",
-            "fmap.language", "extractedLanguage",
-            "fmap.Last-Modified", "extractedDate"
-    );
-    assertQ(req("stream_name:version_control.xml"), "//*[@numFound='0']");
-    assertU(commit());
-    assertQ(req("stream_name:version_control.xml"), "//*[@numFound='1']");
-
-    loadLocal("extraction/word2003.doc", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
-            "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
-            "fmap.Author", "extractedAuthor",
-            "literal.id", "four",
-            "uprefix", "ignored_",
-            "fmap.content", "extractedContent",
-            "fmap.language", "extractedLanguage",
-            "fmap.Last-Modified", "extractedDate"
-    );
-    assertQ(req("title:\"Word 2003 Title\""), "//*[@numFound='0']");
-    // There is already a PDF file with this content:
-    assertQ(req("extractedContent:\"This is a test of PDF and Word extraction in Solr, it is only a test\""), "//*[@numFound='1']");
-    assertU(commit());
-    assertQ(req("title:\"Word 2003 Title\""), "//*[@numFound='1']");
-    // now 2 of them:
-    assertQ(req("extractedContent:\"This is a test of PDF and Word extraction in Solr, it is only a test\""), "//*[@numFound='2']");
-
-    // compressed file
-    loadLocal("extraction/tiny.txt.gz", 
-              "fmap.created", "extractedDate", 
-              "fmap.producer", "extractedProducer",
-              "fmap.creator", "extractedCreator", 
-              "fmap.Keywords", "extractedKeywords",
-              "fmap.Author", "extractedAuthor",
-              "uprefix", "ignored_",
-              "fmap.content", "extractedContent",
-              "fmap.language", "extractedLanguage",
-              "fmap.Last-Modified", "extractedDate",
-              "literal.id", "tiny.txt.gz");
-    assertU(commit());
-    assertQ(req("id:tiny.txt.gz")
-            , "//*[@numFound='1']"
-            , "//*/arr[@name='stream_name']/str[.='tiny.txt.gz']"
-            );
-
-    // compressed file
-    loadLocal("extraction/open-document.odt", 
-              "uprefix", "ignored_",
-              "fmap.content", "extractedContent",
-              "literal.id", "open-document");
-    assertU(commit());
-    assertQ(req("extractedContent:\"Práctica sobre GnuPG\"")
-            , "//*[@numFound='1']"
-            , "//*/arr[@name='stream_name']/str[.='open-document.odt']"
-            );
-  }
-
-  @Test
-  public void testCapture() throws Exception {
-    loadLocal("extraction/simple.html",
-        "literal.id","capture1",
-        "uprefix","t_",
-        "capture","div",
-        "fmap.div", "foo_t",
-        "commit", "true"
-    );
-    assertQ(req("+id:capture1 +t_content:Solr"), "//*[@numFound='1']");
-    assertQ(req("+id:capture1 +foo_t:\"here is some text in a div\""), "//*[@numFound='1']");
-
-    loadLocal("extraction/simple.html",
-        "literal.id", "capture2",
-        "captureAttr", "true",
-        "defaultField", "text",
-        "fmap.div", "div_t",
-        "fmap.a", "anchor_t",
-        "capture", "div",
-        "capture", "a",
-        "commit", "true"
-    );
-    assertQ(req("+id:capture2 +text:Solr"), "//*[@numFound='1']");
-    assertQ(req("+id:capture2 +div_t:\"here is some text in a div\""), "//*[@numFound='1']");
-    assertQ(req("+id:capture2 +anchor_t:http\\://www.apache.org"), "//*[@numFound='1']");
-    assertQ(req("+id:capture2 +anchor_t:link"), "//*[@numFound='1']");
-  }
-
-  @Test
-  public void testDefaultField() throws Exception {
-    ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
-    assertTrue("handler is null and it shouldn't be", handler != null);
-    try {
-      ignoreException("unknown field 'a'");
-      ignoreException("unknown field 'meta'");  // TODO: should this exception be happening?
-      loadLocal("extraction/simple.html",
-      "literal.id","simple2",
-      "lowernames", "true",
-        "captureAttr", "true",
-        //"fmap.content_type", "abcxyz",
-        "commit", "true"  // test immediate commit
-      );
-      fail("Should throw SolrException");
-    } catch (SolrException e) {
-      //do nothing
-    } finally {
-      resetExceptionIgnores();
-    }
-    
-
-    loadLocal("extraction/simple.html",
-      "literal.id","simple2",
-      ExtractingParams.DEFAULT_FIELD, "defaultExtr",//test that unmapped fields go to the text field when no uprefix is specified
-      "lowernames", "true",
-      "captureAttr", "true",
-      //"fmap.content_type", "abcxyz",
-      "commit", "true"  // test immediate commit
-    );
-    assertQ(req("id:simple2"), "//*[@numFound='1']");
-    assertQ(req("defaultExtr:http\\:\\/\\/www.apache.org"), "//*[@numFound='1']");
-
-    //Test when both uprefix and default are specified.
-    loadLocal("extraction/simple.html",
-      "literal.id","simple2",
-      ExtractingParams.DEFAULT_FIELD, "defaultExtr",//test that unmapped fields go to the text field when no uprefix is specified
-            ExtractingParams.UNKNOWN_FIELD_PREFIX, "t_",
-      "lowernames", "true",
-      "captureAttr", "true",
-      "fmap.a","t_href",
-      //"fmap.content_type", "abcxyz",
-      "commit", "true"  // test immediate commit
-    );
-    assertQ(req("+id:simple2 +t_href:[* TO *]"), "//*[@numFound='1']");
-  }
-
-  @Test
-  public void testLiterals() throws Exception {
-    ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
-    assertTrue("handler is null and it shouldn't be", handler != null);
-    //test literal
-    loadLocal("extraction/version_control.xml", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
-            "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
-            "fmap.Author", "extractedAuthor",
-            "fmap.content", "extractedContent",
-            "literal.id", "one",
-            "uprefix", "ignored_",
-            "fmap.language", "extractedLanguage",
-            "literal.extractionLiteralMV", "one",
-            "literal.extractionLiteralMV", "two",
-            "fmap.Last-Modified", "extractedDate"
-
-    );
-    assertQ(req("stream_name:version_control.xml"), "//*[@numFound='0']");
-    assertU(commit());
-    assertQ(req("stream_name:version_control.xml"), "//*[@numFound='1']");
-
-    assertQ(req("extractionLiteralMV:one"), "//*[@numFound='1']");
-    assertQ(req("extractionLiteralMV:two"), "//*[@numFound='1']");
-
-    try {
-      loadLocal("extraction/version_control.xml", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
-              "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
-              "fmap.Author", "extractedAuthor",
-              "fmap.content", "extractedContent",
-              "literal.id", "two",
-              "fmap.language", "extractedLanguage",
-              "literal.extractionLiteral", "one",
-              "literal.extractionLiteral", "two",
-              "fmap.X-Parsed-By", "ignored_parser",
-              "fmap.Last-Modified", "extractedDate"
-      );
-      // TODO: original author did not specify why an exception should be thrown... how to fix?
-      // assertTrue("Exception should have been thrown", false);
-    } catch (SolrException e) {
-      //nothing to see here, move along
-    }
-
-    loadLocal("extraction/version_control.xml", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
-            "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
-            "fmap.Author", "extractedAuthor",
-            "fmap.content", "extractedContent",
-            "literal.id", "three",
-            "fmap.language", "extractedLanguage",
-            "literal.extractionLiteral", "one",
-            "fmap.X-Parsed-By", "ignored_parser",
-            "fmap.Last-Modified", "extractedDate"
-    );
-    assertU(commit());
-    assertQ(req("extractionLiteral:one"), "//*[@numFound='1']");
-
-  }
-
-  public void testLiteralDefaults() throws Exception {
-
-    // sanity check config
-    loadLocalFromHandler("/update/extract/lit-def",
-                         "extraction/simple.html",
-                         "literal.id", "lit-def-simple");
-    assertU(commit());
-    assertQ(req("q", "id:lit-def-simple")
-            , "//*[@numFound='1']"
-            , "count(//arr[@name='foo_s']/str)=1"
-            , "//arr[@name='foo_s']/str[.='x']"
-            , "count(//arr[@name='bar_s']/str)=1"
-            , "//arr[@name='bar_s']/str[.='y']"
-            , "count(//arr[@name='zot_s']/str)=1"
-            , "//arr[@name='zot_s']/str[.='z']"
-            ); 
-    
-    // override the default foo_s
-    loadLocalFromHandler("/update/extract/lit-def",
-                         "extraction/simple.html",
-                         "literal.foo_s", "1111",
-                         "literal.id", "lit-def-simple");
-    assertU(commit());
-    assertQ(req("q", "id:lit-def-simple")
-            , "//*[@numFound='1']"
-            , "count(//arr[@name='foo_s']/str)=1"
-            , "//arr[@name='foo_s']/str[.='1111']"
-            , "count(//arr[@name='bar_s']/str)=1"
-            , "//arr[@name='bar_s']/str[.='y']"
-            , "count(//arr[@name='zot_s']/str)=1"
-            , "//arr[@name='zot_s']/str[.='z']"
-            ); 
-
-    // pre-pend the bar_s
-    loadLocalFromHandler("/update/extract/lit-def",
-                         "extraction/simple.html",
-                         "literal.bar_s", "2222",
-                         "literal.id", "lit-def-simple");
-    assertU(commit());
-    assertQ(req("q", "id:lit-def-simple")
-            , "//*[@numFound='1']"
-            , "count(//arr[@name='foo_s']/str)=1"
-            , "//arr[@name='foo_s']/str[.='x']"
-            , "count(//arr[@name='bar_s']/str)=2"
-            , "//arr[@name='bar_s']/str[.='2222']"
-            , "//arr[@name='bar_s']/str[.='y']"
-            , "count(//arr[@name='zot_s']/str)=1"
-            , "//arr[@name='zot_s']/str[.='z']"
-            ); 
-
-    // invariant zot_s can not be changed
-    loadLocalFromHandler("/update/extract/lit-def",
-                         "extraction/simple.html",
-                         "literal.zot_s", "3333",
-                         "literal.id", "lit-def-simple");
-    assertU(commit());
-    assertQ(req("q", "id:lit-def-simple")
-            , "//*[@numFound='1']"
-            , "count(//arr[@name='foo_s']/str)=1"
-            , "//arr[@name='foo_s']/str[.='x']"
-            , "count(//arr[@name='bar_s']/str)=1"
-            , "//arr[@name='bar_s']/str[.='y']"
-            , "count(//arr[@name='zot_s']/str)=1"
-            , "//arr[@name='zot_s']/str[.='z']"
-            ); 
-    
-  }
-
-  @Test
-  public void testPlainTextSpecifyingMimeType() throws Exception {
-    ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
-    assertTrue("handler is null and it shouldn't be", handler != null);
-
-    // Load plain text specifying MIME type:
-    loadLocal("extraction/version_control.txt", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
-            "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
-            "fmap.Author", "extractedAuthor",
-            "literal.id", "one",
-            "fmap.language", "extractedLanguage",
-            "fmap.X-Parsed-By", "ignored_parser",
-            "fmap.content", "extractedContent",
-            ExtractingParams.STREAM_TYPE, "text/plain"
-    );
-    assertQ(req("extractedContent:Apache"), "//*[@numFound='0']");
-    assertU(commit());
-    assertQ(req("extractedContent:Apache"), "//*[@numFound='1']");
-  }
-
-  @Test
-  public void testPlainTextSpecifyingResourceName() throws Exception {
-    ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
-    assertTrue("handler is null and it shouldn't be", handler != null);
-
-    // Load plain text specifying filename
-    loadLocal("extraction/version_control.txt", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
-            "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
-            "fmap.Author", "extractedAuthor",
-            "literal.id", "one",
-            "fmap.language", "extractedLanguage",
-            "fmap.X-Parsed-By", "ignored_parser",
-            "fmap.content", "extractedContent",
-            ExtractingParams.RESOURCE_NAME, "extraction/version_control.txt"
-    );
-    assertQ(req("extractedContent:Apache"), "//*[@numFound='0']");
-    assertU(commit());
-    assertQ(req("extractedContent:Apache"), "//*[@numFound='1']");
-  }
-
-  @Test
-  public void testCommitWithin() throws Exception {
-    ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
-    assertTrue("handler is null and it shouldn't be", handler != null);
-    
-    SolrQueryRequest req = req("literal.id", "one",
-                               ExtractingParams.RESOURCE_NAME, "extraction/version_control.txt",
-                               "commitWithin", "200"
-                               );
-    SolrQueryResponse rsp = new SolrQueryResponse();
-    BufferingRequestProcessor p = new BufferingRequestProcessor(null);
-
-    ExtractingDocumentLoader loader = (ExtractingDocumentLoader) handler.newLoader(req, p);
-    loader.load(req, rsp, new ContentStreamBase.FileStream(getFile("extraction/version_control.txt")),p);
-
-    AddUpdateCommand add = p.addCommands.get(0);
-    assertEquals(200, add.commitWithin);
-
-    req.close();
-  }
-
-  // Note: If you load a plain text file specifying neither MIME type nor filename, extraction will silently fail. This is because Tika's
-  // automatic MIME type detection will fail, and it will default to using an empty-string-returning default parser
-
-  @Test
-  public void testExtractOnly() throws Exception {
-    ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
-    assertTrue("handler is null and it shouldn't be", handler != null);
-    SolrQueryResponse rsp = loadLocal("extraction/solr-word.pdf", ExtractingParams.EXTRACT_ONLY, "true");
-    assertTrue("rsp is null and it shouldn't be", rsp != null);
-    NamedList list = rsp.getValues();
-
-    String extraction = (String) list.get("solr-word.pdf");
-    assertTrue("extraction is null and it shouldn't be", extraction != null);
-    assertTrue(extraction + " does not contain " + "solr-word", extraction.indexOf("solr-word") != -1);
-
-    NamedList nl = (NamedList) list.get("solr-word.pdf_metadata");
-    assertTrue("nl is null and it shouldn't be", nl != null);
-    Object title = nl.get("title");
-    assertTrue("title is null and it shouldn't be", title != null);
-    assertTrue(extraction.indexOf("<?xml") != -1);
-
-    rsp = loadLocal("extraction/solr-word.pdf", ExtractingParams.EXTRACT_ONLY, "true",
-            ExtractingParams.EXTRACT_FORMAT, ExtractingDocumentLoader.TEXT_FORMAT);
-    assertTrue("rsp is null and it shouldn't be", rsp != null);
-    list = rsp.getValues();
-
-    extraction = (String) list.get("solr-word.pdf");
-    assertTrue("extraction is null and it shouldn't be", extraction != null);
-    assertTrue(extraction + " does not contain " + "solr-word", extraction.indexOf("solr-word") != -1);
-    assertTrue(extraction.indexOf("<?xml") == -1);
-
-    nl = (NamedList) list.get("solr-word.pdf_metadata");
-    assertTrue("nl is null and it shouldn't be", nl != null);
-    title = nl.get("title");
-    assertTrue("title is null and it shouldn't be", title != null);
-
-
-
-  }
-
-  @Test
-  public void testXPath() throws Exception {
-    ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
-    assertTrue("handler is null and it shouldn't be", handler != null);
-    SolrQueryResponse rsp = loadLocal("extraction/example.html",
-            ExtractingParams.XPATH_EXPRESSION, "/xhtml:html/xhtml:body/xhtml:a/descendant::node()",
-            ExtractingParams.EXTRACT_ONLY, "true"
-    );
-    assertTrue("rsp is null and it shouldn't be", rsp != null);
-    NamedList list = rsp.getValues();
-    String val = (String) list.get("example.html");
-    assertEquals("News", val.trim()); //there is only one matching <a> tag
-
-    loadLocal("extraction/example.html",
-        "literal.id", "example1",
-        "captureAttr", "true",
-        "defaultField", "text",
-        "capture", "div",
-        "fmap.div", "foo_t",
-        "boost.foo_t", "3",
-        "xpath", "/xhtml:html/xhtml:body/xhtml:div//node()",
-        "commit", "true"
-    );
-    assertQ(req("+id:example1 +foo_t:\"here is some text in a div\""), "//*[@numFound='1']");
-  }
-
-  /** test arabic PDF extraction is functional */
-  @Test
-  public void testArabicPDF() throws Exception {
-    ExtractingRequestHandler handler = (ExtractingRequestHandler) 
-      h.getCore().getRequestHandler("/update/extract");
-    assertTrue("handler is null and it shouldn't be", handler != null);
-
-    loadLocal("extraction/arabic.pdf", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
-        "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
-        "fmap.Creation-Date", "extractedDate",
-        "fmap.Author", "extractedAuthor",
-        "uprefix", "ignored_",
-        "fmap.content", "wdf_nocase",
-       "literal.id", "one",
-        "fmap.Last-Modified", "extractedDate");
-    assertQ(req("wdf_nocase:السلم"), "//result[@numFound=0]");
-    assertU(commit());
-    assertQ(req("wdf_nocase:السلم"), "//result[@numFound=1]");
-  }
-
-  @Test
-  public void testTikaExceptionHandling() throws Exception {
-    ExtractingRequestHandler handler = (ExtractingRequestHandler) 
-      h.getCore().getRequestHandler("/update/extract");
-    assertTrue("handler is null and it shouldn't be", handler != null);
-
-    try{
-      loadLocal("extraction/password-is-solrcell.docx",
-          "literal.id", "one");
-      fail("TikaException is expected because of trying to extract text from password protected word file without supplying a password.");
-    }
-    catch(Exception expected){}
-    assertU(commit());
-    assertQ(req("*:*"), "//result[@numFound=0]");
-
-    try{
-      loadLocal("extraction/password-is-solrcell.docx", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
-          "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
-          "fmap.Creation-Date", "extractedDate",
-          "uprefix", "ignored_",
-          "fmap.Author", "extractedAuthor",
-          "fmap.content", "wdf_nocase",
-          "literal.id", "one",
-          "ignoreTikaException", "true",  // set ignore flag
-          "fmap.Last-Modified", "extractedDate");
-    }
-    catch(Exception e){
-      fail("TikaException should be ignored.");
-    }
-    assertU(commit());
-    assertQ(req("*:*"), "//result[@numFound=1]");
-  }
-  
-  @Test
-  public void testWrongStreamType() throws Exception {
-    ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
-    assertTrue("handler is null and it shouldn't be", handler != null);
-
-    try{
-      // Load plain text specifying another mime type, should fail
-      loadLocal("extraction/version_control.txt", 
-              "literal.id", "one",
-              ExtractingParams.STREAM_TYPE, "application/pdf"
-      );
-      fail("SolrException is expected because wrong parser specified for the file type");
-    }
-    catch(Exception expected){}
-
-    try{
-      // Load plain text specifying non existing mimetype, should fail
-      loadLocal("extraction/version_control.txt", 
-              "literal.id", "one",
-              ExtractingParams.STREAM_TYPE, "foo/bar"
-      );
-      fail("SolrException is expected because a nonexistent parser was specified");
-    }
-    catch(Exception expected){}
-  }
-
-  public void testLiteralsOverride() throws Exception {
-    ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
-    assertTrue("handler is null and it shouldn't be", handler != null);
- 
-    assertQ(req("*:*"), "//*[@numFound='0']");
-
-    // Here Tika should parse out a title for this document:
-    loadLocal("extraction/solr-word.pdf", 
-            "fmap.created", "extractedDate", 
-            "fmap.producer", "extractedProducer",
-            "fmap.creator", "extractedCreator", 
-            "fmap.Keywords", "extractedKeywords",
-            "fmap.Author", "extractedAuthor",
-            "literal.id", "three",
-            "fmap.content", "extractedContent",
-            "fmap.language", "extractedLanguage",
-            "fmap.Creation-Date", "extractedDate",
-            "uprefix", "ignored_",
-            "fmap.Last-Modified", "extractedDate");
-
-    // Here the literal value should override the Tika-parsed title:
-    loadLocal("extraction/solr-word.pdf",
-            "literal.title", "wolf-man",
-            "fmap.created", "extractedDate",
-            "fmap.producer", "extractedProducer",
-            "fmap.creator", "extractedCreator",
-            "fmap.Keywords", "extractedKeywords",
-            "fmap.Author", "extractedAuthor",
-            "literal.id", "four",
-            "fmap.content", "extractedContent",
-            "fmap.language", "extractedLanguage",
-            "fmap.Creation-Date", "extractedDate",
-            "uprefix", "ignored_",
-            "fmap.Last-Modified", "extractedDate");
-
-    // Here we mimic the old behaviour where literals are added, not overridden
-    loadLocal("extraction/solr-word.pdf",
-            "literalsOverride", "false",
-            // Trick - we first map the metadata-title to an ignored field before we replace with literal title
-            "fmap.title", "ignored_a",
-            "literal.title", "old-behaviour",
-            "literal.extractedKeywords", "literalkeyword",
-            "fmap.created", "extractedDate",
-            "fmap.producer", "extractedProducer",
-            "fmap.creator", "extractedCreator",
-            "fmap.Keywords", "extractedKeywords",
-            "fmap.Author", "extractedAuthor",
-            "literal.id", "five",
-            "fmap.content", "extractedContent",
-            "fmap.language", "extractedLanguage",
-            "fmap.Creation-Date", "extractedDate",
-            "uprefix", "ignored_",
-            "fmap.Last-Modified", "extractedDate");
-
-    assertU(commit());
-
-    assertQ(req("title:solr-word"), "//*[@numFound='1']");
-    assertQ(req("title:wolf-man"), "//*[@numFound='1']");
-    assertQ(req("extractedKeywords:(solr AND word AND pdf AND literalkeyword)"), "//*[@numFound='1']");
-  }
-
-  @Test
-  public void testPdfWithImages() throws Exception {
-    //Tests possibility to configure ParseContext (by example to extract embedded images from pdf)
-    loadLocal("extraction/pdf-with-image.pdf",
-        "fmap.created", "extractedDate",
-        "fmap.producer", "extractedProducer",
-        "fmap.creator", "extractedCreator",
-        "fmap.Keywords", "extractedKeywords",
-        "fmap.Creation-Date", "extractedDate",
-        "uprefix", "ignored_",
-        "fmap.Author", "extractedAuthor",
-        "fmap.content", "wdf_nocase",
-        "literal.id", "pdfWithImage",
-        "resource.name", "pdf-with-image.pdf",
-        "resource.password", "solrRules",
-        "fmap.Last-Modified", "extractedDate");
-
-    assertQ(req("wdf_nocase:\"embedded:image0.jpg\""), "//*[@numFound='0']");
-    assertU(commit());
-    assertQ(req("wdf_nocase:\"embedded:image0.jpg\""), "//*[@numFound='1']");
-  }
-
-  @Test
-  public void testPasswordProtected() throws Exception {
-    // PDF, Passwords from resource.password
-    loadLocal("extraction/encrypted-password-is-solrRules.pdf",
-        "fmap.created", "extractedDate", 
-        "fmap.producer", "extractedProducer",
-        "fmap.creator", "extractedCreator", 
-        "fmap.Keywords", "extractedKeywords",
-        "fmap.Creation-Date", "extractedDate",
-        "uprefix", "ignored_",
-        "fmap.Author", "extractedAuthor",
-        "fmap.content", "wdf_nocase",
-        "literal.id", "pdfpwliteral",
-        "resource.name", "encrypted-password-is-solrRules.pdf",
-        "resource.password", "solrRules",
-        "fmap.Last-Modified", "extractedDate");
-
-    // PDF, Passwords from passwords property file
-    loadLocal("extraction/encrypted-password-is-solrRules.pdf",
-        "fmap.created", "extractedDate", 
-        "fmap.producer", "extractedProducer",
-        "fmap.creator", "extractedCreator", 
-        "fmap.Keywords", "extractedKeywords",
-        "fmap.Creation-Date", "extractedDate",
-        "uprefix", "ignored_",
-        "fmap.Author", "extractedAuthor",
-        "fmap.content", "wdf_nocase",
-        "literal.id", "pdfpwfile",
-        "resource.name", "encrypted-password-is-solrRules.pdf",
-        "passwordsFile", "passwordRegex.properties", // Passwords-file
-        "fmap.Last-Modified", "extractedDate");
-
-    // DOCX, Explicit password
-    loadLocal("extraction/password-is-Word2010.docx", 
-        "fmap.created", "extractedDate", 
-        "fmap.producer", "extractedProducer",
-        "fmap.creator", "extractedCreator", 
-        "fmap.Keywords", "extractedKeywords",
-        "fmap.Creation-Date", "extractedDate",
-        "fmap.Author", "extractedAuthor",
-        "fmap.content", "wdf_nocase",
-        "uprefix", "ignored_",
-        "literal.id", "docxpwliteral",
-        "resource.name", "password-is-Word2010.docx",
-        "resource.password", "Word2010", // Explicit password
-        "fmap.Last-Modified", "extractedDate");
-
-    // DOCX, Passwords from file
-    loadLocal("extraction/password-is-Word2010.docx", 
-        "fmap.created", "extractedDate",
-        "fmap.producer", "extractedProducer",
-        "fmap.creator", "extractedCreator", 
-        "fmap.Keywords", "extractedKeywords",
-        "fmap.Creation-Date", "extractedDate",
-        "uprefix", "ignored_",
-        "fmap.Author", "extractedAuthor",
-        "fmap.content", "wdf_nocase",
-        "literal.id", "docxpwfile",
-        "resource.name", "password-is-Word2010.docx",
-        "passwordsFile", "passwordRegex.properties", // Passwords-file
-        "fmap.Last-Modified", "extractedDate");
-    
-    assertU(commit());
-    Thread.sleep(100);
-    assertQ(req("wdf_nocase:\"This is a test of PDF\""), "//*[@numFound='2']");
-    assertQ(req("wdf_nocase:\"Test password protected word doc\""), "//*[@numFound='2']");
-  }
-  
-  SolrQueryResponse loadLocalFromHandler(String handler, String filename, 
-                                         String... args) throws Exception {
-                              
-    LocalSolrQueryRequest req = (LocalSolrQueryRequest) req(args);
-    try {
-      // TODO: stop using locally defined streams once stream.file and
-      // stream.body work everywhere
-      List<ContentStream> cs = new ArrayList<>();
-      cs.add(new ContentStreamBase.FileStream(getFile(filename)));
-      req.setContentStreams(cs);
-      return h.queryAndResponse(handler, req);
-    } finally {
-      req.close();
-    }
-  }
-
-  SolrQueryResponse loadLocal(String filename, String... args) throws Exception {
-    return loadLocalFromHandler("/update/extract", filename, args);
-  }
-
-
-}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ParseContextConfigTest.java
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ParseContextConfigTest.java b/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ParseContextConfigTest.java
deleted file mode 100644
index 8aeeaad..0000000
--- a/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ParseContextConfigTest.java
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.solr.handler.extraction;
-
-import javax.xml.parsers.DocumentBuilderFactory;
-import java.nio.file.Paths;
-
-import org.apache.solr.SolrTestCaseJ4;
-import org.apache.solr.core.SolrResourceLoader;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.pdf.PDFParserConfig;
-import org.w3c.dom.Document;
-import org.w3c.dom.Element;
-
-public class ParseContextConfigTest extends SolrTestCaseJ4 {
-
-  public void  testAll() throws Exception {
-    Document document = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
-    Element entries = document.createElement("entries");
-    Element entry = document.createElement("entry");
-
-
-    entry.setAttribute("class", "org.apache.tika.parser.pdf.PDFParserConfig");
-    entry.setAttribute("impl", "org.apache.tika.parser.pdf.PDFParserConfig");
-
-    Element property = document.createElement("property");
-
-    property.setAttribute("name", "extractInlineImages");
-    property.setAttribute("value", "true");
-    entry.appendChild(property);
-    entries.appendChild(entry);
-
-    ParseContext parseContext = new ParseContextConfig(new SolrResourceLoader(Paths.get(".")), entries).create();
-
-    PDFParserConfig pdfParserConfig = parseContext.get(PDFParserConfig.class);
-
-    assertEquals(true, pdfParserConfig.getExtractInlineImages());
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/TestXLSXResponseWriter.java
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/TestXLSXResponseWriter.java b/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/TestXLSXResponseWriter.java
deleted file mode 100644
index 7d37844..0000000
--- a/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/TestXLSXResponseWriter.java
+++ /dev/null
@@ -1,257 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.solr.handler.extraction;
-
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.time.Instant;
-import java.util.Date;
-
-import org.apache.poi.ss.usermodel.Cell;
-import org.apache.poi.ss.usermodel.Row;
-import org.apache.poi.xssf.usermodel.XSSFWorkbook;
-import org.apache.poi.xssf.usermodel.XSSFSheet;
-
-import org.apache.solr.SolrTestCaseJ4;
-import org.apache.solr.common.SolrDocument;
-import org.apache.solr.common.SolrDocumentList;
-import org.apache.solr.core.SolrCore;
-import org.apache.solr.request.SolrQueryRequest;
-import org.apache.solr.response.QueryResponseWriter;
-import org.apache.solr.response.SolrQueryResponse;
-import org.apache.solr.response.RawResponseWriter;
-import org.apache.solr.search.SolrReturnFields;
-import org.junit.AfterClass;
-import org.junit.BeforeClass;
-import org.junit.Test;
-
-public class TestXLSXResponseWriter extends SolrTestCaseJ4 {
-
-  private static XLSXResponseWriter writerXlsx;
-
-  @BeforeClass
-  public static void beforeClass() throws Exception {
-    System.setProperty("enable.update.log", "false");
-    initCore("solrconfig.xml","schema.xml",getFile("extraction/solr").getAbsolutePath());
-    createIndex();
-    //find a reference to the default response writer so we can redirect its output later
-    SolrCore testCore = h.getCore();
-    QueryResponseWriter writer = testCore.getQueryResponseWriter("xlsx");
-    if (writer instanceof XLSXResponseWriter) {
-      writerXlsx = (XLSXResponseWriter) testCore.getQueryResponseWriter("xlsx");
-    } else {
-      throw new Exception("XLSXResponseWriter not registered with solr core");
-    }
-  }
-
-  public static void createIndex() {
-    assertU(adoc("id","1", "foo_i","-1", "foo_s","hi", "foo_l","12345678987654321", "foo_b","false", "foo_f","1.414","foo_d","-1.0E300","foo_dt1","2000-01-02T03:04:05Z"));
-    assertU(adoc("id","2", "v_ss","hi",  "v_ss","there", "v2_ss","nice", "v2_ss","output", "shouldbeunstored","foo"));
-    assertU(adoc("id","3", "shouldbeunstored","foo"));
-    assertU(adoc("id","4", "foo_s1","foo"));
-    assertU(commit());
-  }
-
-  @AfterClass
-  public static void cleanupWriter() throws Exception {
-    writerXlsx = null;
-  }
-
-  @Test
-  public void testStructuredDataViaBaseWriters() throws IOException, Exception {
-    SolrQueryResponse rsp = new SolrQueryResponse();
-    // Don't send a ContentStream back, this will fall back to the configured base writer.
-    // But abuse the CONTENT key to ensure writer is also checking type
-    rsp.add(RawResponseWriter.CONTENT, "test");
-    rsp.add("foo", "bar");
-
-    SolrQueryRequest r = req();
-
-    // check Content-Type
-    assertEquals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", writerXlsx.getContentType(r, rsp));
-
-    // test our basic types,and that fields come back in the requested order
-    XSSFSheet resultSheet = getWSResultForQuery(req("q","id:1", "wt","xlsx", "fl","id,foo_s,foo_i,foo_l,foo_b,foo_f,foo_d,foo_dt1"));
-
-    assertEquals("id,foo_s,foo_i,foo_l,foo_b,foo_f,foo_d,foo_dt1\n1,hi,-1,12345678987654321,F,1.414,-1.0E300,2000-01-02T03:04:05Z\n"
-        , getStringFromSheet(resultSheet));
-
-    resultSheet = getWSResultForQuery(req("q","id:1^0", "wt","xlsx", "fl","id,score,foo_s"));
-    // test retrieving score
-    assertEquals("id,score,foo_s\n1,0.0,hi\n", getStringFromSheet(resultSheet));
-
-    resultSheet = getWSResultForQuery(req("q","id:1^0", "wt","xlsx", "colname.id", "I.D.", "colwidth.id", "10",
-                                      "fl","id,score,foo_s"));
-    // test override colname/width
-    assertEquals("I.D.,score,foo_s\n1,0.0,hi\n", getStringFromSheet(resultSheet));
-    // test colwidth (value returned is in 256ths of a character as per excel standard)
-    assertEquals(10*256, resultSheet.getColumnWidth(0));
-
-    resultSheet = getWSResultForQuery(req("q","id:2", "wt","xlsx", "fl","id,v_ss"));
-    // test multivalued
-    assertEquals("id,v_ss\n2,hi; there\n", getStringFromSheet(resultSheet));
-
-    // test retrieving fields from index
-    resultSheet = getWSResultForQuery(req("q","*:*", "wt","xslx", "fl","*,score"));
-    String result = getStringFromSheet(resultSheet);
-    for (String field : "id,foo_s,foo_i,foo_l,foo_b,foo_f,foo_d,foo_dt1,v_ss,v2_ss,score".split(",")) {
-      assertTrue(result.indexOf(field) >= 0);
-    }
-
-    // test null values
-    resultSheet = getWSResultForQuery(req("q","id:2", "wt","xlsx", "fl","id,foo_s,v_ss"));
-    assertEquals("id,foo_s,v_ss\n2,,hi; there\n", getStringFromSheet(resultSheet));
-
-    // now test SolrDocumentList
-    SolrDocument d = new SolrDocument();
-    SolrDocument d1 = d;
-    d.addField("id","1");
-    d.addField("foo_i",-1);
-    d.addField("foo_s","hi");
-    d.addField("foo_l","12345678987654321L");
-    d.addField("foo_b",false);
-    d.addField("foo_f",1.414f);
-    d.addField("foo_d",-1.0E300);
-    d.addField("foo_dt1", new Date(Instant.parse("2000-01-02T03:04:05Z").toEpochMilli()));
-    d.addField("score", "2.718");
-
-    d = new SolrDocument();
-    SolrDocument d2 = d;
-    d.addField("id","2");
-    d.addField("v_ss","hi");
-    d.addField("v_ss","there");
-    d.addField("v2_ss","nice");
-    d.addField("v2_ss","output");
-    d.addField("score", "89.83");
-    d.addField("shouldbeunstored","foo");
-
-    SolrDocumentList sdl = new SolrDocumentList();
-    sdl.add(d1);
-    sdl.add(d2);
-    
-    SolrQueryRequest req = req("q","*:*");
-    rsp = new SolrQueryResponse();
-    rsp.addResponse(sdl);
-
-    rsp.setReturnFields( new SolrReturnFields("id,foo_s", req) );
-
-    resultSheet = getWSResultForQuery(req, rsp);
-    assertEquals("id,foo_s\n1,hi\n2,\n", getStringFromSheet(resultSheet));
-
-    // try scores
-    rsp.setReturnFields( new SolrReturnFields("id,score,foo_s", req) );
-
-    resultSheet = getWSResultForQuery(req, rsp);
-    assertEquals("id,score,foo_s\n1,2.718,hi\n2,89.83,\n", getStringFromSheet(resultSheet));
-
-    // get field values from docs... should be ordered and not include score unless requested
-    rsp.setReturnFields( new SolrReturnFields("*", req) );
-
-    resultSheet = getWSResultForQuery(req, rsp);
-    assertEquals("id,foo_i,foo_s,foo_l,foo_b,foo_f,foo_d,foo_dt1,v_ss,v2_ss\n" +
-        "1,-1,hi,12345678987654321L,false,1.414,-1.0E300,2000-01-02T03:04:05Z,,\n" +
-        "2,,,,,,,,hi; there,nice; output\n", getStringFromSheet(resultSheet));
-
-    // get field values and scores - just check that the scores are there... we don't guarantee where
-    rsp.setReturnFields( new SolrReturnFields("*,score", req) );
-    resultSheet = getWSResultForQuery(req, rsp);
-    String s = getStringFromSheet(resultSheet);
-    assertTrue(s.indexOf("score") >=0 && s.indexOf("2.718") > 0 && s.indexOf("89.83") > 0 );
-    
-    // Test field globs
-    rsp.setReturnFields( new SolrReturnFields("id,foo*", req) );
-    resultSheet = getWSResultForQuery(req, rsp);
-    assertEquals("id,foo_i,foo_s,foo_l,foo_b,foo_f,foo_d,foo_dt1\n" +
-        "1,-1,hi,12345678987654321L,false,1.414,-1.0E300,2000-01-02T03:04:05Z\n" +
-        "2,,,,,,,\n", getStringFromSheet(resultSheet));
-
-    rsp.setReturnFields( new SolrReturnFields("id,*_d*", req) );
-    resultSheet = getWSResultForQuery(req, rsp);
-    assertEquals("id,foo_d,foo_dt1\n" +
-        "1,-1.0E300,2000-01-02T03:04:05Z\n" +
-        "2,,\n", getStringFromSheet(resultSheet));
-
-    // Test function queries
-    rsp.setReturnFields( new SolrReturnFields("sum(1,1),id,exists(foo_s1),div(9,1),foo_f", req) );
-    resultSheet = getWSResultForQuery(req, rsp);
-    assertEquals("sum(1,1),id,exists(foo_s1),div(9,1),foo_f\n" +
-        ",1,,,1.414\n" +
-        ",2,,,\n", getStringFromSheet(resultSheet));
-
-    // Test transformers
-    rsp.setReturnFields( new SolrReturnFields("mydocid:[docid],[explain]", req) );
-    resultSheet = getWSResultForQuery(req, rsp);
-    assertEquals("mydocid,[explain]\n" +
-        ",\n" +
-        ",\n", getStringFromSheet(resultSheet));
-
-    req.close();
-  }
-  
-
-  @Test
-  public void testPseudoFields() throws Exception {
-    // Use Pseudo Field
-    SolrQueryRequest req = req("q","id:1", "wt","xlsx", "fl","XXX:id,foo_s");
-    XSSFSheet resultSheet = getWSResultForQuery(req);
-    assertEquals("XXX,foo_s\n1,hi\n", getStringFromSheet(resultSheet));
-    
-    String txt = getStringFromSheet(getWSResultForQuery(req("q","id:1", "wt","xlsx", "fl","XXX:id,YYY:[docid],FOO:foo_s")));
-    String[] lines = txt.split("\n");
-    assertEquals(2, lines.length);
-    assertEquals("XXX,YYY,FOO", lines[0] );
-    assertEquals("1,0,hi", lines[1] );
-
-    //assertions specific to multiple pseudofields functions like abs, div, exists, etc.. (SOLR-5423)
-    String funcText = getStringFromSheet(getWSResultForQuery(req("df", "text", "q","*", "wt","xlsx", "fl","XXX:id,YYY:exists(foo_s1)")));
-    String[] funcLines = funcText.split("\n");
-    assertEquals(5, funcLines.length);
-    assertEquals("XXX,YYY", funcLines[0] );
-    assertEquals("1,false", funcLines[1] );
-    assertEquals("3,false", funcLines[3] );
-  }
-
-  // returns first worksheet as XLSXResponseWriter only returns one sheet
-  private XSSFSheet getWSResultForQuery(SolrQueryRequest req) throws IOException, Exception {
-    SolrQueryResponse rsp = h.queryAndResponse("", req);
-    return getWSResultForQuery(req, rsp);
-  }
-
-  private XSSFSheet getWSResultForQuery(SolrQueryRequest req, SolrQueryResponse rsp) throws IOException, Exception {
-    ByteArrayOutputStream xmlBout = new ByteArrayOutputStream();
-    writerXlsx.write(xmlBout, req, rsp);
-    XSSFWorkbook output = new XSSFWorkbook(new ByteArrayInputStream(xmlBout.toByteArray()));
-    XSSFSheet sheet = output.getSheetAt(0);
-    req.close();
-    output.close();
-    return sheet;
-  }
-
-  private String getStringFromSheet(XSSFSheet sheet) {
-    StringBuilder output = new StringBuilder();
-    for (Row row: sheet) {
-      for (Cell cell: row) {
-        output.append(cell.getStringCellValue());
-        output.append(",");
-      }
-      output.setLength(output.length() - 1);
-      output.append("\n");
-    }
-    return output.toString();
-  }
-}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test/resources/extraction/arabic.pdf
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test/resources/extraction/arabic.pdf b/solr/contrib/extraction/src/test/resources/extraction/arabic.pdf
new file mode 100644
index 0000000..3d47b99
Binary files /dev/null and b/solr/contrib/extraction/src/test/resources/extraction/arabic.pdf differ

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test/resources/extraction/encrypted-password-is-solrRules.pdf
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test/resources/extraction/encrypted-password-is-solrRules.pdf b/solr/contrib/extraction/src/test/resources/extraction/encrypted-password-is-solrRules.pdf
new file mode 100644
index 0000000..300a476
Binary files /dev/null and b/solr/contrib/extraction/src/test/resources/extraction/encrypted-password-is-solrRules.pdf differ

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test/resources/extraction/example.html
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test/resources/extraction/example.html b/solr/contrib/extraction/src/test/resources/extraction/example.html
new file mode 100644
index 0000000..5732f62
--- /dev/null
+++ b/solr/contrib/extraction/src/test/resources/extraction/example.html
@@ -0,0 +1,49 @@
+<html>
+<head>
+  <title>Welcome to Solr</title>
+</head>
+<body>
+<p>
+  Here is some text
+</p>
+<div>Here is some text in a div</div>
+<div>This has a <a href="http://www.apache.org">link</a>.</div>
+<a href="#news">News</a>
+<ul class="minitoc">
+<li>
+<a href="#03+October+2008+-+Solr+Logo+Contest">03 October 2008 - Solr Logo Contest</a>
+</li>
+<li>
+<a href="#15+September+2008+-+Solr+1.3.0+Available">15 September 2008 - Solr 1.3.0 Available</a>
+</li>
+<li>
+<a href="#28+August+2008+-+Lucene%2FSolr+at+ApacheCon+New+Orleans">28 August 2008 - Lucene/Solr at ApacheCon New Orleans</a>
+</li>
+<li>
+<a href="#03+September+2007+-+Lucene+at+ApacheCon+Atlanta">03 September 2007 - Lucene at ApacheCon Atlanta</a>
+</li>
+<li>
+<a href="#06+June+2007%3A+Release+1.2+available">06 June 2007: Release 1.2 available</a>
+</li>
+<li>
+<a href="#17+January+2007%3A+Solr+graduates+from+Incubator">17 January 2007: Solr graduates from Incubator</a>
+</li>
+<li>
+<a href="#22+December+2006%3A+Release+1.1.0+available">22 December 2006: Release 1.1.0 available</a>
+</li>
+<li>
+<a href="#15+August+2006%3A+Solr+at+ApacheCon+US">15 August 2006: Solr at ApacheCon US</a>
+</li>
+<li>
+<a href="#21+April+2006%3A+Solr+at+ApacheCon">21 April 2006: Solr at ApacheCon</a>
+</li>
+<li>
+<a href="#21+February+2006%3A+nightly+builds">21 February 2006: nightly builds</a>
+</li>
+<li>
+<a href="#17+January+2006%3A+Solr+Joins+Apache+Incubator">17 January 2006: Solr Joins Apache Incubator</a>
+</li>
+</ul>
+
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test/resources/extraction/open-document.odt
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test/resources/extraction/open-document.odt b/solr/contrib/extraction/src/test/resources/extraction/open-document.odt
new file mode 100644
index 0000000..57f4369
Binary files /dev/null and b/solr/contrib/extraction/src/test/resources/extraction/open-document.odt differ

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test/resources/extraction/password-is-Word2010.docx
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test/resources/extraction/password-is-Word2010.docx b/solr/contrib/extraction/src/test/resources/extraction/password-is-Word2010.docx
new file mode 100644
index 0000000..24010d3
Binary files /dev/null and b/solr/contrib/extraction/src/test/resources/extraction/password-is-Word2010.docx differ

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test/resources/extraction/password-is-solrcell.docx
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test/resources/extraction/password-is-solrcell.docx b/solr/contrib/extraction/src/test/resources/extraction/password-is-solrcell.docx
new file mode 100644
index 0000000..2723d56
Binary files /dev/null and b/solr/contrib/extraction/src/test/resources/extraction/password-is-solrcell.docx differ

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test/resources/extraction/pdf-with-image.pdf
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test/resources/extraction/pdf-with-image.pdf b/solr/contrib/extraction/src/test/resources/extraction/pdf-with-image.pdf
new file mode 100644
index 0000000..b168951
Binary files /dev/null and b/solr/contrib/extraction/src/test/resources/extraction/pdf-with-image.pdf differ

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test/resources/extraction/simple.html
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test/resources/extraction/simple.html b/solr/contrib/extraction/src/test/resources/extraction/simple.html
new file mode 100644
index 0000000..3c807fb
--- /dev/null
+++ b/solr/contrib/extraction/src/test/resources/extraction/simple.html
@@ -0,0 +1,19 @@
+<html>
+<head>
+  <title>Welcome to Solr</title>
+  <style type="text/css">
+    body { font-family: serif; }
+  </style>
+</head>
+<body>
+<p>
+  Here is some text
+</p>
+<p>distinct<br/>words</p>
+<div>Here is some text in a div</div>
+<div>This has a <a href="http://www.apache.org">link</a>.</div>
+</body>
+<script>
+  document.getElementById("div").blur();
+</script>
+</html>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test/resources/extraction/solr-word.pdf
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test/resources/extraction/solr-word.pdf b/solr/contrib/extraction/src/test/resources/extraction/solr-word.pdf
new file mode 100644
index 0000000..bd8b865
Binary files /dev/null and b/solr/contrib/extraction/src/test/resources/extraction/solr-word.pdf differ

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test/resources/extraction/solr/collection1/conf/parseContext.xml
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test/resources/extraction/solr/collection1/conf/parseContext.xml b/solr/contrib/extraction/src/test/resources/extraction/solr/collection1/conf/parseContext.xml
new file mode 100644
index 0000000..574c808
--- /dev/null
+++ b/solr/contrib/extraction/src/test/resources/extraction/solr/collection1/conf/parseContext.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<entries>
+  <entry class="org.apache.tika.parser.pdf.PDFParserConfig" impl="org.apache.tika.parser.pdf.PDFParserConfig">
+    <property name="extractInlineImages" value="true"/>
+  </entry>
+</entries>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test/resources/extraction/solr/collection1/conf/passwordRegex.properties
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test/resources/extraction/solr/collection1/conf/passwordRegex.properties b/solr/contrib/extraction/src/test/resources/extraction/solr/collection1/conf/passwordRegex.properties
new file mode 100644
index 0000000..8a4eff1
--- /dev/null
+++ b/solr/contrib/extraction/src/test/resources/extraction/solr/collection1/conf/passwordRegex.properties
@@ -0,0 +1,7 @@
+# Filename regex -> password map
+# Example any file ending in .doc should use password foobar:
+#  .*\.doc = fooBar
+#
+# Note: Apache Tika 1.1 supports password for .pdf and .docx only, not .doc or other formats 
+.*\.pdf$ = solrRules
+.*\.docx$ = Word2010
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test/resources/extraction/solr/collection1/conf/protwords.txt
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test/resources/extraction/solr/collection1/conf/protwords.txt b/solr/contrib/extraction/src/test/resources/extraction/solr/collection1/conf/protwords.txt
new file mode 100644
index 0000000..7878147
--- /dev/null
+++ b/solr/contrib/extraction/src/test/resources/extraction/solr/collection1/conf/protwords.txt
@@ -0,0 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#use a protected word file to avoid stemming two
+#unrelated words to the same base word.
+#to test, we will use words that would normally obviously be stemmed.
+cats
+ridding

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46fd24bf/solr/contrib/extraction/src/test/resources/extraction/solr/collection1/conf/schema.xml
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test/resources/extraction/solr/collection1/conf/schema.xml b/solr/contrib/extraction/src/test/resources/extraction/solr/collection1/conf/schema.xml
new file mode 100644
index 0000000..475c333
--- /dev/null
+++ b/solr/contrib/extraction/src/test/resources/extraction/solr/collection1/conf/schema.xml
@@ -0,0 +1,484 @@
+<?xml version="1.0" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!-- The Solr schema file. This file should be named "schema.xml" and
+     should be located where the classloader for the Solr webapp can find it.
+
+     This schema is used for testing, and as such has everything and the 
+     kitchen sink thrown in. See example/solr/conf/schema.xml for a 
+     more concise example.
+
+  -->
+
+<schema name="test" version="1.0">
+
+
+  <!--
+    Default numeric field types. For faster range queries, consider the tint/tfloat/tlong/tdouble types.
+  -->
+  <fieldType name="int" class="${solr.tests.IntegerFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="0" positionIncrementGap="0"/>
+  <fieldType name="float" class="${solr.tests.FloatFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="0" positionIncrementGap="0"/>
+  <fieldType name="long" class="${solr.tests.LongFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="0" positionIncrementGap="0"/>
+  <fieldType name="double" class="${solr.tests.DoubleFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="0" positionIncrementGap="0"/>
+
+  <!--
+   Numeric field types that index each value at various levels of precision
+   to accelerate range queries when the number of values between the range
+   endpoints is large. See the javadoc for LegacyNumericRangeQuery for internal
+   implementation details.
+
+   Smaller precisionStep values (specified in bits) will lead to more tokens
+   indexed per value, slightly larger index size, and faster range queries.
+   A precisionStep of 0 disables indexing at different precision levels.
+  -->
+  <fieldType name="tint" class="${solr.tests.IntegerFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="8" positionIncrementGap="0"/>
+  <fieldType name="tfloat" class="${solr.tests.FloatFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="8" positionIncrementGap="0"/>
+  <fieldType name="tlong" class="${solr.tests.LongFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="8" positionIncrementGap="0"/>
+  <fieldType name="tdouble" class="${solr.tests.DoubleFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="8" positionIncrementGap="0"/>
+
+  <!-- Field type demonstrating an Analyzer failure -->
+  <fieldType name="failtype1" class="solr.TextField">
+    <analyzer type="index">
+      <tokenizer class="solr.MockTokenizerFactory"/>
+      <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="0" catenateWords="0"
+              catenateNumbers="0" catenateAll="0"/>
+      <filter class="solr.LowerCaseFilterFactory"/>
+      <filter class="solr.FlattenGraphFilterFactory"/>
+    </analyzer>
+  </fieldType>
+
+  <!-- Demonstrating splitOnCaseChange="0" (case changes do not split tokens) -->
+  <fieldType name="wdf_nocase" class="solr.TextField">
+    <analyzer type="index">
+      <tokenizer class="solr.MockTokenizerFactory"/>
+      <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="0" catenateWords="0"
+              catenateNumbers="0" catenateAll="0" splitOnCaseChange="0" preserveOriginal="0"/>
+      <filter class="solr.LowerCaseFilterFactory"/>
+      <filter class="solr.FlattenGraphFilterFactory"/>
+    </analyzer>
+    <analyzer type="query">
+      <tokenizer class="solr.MockTokenizerFactory"/>
+      <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="0" catenateWords="0"
+              catenateNumbers="0" catenateAll="0" splitOnCaseChange="0" preserveOriginal="0"/>
+      <filter class="solr.LowerCaseFilterFactory"/>
+    </analyzer>
+  </fieldType>
+
+  <fieldType name="wdf_preserve" class="solr.TextField">
+    <analyzer type="index">
+      <tokenizer class="solr.MockTokenizerFactory"/>
+      <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="1" catenateWords="0"
+              catenateNumbers="0" catenateAll="0" splitOnCaseChange="0" preserveOriginal="1"/>
+      <filter class="solr.LowerCaseFilterFactory"/>
+      <filter class="solr.FlattenGraphFilterFactory"/>
+    </analyzer>
+    <analyzer type="query">
+      <tokenizer class="solr.MockTokenizerFactory"/>
+      <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="1" catenateWords="0"
+              catenateNumbers="0" catenateAll="0" splitOnCaseChange="0" preserveOriginal="1"/>
+      <filter class="solr.LowerCaseFilterFactory"/>
+    </analyzer>
+  </fieldType>
+
+
+  <!-- HighlitText optimizes storage for (long) columns which will be highlit -->
+  <fieldType name="highlittext" class="solr.TextField" compressThreshold="345"/>
+
+  <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true"/>
+  <fieldType name="string" class="solr.StrField" sortMissingLast="true"/>
+
+  <!-- format for date is 1995-12-31T23:59:59.999Z and only the fractional
+       seconds part (.999) is optional.
+    -->
+  <fieldType name="date" class="${solr.tests.DateFieldType}" docValues="${solr.tests.numeric.dv}" sortMissingLast="true" omitNorms="true"/>
+
+  <!-- solr.TextField allows the specification of custom
+       text analyzers specified as a tokenizer and a list
+       of token filters.
+    -->
+  <fieldType name="text" class="solr.TextField">
+    <analyzer>
+      <tokenizer class="solr.ClassicTokenizerFactory"/>
+      <filter class="solr.ClassicFilterFactory"/>
+      <filter class="solr.LowerCaseFilterFactory"/>
+      <filter class="solr.StopFilterFactory"/>
+      <filter class="solr.PorterStemFilterFactory"/>
+    </analyzer>
+  </fieldType>
+
+
+  <fieldType name="nametext" class="solr.TextField">
+    <analyzer class="org.apache.lucene.analysis.core.WhitespaceAnalyzer"/>
+  </fieldType>
+
+  <!-- fieldTypes in this section isolate tokenizers and tokenfilters for testing -->
+  <fieldType name="keywordtok" class="solr.TextField">
+    <analyzer>
+      <tokenizer class="solr.MockTokenizerFactory" pattern="keyword"/>
+    </analyzer>
+  </fieldType>
+  <fieldType name="standardtok" class="solr.TextField">
+    <analyzer>
+      <tokenizer class="solr.StandardTokenizerFactory"/>
+    </analyzer>
+  </fieldType>
+  <fieldType name="lettertok" class="solr.TextField">
+    <analyzer>
+      <tokenizer class="solr.LetterTokenizerFactory"/>
+    </analyzer>
+  </fieldType>
+  <fieldType name="whitetok" class="solr.TextField">
+    <analyzer>
+      <tokenizer class="solr.MockTokenizerFactory"/>
+    </analyzer>
+  </fieldType>
+  <fieldType name="HTMLstandardtok" class="solr.TextField">
+    <analyzer>
+      <charFilter class="solr.HTMLStripCharFilterFactory"/>
+      <tokenizer class="solr.StandardTokenizerFactory"/>
+    </analyzer>
+  </fieldType>
+  <fieldType name="HTMLwhitetok" class="solr.TextField">
+    <analyzer>
+      <charFilter class="solr.HTMLStripCharFilterFactory"/>
+      <tokenizer class="solr.MockTokenizerFactory"/>
+    </analyzer>
+  </fieldType>
+  <fieldType name="standardtokfilt" class="solr.TextField">
+    <analyzer>
+      <tokenizer class="solr.ClassicTokenizerFactory"/>
+      <filter class="solr.ClassicFilterFactory"/>
+    </analyzer>
+  </fieldType>
+  <fieldType name="standardfilt" class="solr.TextField">
+    <analyzer>
+      <tokenizer class="solr.MockTokenizerFactory"/>
+      <filter class="solr.ClassicFilterFactory"/>
+    </analyzer>
+  </fieldType>
+  <fieldType name="lowerfilt" class="solr.TextField">
+    <analyzer>
+      <tokenizer class="solr.MockTokenizerFactory"/>
+      <filter class="solr.LowerCaseFilterFactory"/>
+    </analyzer>
+  </fieldType>
+  <fieldType name="patternreplacefilt" class="solr.TextField">
+    <analyzer type="index">
+      <tokenizer class="solr.MockTokenizerFactory" pattern="keyword"/>
+      <filter class="solr.PatternReplaceFilterFactory"
+              pattern="([^a-zA-Z])" replacement="_" replace="all"
+      />
+    </analyzer>
+    <analyzer type="query">
+      <tokenizer class="solr.MockTokenizerFactory" pattern="keyword"/>
+    </analyzer>
+  </fieldType>
+  <fieldType name="porterfilt" class="solr.TextField">
+    <analyzer>
+      <tokenizer class="solr.MockTokenizerFactory"/>
+      <filter class="solr.PorterStemFilterFactory"/>
+    </analyzer>
+  </fieldType>
+  <!-- fieldType name="snowballfilt" class="solr.TextField">
+    <analyzer>
+      <tokenizer class="solr.MockTokenizerFactory"/>
+      <filter class="solr.SnowballPorterFilterFactory"/>
+    </analyzer>
+  </fieldType -->
+  <fieldType name="engporterfilt" class="solr.TextField">
+    <analyzer>
+      <tokenizer class="solr.MockTokenizerFactory"/>
+      <filter class="solr.PorterStemFilterFactory"/>
+    </analyzer>
+  </fieldType>
+  <fieldType name="custengporterfilt" class="solr.TextField">
+    <analyzer>
+      <tokenizer class="solr.MockTokenizerFactory"/>
+      <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+      <filter class="solr.PorterStemFilterFactory"/>
+    </analyzer>
+  </fieldType>
+  <fieldType name="stopfilt" class="solr.TextField">
+    <analyzer>
+      <tokenizer class="solr.MockTokenizerFactory"/>
+      <filter class="solr.StopFilterFactory" ignoreCase="true"/>
+    </analyzer>
+  </fieldType>
+  <fieldType name="custstopfilt" class="solr.TextField">
+    <analyzer>
+      <tokenizer class="solr.MockTokenizerFactory"/>
+      <filter class="solr.StopFilterFactory" words="stopwords.txt"/>
+    </analyzer>
+  </fieldType>
+  <fieldType name="lengthfilt" class="solr.TextField">
+    <analyzer>
+      <tokenizer class="solr.MockTokenizerFactory"/>
+      <filter class="solr.LengthFilterFactory" min="2" max="5"/>
+    </analyzer>
+  </fieldType>
+
+  <fieldType name="subword" class="solr.TextField" multiValued="true" positionIncrementGap="100">
+    <analyzer type="index">
+      <tokenizer class="solr.MockTokenizerFactory"/>
+      <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1"
+              catenateNumbers="1" catenateAll="0"/>
+      <filter class="solr.LowerCaseFilterFactory"/>
+      <filter class="solr.StopFilterFactory"/>
+      <filter class="solr.PorterStemFilterFactory"/>
+      <filter class="solr.FlattenGraphFilterFactory"/>
+    </analyzer>
+    <analyzer type="query">
+      <tokenizer class="solr.MockTokenizerFactory"/>
+      <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0"
+              catenateNumbers="0" catenateAll="0"/>
+      <filter class="solr.LowerCaseFilterFactory"/>
+      <filter class="solr.StopFilterFactory"/>
+      <filter class="solr.PorterStemFilterFactory"/>
+    </analyzer>
+  </fieldType>
+
+  <!-- more flexible in matching skus, but more chance of a false match -->
+  <fieldType name="skutype1" class="solr.TextField">
+    <analyzer type="index">
+      <tokenizer class="solr.MockTokenizerFactory"/>
+      <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1"
+              catenateNumbers="1" catenateAll="0"/>
+      <filter class="solr.LowerCaseFilterFactory"/>
+      <filter class="solr.FlattenGraphFilterFactory"/>
+    </analyzer>
+    <analyzer type="query">
+      <tokenizer class="solr.MockTokenizerFactory"/>
+      <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1"
+              catenateNumbers="1" catenateAll="0"/>
+      <filter class="solr.LowerCaseFilterFactory"/>
+    </analyzer>
+  </fieldType>
+
+  <!-- less flexible in matching skus, but less chance of a false match -->
+  <fieldType name="skutype2" class="solr.TextField">
+    <analyzer type="index">
+      <tokenizer class="solr.MockTokenizerFactory"/>
+      <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1"
+              catenateNumbers="1" catenateAll="0"/>
+      <filter class="solr.LowerCaseFilterFactory"/>
+      <filter class="solr.FlattenGraphFilterFactory"/>
+    </analyzer>
+    <analyzer type="query">
+      <tokenizer class="solr.MockTokenizerFactory"/>
+      <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1"
+              catenateNumbers="1" catenateAll="0"/>
+      <filter class="solr.LowerCaseFilterFactory"/>
+    </analyzer>
+  </fieldType>
+
+  <!-- synonym expansion field type, for testing SynonymGraphFilter -->
+  <fieldType name="syn" class="solr.TextField">
+    <analyzer type="index">
+      <tokenizer class="solr.MockTokenizerFactory"/>
+      <filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt"/>
+      <filter class="solr.FlattenGraphFilterFactory"/>
+    </analyzer>
+    <analyzer type="query">
+      <tokenizer class="solr.MockTokenizerFactory"/>
+      <filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt"/>
+    </analyzer>
+  </fieldType>
+
+  <!-- Demonstrates How RemoveDuplicatesTokenFilter makes stemmed
+       synonyms "better"
+    -->
+  <fieldType name="dedup" class="solr.TextField">
+    <analyzer type="index">
+      <tokenizer class="solr.MockTokenizerFactory"/>
+      <filter class="solr.SynonymGraphFilterFactory"
+              synonyms="synonyms.txt" expand="true"/>
+      <filter class="solr.PorterStemFilterFactory"/>
+      <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
+      <filter class="solr.FlattenGraphFilterFactory"/>
+    </analyzer>
+    <analyzer type="query">
+      <tokenizer class="solr.MockTokenizerFactory"/>
+      <filter class="solr.SynonymGraphFilterFactory"
+              synonyms="synonyms.txt" expand="true"/>
+      <filter class="solr.PorterStemFilterFactory"/>
+      <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
+    </analyzer>
+  </fieldType>
+
+  <fieldType name="unstored" class="solr.StrField" indexed="true" stored="false"/>
+
+
+  <fieldType name="textgap" class="solr.TextField" multiValued="true" positionIncrementGap="100">
+    <analyzer>
+      <tokenizer class="solr.MockTokenizerFactory"/>
+      <filter class="solr.LowerCaseFilterFactory"/>
+    </analyzer>
+  </fieldType>
+
+
+  <field name="id" type="string" indexed="true" stored="true" multiValued="false" required="false"/>
+  <field name="name" type="nametext" indexed="true" stored="true"/>
+  <field name="text" type="text" indexed="true" stored="false"/>
+  <field name="subject" type="text" indexed="true" stored="true"/>
+  <field name="title" type="nametext" indexed="true" stored="true"/>
+  <field name="weight" type="float" indexed="true" stored="true"/>
+  <field name="bday" type="date" indexed="true" stored="true"/>
+
+  <field name="title_stemmed" type="text" indexed="true" stored="false"/>
+  <field name="title_lettertok" type="lettertok" indexed="true" stored="false"/>
+
+  <field name="syn" type="syn" indexed="true" stored="true"/>
+
+  <!-- to test property inheritance and overriding -->
+  <field name="shouldbeunstored" type="unstored"/>
+  <field name="shouldbestored" type="unstored" stored="true"/>
+  <field name="shouldbeunindexed" type="unstored" indexed="false" stored="true"/>
+
+
+  <!-- test different combinations of indexed and stored -->
+  <field name="bind" type="boolean" indexed="true" stored="false"/>
+  <field name="bsto" type="boolean" indexed="false" stored="true"/>
+  <field name="bindsto" type="boolean" indexed="true" stored="true"/>
+  <field name="isto" type="int" indexed="false" stored="true"/>
+  <field name="iind" type="int" indexed="true" stored="false"/>
+  <field name="ssto" type="string" indexed="false" stored="true"/>
+  <field name="sind" type="string" indexed="true" stored="false"/>
+  <field name="sindsto" type="string" indexed="true" stored="true"/>
+
+  <!-- test combinations of term vector settings -->
+  <field name="test_basictv" type="text" termVectors="true"/>
+  <field name="test_notv" type="text" termVectors="false"/>
+  <field name="test_postv" type="text" termVectors="true" termPositions="true"/>
+  <field name="test_offtv" type="text" termVectors="true" termOffsets="true"/>
+  <field name="test_posofftv" type="text" termVectors="true"
+         termPositions="true" termOffsets="true"/>
+
+  <!-- test highlit field settings -->
+  <field name="test_hlt" type="highlittext" indexed="true"/>
+  <field name="test_hlt_off" type="highlittext" indexed="true"/>
+
+  <!-- fields to test individual tokenizers and tokenfilters -->
+  <field name="keywordtok" type="keywordtok" indexed="true" stored="true"/>
+  <field name="standardtok" type="standardtok" indexed="true" stored="true"/>
+  <field name="HTMLstandardtok" type="HTMLstandardtok" indexed="true" stored="true"/>
+  <field name="lettertok" type="lettertok" indexed="true" stored="true"/>
+  <field name="whitetok" type="whitetok" indexed="true" stored="true"/>
+  <field name="HTMLwhitetok" type="HTMLwhitetok" indexed="true" stored="true"/>
+  <field name="standardtokfilt" type="standardtokfilt" indexed="true" stored="true"/>
+  <field name="standardfilt" type="standardfilt" indexed="true" stored="true"/>
+  <field name="lowerfilt" type="lowerfilt" indexed="true" stored="true"/>
+  <field name="patternreplacefilt" type="patternreplacefilt" indexed="true" stored="true"/>
+  <field name="porterfilt" type="porterfilt" indexed="true" stored="true"/>
+  <field name="engporterfilt" type="engporterfilt" indexed="true" stored="true"/>
+  <field name="custengporterfilt" type="custengporterfilt" indexed="true" stored="true"/>
+  <field name="stopfilt" type="stopfilt" indexed="true" stored="true"/>
+  <field name="custstopfilt" type="custstopfilt" indexed="true" stored="true"/>
+  <field name="lengthfilt" type="lengthfilt" indexed="true" stored="true"/>
+  <field name="dedup" type="dedup" indexed="true" stored="true"/>
+  <field name="wdf_nocase" type="wdf_nocase" indexed="true" stored="true"/>
+  <field name="wdf_preserve" type="wdf_preserve" indexed="true" stored="true"/>
+
+  <field name="numberpartfail" type="failtype1" indexed="true" stored="true"/>
+
+  <field name="nullfirst" type="string" indexed="true" stored="true" sortMissingFirst="true"/>
+
+  <field name="subword" type="subword" indexed="true" stored="true"/>
+  <field name="sku1" type="skutype1" indexed="true" stored="true"/>
+  <field name="sku2" type="skutype2" indexed="true" stored="true"/>
+
+  <field name="textgap" type="textgap" indexed="true" stored="true"/>
+
+  <field name="timestamp" type="date" indexed="true" stored="true" default="NOW" multiValued="false"/>
+  <field name="multiDefault" type="string" indexed="true" stored="true" default="muLti-Default" multiValued="true"/>
+  <field name="intDefault" type="int" indexed="true" stored="true" default="42" multiValued="false"/>
+
+  <field name="extractedDate" type="date" indexed="true" stored="true" multiValued="true"/>
+  <field name="extractedContent" type="text" indexed="true" stored="true" multiValued="true"/>
+  <field name="extractedProducer" type="text" indexed="true" stored="true" multiValued="true"/>
+  <field name="extractedCreator" type="text" indexed="true" stored="true" multiValued="true"/>
+  <field name="extractedKeywords" type="text" indexed="true" stored="true" multiValued="true"/>
+  <field name="extractedAuthor" type="text" indexed="true" stored="true" multiValued="true"/>
+  <field name="extractedLanguage" type="string" indexed="true" stored="true" multiValued="true"/>
+  <field name="resourceName" type="string" indexed="true" stored="true" multiValued="true"/>
+
+  <field name="extractionLiteralMV" type="string" indexed="true" stored="true" multiValued="true"/>
+  <field name="extractionLiteral" type="string" indexed="true" stored="true" multiValued="false"/>
+
+  <field name="defaultExtr" type="string" indexed="true" stored="false"/>
+
+  <!-- Dynamic field definitions.  If a field name is not found, dynamicFields
+       will be used if the name matches any of the patterns.
+       RESTRICTION: the glob-like pattern in the name attribute must have
+       a "*" only at the start or the end.
+       EXAMPLE:  name="*_i" will match any field ending in _i (like myid_i, z_i)
+       Longer patterns will be matched first.  if equal size patterns
+       both match, the first appearing in the schema will be used.
+  -->
+  <dynamicField name="*_i" type="int" indexed="true" stored="true"/>
+  <dynamicField name="*_s" type="string" indexed="true" stored="true"/>
+  <dynamicField name="*_ss" type="string" indexed="true" stored="true" multiValued="true"/>
+  <dynamicField name="*_s1" type="string" indexed="true" stored="true" multiValued="false"/>
+  <dynamicField name="*_l" type="long" indexed="true" stored="true"/>
+  <dynamicField name="*_t" type="text" indexed="true" stored="true"/>
+  <dynamicField name="*_b" type="boolean" indexed="true" stored="true"/>
+  <dynamicField name="*_f" type="float" indexed="true" stored="true"/>
+  <dynamicField name="*_d" type="double" indexed="true" stored="true"/>
+  <dynamicField name="*_dt" type="date" indexed="true" stored="true"/>
+  <dynamicField name="*_dt1" type="date" indexed="true" stored="true" multiValued="false"/>
+
+  <dynamicField name="*_sI" type="string" indexed="true" stored="false"/>
+  <dynamicField name="*_sS" type="string" indexed="false" stored="true"/>
+  <dynamicField name="t_*" type="text" indexed="true" stored="true"/>
+  <dynamicField name="tv_*" type="text" indexed="true" stored="true"
+                termVectors="true" termPositions="true" termOffsets="true"/>
+
+  <dynamicField name="stream_*" type="text" indexed="true" stored="true"/>
+  <dynamicField name="Content*" type="text" indexed="true" stored="true"/>
+
+
+  <!-- special fields for dynamic copyField test -->
+  <dynamicField name="dynamic_*" type="string" indexed="true" stored="true"/>
+  <dynamicField name="*_dynamic" type="string" indexed="true" stored="true"/>
+
+  <!-- for testing to ensure that longer patterns are matched first -->
+  <dynamicField name="*aa" type="string" indexed="true" stored="true"/>
+  <dynamicField name="*aaa" type="int" indexed="false" stored="true"/>
+
+  <!-- ignored because not stored or indexed -->
+  <dynamicField name="ignored_*" type="text" indexed="false" stored="false"/>
+
+
+  <uniqueKey>id</uniqueKey>
+
+  <!-- copyField commands copy one field to another at the time a document
+        is added to the index.  It's used either to index the same field different
+        ways, or to add multiple fields to the same field for easier/faster searching.
+   -->
+  <copyField source="title" dest="title_stemmed"/>
+  <copyField source="title" dest="title_lettertok"/>
+
+  <copyField source="title" dest="text"/>
+  <copyField source="subject" dest="text"/>
+
+  <copyField source="*_t" dest="text"/>
+
+  <!-- dynamic destination -->
+  <copyField source="*_dynamic" dest="dynamic_*"/>
+
+
+</schema>