You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ma...@apache.org on 2013/12/19 20:13:47 UTC

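svn commit: r1552398 - in /lucene/dev/trunk/solr/contrib: map-reduce/ map-reduce/src/test-files/test-documents/ map-reduce/src/test-files/test-morphlines/ map-reduce/src/test/org/apache/solr/hadoop/ morphlines-cell/src/java/org/apache/solr/morphlines/cell/ morphlines-cell/src/test/org/apache/solr/morphlines/cell/ morphlines-core/src/java/org/apache/solr/morphlines/solr/ morphlines-core/src/test/org/apache/solr/morphlines/solr/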

Author: markrmiller
Date: Thu Dec 19 19:13:47 2013
New Revision: 1552398

URL: http://svn.apache.org/r1552398
Log:
SOLR-1301: Merge Morphlines modules up to Kite 0.10 and CDK 0.9

Added:
    lucene/dev/trunk/solr/contrib/map-reduce/src/test-files/test-documents/cars.csv   (with props)
    lucene/dev/trunk/solr/contrib/map-reduce/src/test-files/test-documents/cars.csv.gz   (with props)
    lucene/dev/trunk/solr/contrib/map-reduce/src/test-files/test-documents/cars.tar.gz   (with props)
    lucene/dev/trunk/solr/contrib/map-reduce/src/test-files/test-documents/email.eml   (with props)
    lucene/dev/trunk/solr/contrib/map-reduce/src/test-files/test-documents/testRFC822   (with props)
Modified:
    lucene/dev/trunk/solr/contrib/map-reduce/build.xml
    lucene/dev/trunk/solr/contrib/map-reduce/src/test-files/test-morphlines/solrCellDocumentTypes.conf
    lucene/dev/trunk/solr/contrib/map-reduce/src/test/org/apache/solr/hadoop/MRUnitBase.java
    lucene/dev/trunk/solr/contrib/map-reduce/src/test/org/apache/solr/hadoop/MorphlineBasicMiniMRTest.java
    lucene/dev/trunk/solr/contrib/map-reduce/src/test/org/apache/solr/hadoop/MorphlineGoLiveMiniMRTest.java
    lucene/dev/trunk/solr/contrib/morphlines-cell/src/java/org/apache/solr/morphlines/cell/SolrCellBuilder.java
    lucene/dev/trunk/solr/contrib/morphlines-cell/src/test/org/apache/solr/morphlines/cell/SolrCellMorphlineTest.java
    lucene/dev/trunk/solr/contrib/morphlines-core/src/java/org/apache/solr/morphlines/solr/SolrLocator.java
    lucene/dev/trunk/solr/contrib/morphlines-core/src/test/org/apache/solr/morphlines/solr/AbstractSolrMorphlineTestBase.java
    lucene/dev/trunk/solr/contrib/morphlines-core/src/test/org/apache/solr/morphlines/solr/SolrMorphlineTest.java

Modified: lucene/dev/trunk/solr/contrib/map-reduce/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/map-reduce/build.xml?rev=1552398&r1=1552397&r2=1552398&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/map-reduce/build.xml (original)
+++ lucene/dev/trunk/solr/contrib/map-reduce/build.xml Thu Dec 19 19:13:47 2013
@@ -92,6 +92,7 @@
   <path id="test.classpath">
     <path refid="solr.test.base.classpath"/>
     <path refid="classpath.additions"/>
+    <pathelement location="${common-solr.dir}/build/contrib/solr-morphlines-core/classes/test"/>
     <fileset dir="${common-solr.dir}/contrib/morphlines-core/test-lib" excludes="${common.classpath.excludes}"/>
   </path>
 

Added: lucene/dev/trunk/solr/contrib/map-reduce/src/test-files/test-documents/cars.csv
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/map-reduce/src/test-files/test-documents/cars.csv?rev=1552398&view=auto
==============================================================================
--- lucene/dev/trunk/solr/contrib/map-reduce/src/test-files/test-documents/cars.csv (added)
+++ lucene/dev/trunk/solr/contrib/map-reduce/src/test-files/test-documents/cars.csv Thu Dec 19 19:13:47 2013
@@ -0,0 +1,6 @@
+Age,Color,Extras,Type,Used
+2,blue,GPS,"Gas, with electric",""
+10,green,"Labeled ""Vintage, 1913""",,yes
+100,red,"Labeled ""Vintage 1913""",yes
+5,orange,none,"This is a
+multi, line text",no
\ No newline at end of file

Added: lucene/dev/trunk/solr/contrib/map-reduce/src/test-files/test-documents/cars.csv.gz
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/map-reduce/src/test-files/test-documents/cars.csv.gz?rev=1552398&view=auto
==============================================================================
Binary file - no diff available.

Added: lucene/dev/trunk/solr/contrib/map-reduce/src/test-files/test-documents/cars.tar.gz
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/map-reduce/src/test-files/test-documents/cars.tar.gz?rev=1552398&view=auto
==============================================================================
Binary file - no diff available.

Added: lucene/dev/trunk/solr/contrib/map-reduce/src/test-files/test-documents/email.eml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/map-reduce/src/test-files/test-documents/email.eml?rev=1552398&view=auto
==============================================================================
--- lucene/dev/trunk/solr/contrib/map-reduce/src/test-files/test-documents/email.eml (added)
+++ lucene/dev/trunk/solr/contrib/map-reduce/src/test-files/test-documents/email.eml Thu Dec 19 19:13:47 2013
@@ -0,0 +1,40 @@
+MIME-Version: 1.0
+Received: by 10.216.199.5 with HTTP; Wed, 27 Nov 2013 12:01:23 -0800
+(PST)
+Date: Wed, 27 Nov 2013 13:01:23 -0700
+Delivered-To: foo@cloudera.com
+Message-ID:
+<CA...@mail.gmail.com>
+Subject: Test EML
+From: Patrick Foo <fo...@cloudera.com>
+To: Patrick Foo <fo...@cloudera.com>
+Content-Type: multipart/alternative;
+boundary=001a11c3815cb55dda04ec2e0f3b
+
+--001a11c3815cb55dda04ec2e0f3b
+Content-Type: text/plain; charset=ISO-8859-1
+
+This is a test
+
+-- 
+Patrick Foo
+Customer Operations Engineer
+
+<http://www.cloudera.com>
+
+--001a11c3815cb55dda04ec2e0f3b
+Content-Type: text/html; charset=ISO-8859-1
+Content-Transfer-Encoding: quoted-printable
+
+<div dir=3D"ltr">This is a test<br clear=3D"all"><div><br></div>--
+<br><div=
+ dir=3D"ltr">Patrick Foo<div>Customer Operations
+Engineer</div><div><br>=
+</div><div><a href=3D"http://www.cloudera.com" target=3D"_blank"><img
+src=
+=3D"http://files.cloudera.com.s3.amazonaws.com/New%20Branding/cloudera-smal=
+l.png"></a><br>
+</div></div>
+</div>
+
+--001a11c3815cb55dda04ec2e0f3b--

Added: lucene/dev/trunk/solr/contrib/map-reduce/src/test-files/test-documents/testRFC822
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/map-reduce/src/test-files/test-documents/testRFC822?rev=1552398&view=auto
==============================================================================
--- lucene/dev/trunk/solr/contrib/map-reduce/src/test-files/test-documents/testRFC822 (added)
+++ lucene/dev/trunk/solr/contrib/map-reduce/src/test-files/test-documents/testRFC822 Thu Dec 19 19:13:47 2013
@@ -0,0 +1,41 @@
+From: "Julien Nioche (JIRA)" <ji...@apache.org>
+To: dev@tika.apache.org
+Subject: [jira] Commented: (TIKA-461) RFC822 messages not parsed
+Reply-To: dev@tika.apache.org
+Delivered-To: mailing list dev@tika.apache.org
+Date: Mon, 6 Sep 2010 05:25:34 -0400 (EDT)
+In-Reply-To: <60...@thor>
+MIME-Version: 1.0
+Content-Type: text/plain; charset=utf-8
+Content-Transfer-Encoding: 7bit
+X-JIRA-FingerPrint: 30527f35849b9dde25b450d4833f0394
+X-Virus-Checked: Checked by ClamAV on apache.org
+
+
+    [ https://issues.apache.org/jira/browse/TIKA-461?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=12906468#action_12906468 ] 
+
+Julien Nioche commented on TIKA-461:
+------------------------------------
+
+I'll have a look at mime4j and try to use it in Tika
+
+> RFC822 messages not parsed
+> --------------------------
+>
+>                 Key: TIKA-461
+>                 URL: https://issues.apache.org/jira/browse/TIKA-461
+>             Project: Tika
+>          Issue Type: Bug
+>          Components: parser
+>    Affects Versions: 0.7
+>            Reporter: Joshua Turner
+>            Assignee: Julien Nioche
+>
+> Presented with an RFC822 message exported from Thunderbird, AutodetectParser produces an empty body, and a Metadata containing only one key-value pair: "Content-Type=message/rfc822". Directly calling MboxParser likewise gives an empty body, but with two metadata pairs: "Content-Encoding=us-ascii Content-Type=application/mbox".
+> A quick peek at the source of MboxParser shows that the implementation is pretty naive. If the wiring can be sorted out, something like Apache James' mime4j might be a better bet.
+
+-- 
+This message is automatically generated by JIRA.
+-
+You can reply to this email to add a comment to the issue online.
+

Modified: lucene/dev/trunk/solr/contrib/map-reduce/src/test-files/test-morphlines/solrCellDocumentTypes.conf
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/map-reduce/src/test-files/test-morphlines/solrCellDocumentTypes.conf?rev=1552398&r1=1552397&r2=1552398&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/map-reduce/src/test-files/test-morphlines/solrCellDocumentTypes.conf (original)
+++ lucene/dev/trunk/solr/contrib/map-reduce/src/test-files/test-morphlines/solrCellDocumentTypes.conf Thu Dec 19 19:13:47 2013
@@ -78,6 +78,32 @@ morphlines : [
               ]
             }      
 
+            {
+              commands : [ 
+                { 
+                  readCSV { 
+                    supportedMimeTypes : [text/csv]
+                    charset : UTF-8
+                    ignoreFirstLine : false 
+                    columns : [ user_screen_name, text ]
+                  } 
+                }
+                    
+                { 
+                  generateUUID { 
+                    field : id 
+                    preserveExisting : false
+                  } 
+                }
+                     
+                {
+                  sanitizeUnknownSolrFields {
+                    solrLocator : ${SOLR_LOCATOR}
+                  }
+                }  
+              ]
+            }      
+
             { 
               commands : [
                 { 
@@ -180,6 +206,7 @@ morphlines : [
                     # the parser is chosen that is closest to the bottom in this list:
                     parsers : [                    
                       { parser : org.apache.tika.parser.asm.ClassParser }
+                      # { parser : org.apache.tika.parser.AutoDetectParser }                      
                       # { parser : org.gagravarr.tika.OggParser, additionalSupportedMimeTypes : [audio/ogg] }
                       { parser : org.gagravarr.tika.FlacParser }
                       { parser : org.apache.tika.parser.audio.AudioParser }                      
@@ -218,6 +245,7 @@ morphlines : [
                       { parser : org.apache.tika.parser.xml.DcXMLParser }
                       { parser : org.apache.tika.parser.xml.FictionBookParser }
                       { parser : org.apache.tika.parser.chm.ChmParser }                          
+                      #{ parser : org.apache.tika.parser.AutoDetectParser }                          
                     ] 
                   }
                 }

Modified: lucene/dev/trunk/solr/contrib/map-reduce/src/test/org/apache/solr/hadoop/MRUnitBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/map-reduce/src/test/org/apache/solr/hadoop/MRUnitBase.java?rev=1552398&r1=1552397&r2=1552398&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/map-reduce/src/test/org/apache/solr/hadoop/MRUnitBase.java (original)
+++ lucene/dev/trunk/solr/contrib/map-reduce/src/test/org/apache/solr/hadoop/MRUnitBase.java Thu Dec 19 19:13:47 2013
@@ -23,6 +23,7 @@ import org.apache.commons.io.FileUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.solr.SolrTestCaseJ4;
 import org.apache.solr.hadoop.morphline.MorphlineMapRunner;
+import org.apache.solr.morphlines.solr.AbstractSolrMorphlineTestBase;
 import org.apache.solr.util.ExternalPaths;
 import org.junit.AfterClass;
 import org.junit.BeforeClass;
@@ -50,17 +51,10 @@ public abstract class MRUnitBase extends
     new File(tempDir).mkdirs();
     FileUtils.copyFile(new File(RESOURCES_DIR + "/custom-mimetypes.xml"), new File(tempDir + "/custom-mimetypes.xml"));
     
-    setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes");
+    AbstractSolrMorphlineTestBase.setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes", true);
     
     config.set(MorphlineMapRunner.MORPHLINE_FILE_PARAM, tempDir + "/test-morphlines/solrCellDocumentTypes.conf");
     config.set(SolrOutputFormat.ZIP_NAME, solrHomeZip.getName());
   }
   
-  public static void setupMorphline(String tempDir, String file) throws IOException {
-    String morphlineText = FileUtils.readFileToString(new File(RESOURCES_DIR + "/" + file + ".conf"), "UTF-8");
-    morphlineText = morphlineText.replaceAll("RESOURCES_DIR", new File(tempDir).getAbsolutePath());
-    morphlineText = morphlineText.replaceAll("\\$\\{SOLR_LOCATOR\\}",  "{ collection : collection1 }");
-    
-    FileUtils.writeStringToFile(new File(tempDir + "/" + file + ".conf"), morphlineText, "UTF-8");
-  }
 }

Modified: lucene/dev/trunk/solr/contrib/map-reduce/src/test/org/apache/solr/hadoop/MorphlineBasicMiniMRTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/map-reduce/src/test/org/apache/solr/hadoop/MorphlineBasicMiniMRTest.java?rev=1552398&r1=1552397&r2=1552398&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/map-reduce/src/test/org/apache/solr/hadoop/MorphlineBasicMiniMRTest.java (original)
+++ lucene/dev/trunk/solr/contrib/map-reduce/src/test/org/apache/solr/hadoop/MorphlineBasicMiniMRTest.java Thu Dec 19 19:13:47 2013
@@ -42,6 +42,7 @@ import org.apache.lucene.util.LuceneTest
 import org.apache.solr.SolrTestCaseJ4;
 import org.apache.solr.cloud.AbstractZkTestCase;
 import org.apache.solr.hadoop.hack.MiniMRCluster;
+import org.apache.solr.morphlines.solr.AbstractSolrMorphlineTestBase;
 import org.apache.solr.util.ExternalPaths;
 import org.junit.After;
 import org.junit.AfterClass;
@@ -125,7 +126,7 @@ public class MorphlineBasicMiniMRTest ex
     new File(tempDir).mkdirs();
     FileUtils.copyFile(new File(RESOURCES_DIR + "/custom-mimetypes.xml"), new File(tempDir + "/custom-mimetypes.xml"));
     
-    MRUnitBase.setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes");
+    AbstractSolrMorphlineTestBase.setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes", true);
     
     System.setProperty("hadoop.log.dir", new File(solrHomeDirectory, "logs").getAbsolutePath());
     

Modified: lucene/dev/trunk/solr/contrib/map-reduce/src/test/org/apache/solr/hadoop/MorphlineGoLiveMiniMRTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/map-reduce/src/test/org/apache/solr/hadoop/MorphlineGoLiveMiniMRTest.java?rev=1552398&r1=1552397&r2=1552398&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/map-reduce/src/test/org/apache/solr/hadoop/MorphlineGoLiveMiniMRTest.java (original)
+++ lucene/dev/trunk/solr/contrib/map-reduce/src/test/org/apache/solr/hadoop/MorphlineGoLiveMiniMRTest.java Thu Dec 19 19:13:47 2013
@@ -67,6 +67,7 @@ import org.apache.solr.common.params.Mod
 import org.apache.solr.common.util.NamedList;
 import org.apache.solr.hadoop.hack.MiniMRClientCluster;
 import org.apache.solr.hadoop.hack.MiniMRClientClusterFactory;
+import org.apache.solr.morphlines.solr.AbstractSolrMorphlineTestBase;
 import org.apache.solr.util.ExternalPaths;
 import org.junit.After;
 import org.junit.AfterClass;
@@ -142,7 +143,7 @@ public class MorphlineGoLiveMiniMRTest e
     new File(tempDir).mkdirs();
     FileUtils.copyFile(new File(RESOURCES_DIR + "/custom-mimetypes.xml"), new File(tempDir + "/custom-mimetypes.xml"));
     
-    MRUnitBase.setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes");
+    AbstractSolrMorphlineTestBase.setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes", true);
     
     
     System.setProperty("hadoop.log.dir", new File(dataDir, "logs").getAbsolutePath());

Modified: lucene/dev/trunk/solr/contrib/morphlines-cell/src/java/org/apache/solr/morphlines/cell/SolrCellBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/morphlines-cell/src/java/org/apache/solr/morphlines/cell/SolrCellBuilder.java?rev=1552398&r1=1552397&r2=1552398&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/morphlines-cell/src/java/org/apache/solr/morphlines/cell/SolrCellBuilder.java (original)
+++ lucene/dev/trunk/solr/contrib/morphlines-cell/src/java/org/apache/solr/morphlines/cell/SolrCellBuilder.java Thu Dec 19 19:13:47 2013
@@ -223,16 +223,12 @@ public final class SolrCellBuilder imple
       
       ParseContext parseContext = new ParseContext();
       
-      // necessary for gzipped files or tar files, etc! copied from TikaCLI
-      parseContext.set(Parser.class, parser);
-      
       Metadata metadata = new Metadata();
       for (Entry<String, Object> entry : record.getFields().entries()) {
         metadata.add(entry.getKey(), entry.getValue().toString());
       }
 
       SolrContentHandler handler = solrContentHandlerFactory.createSolrContentHandler(metadata, solrParams, schema);
-      
       try {
         inputStream = TikaInputStream.get(inputStream);
 

Modified: lucene/dev/trunk/solr/contrib/morphlines-cell/src/test/org/apache/solr/morphlines/cell/SolrCellMorphlineTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/morphlines-cell/src/test/org/apache/solr/morphlines/cell/SolrCellMorphlineTest.java?rev=1552398&r1=1552397&r2=1552398&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/morphlines-cell/src/test/org/apache/solr/morphlines/cell/SolrCellMorphlineTest.java (original)
+++ lucene/dev/trunk/solr/contrib/morphlines-cell/src/test/org/apache/solr/morphlines/cell/SolrCellMorphlineTest.java Thu Dec 19 19:13:47 2013
@@ -18,6 +18,7 @@ package org.apache.solr.morphlines.cell;
 
 import java.io.File;
 import java.util.HashMap;
+import java.util.LinkedHashMap;
 import java.util.Map;
 
 import org.apache.commons.io.FileUtils;
@@ -37,7 +38,7 @@ import org.junit.Test;
 public class SolrCellMorphlineTest extends AbstractSolrMorphlineTestBase {
 
   private Map<String,Integer> expectedRecords = new HashMap<String,Integer>();
-
+  private Map<String, Map<String, Object>> expectedRecordContents = new HashMap<String, Map<String, Object>>();
   @BeforeClass
   public static void beforeClass2() {
     assumeFalse("FIXME: This test fails under Java 8 due to the Saxon dependency - see SOLR-1301", Constants.JRE_IS_MINIMUM_JAVA8);
@@ -47,16 +48,17 @@ public class SolrCellMorphlineTest exten
   @Before
   public void setUp() throws Exception {
     super.setUp();
+    
     String path = RESOURCES_DIR + "/test-documents";
     expectedRecords.put(path + "/sample-statuses-20120906-141433.avro", 2);
     expectedRecords.put(path + "/sample-statuses-20120906-141433", 2);
     expectedRecords.put(path + "/sample-statuses-20120906-141433.gz", 2);
     expectedRecords.put(path + "/sample-statuses-20120906-141433.bz2", 2);
-    expectedRecords.put(path + "/cars.csv", 5);
-    expectedRecords.put(path + "/cars.csv.gz", 5);
+    expectedRecords.put(path + "/cars.csv", 6);
+    expectedRecords.put(path + "/cars.csv.gz", 6);
     expectedRecords.put(path + "/cars.tar.gz", 4);
-    expectedRecords.put(path + "/cars.tsv", 5);
-    expectedRecords.put(path + "/cars.ssv", 5);
+    expectedRecords.put(path + "/cars.tsv", 6);
+    expectedRecords.put(path + "/cars.ssv", 6);
     expectedRecords.put(path + "/test-documents.7z", 9);
     expectedRecords.put(path + "/test-documents.cpio", 9);
     expectedRecords.put(path + "/test-documents.tar", 9);
@@ -65,12 +67,80 @@ public class SolrCellMorphlineTest exten
     expectedRecords.put(path + "/test-documents.zip", 9);
     expectedRecords.put(path + "/multiline-stacktrace.log", 4);
     
+    {
+      Map<String, Object> record = new LinkedHashMap();
+      record.put("ignored__attachment_mimetype", "image/jpeg");
+      record.put("ignored_exif_isospeedratings", "400");
+      record.put("ignored_meta_creation_date", "2009-08-11T09:09:45");
+      record.put("ignored_tiff_model", "Canon EOS 40D");
+      record.put("text", NON_EMPTY_FIELD);
+      expectedRecordContents.put("/testJPEG_EXIF.jpg", record);
+      expectedRecordContents.put("/testJPEG_EXIF.jpg.tar", record);
+      expectedRecordContents.put("/testJPEG_EXIF.jpg.tar.gz", record);
+    }
+    
+    {
+      String file = path + "/testWORD_various.doc";
+      Map<String, Object> record = new LinkedHashMap();
+      record.put("ignored__attachment_mimetype", "application/msword");
+      record.put("ignored_author", "Michael McCandless");
+      record.put("ignored_creation_date", "2011-09-02T10:11:00Z");
+      record.put("ignored_title", "");
+      record.put("ignored_keywords", "Keyword1 Keyword2");
+      record.put("ignored_subject", "Subject is here");
+      record.put("text", NON_EMPTY_FIELD);
+      expectedRecordContents.put(file, record);
+    }
+    
+    {
+      String file = path + "/testPDF.pdf";
+      Map<String, Object> record = new LinkedHashMap();
+      record.put("ignored__attachment_mimetype", "application/pdf");
+      record.put("ignored_author", "Bertrand Delacrétaz");
+      record.put("ignored_creation_date", "2007-09-15T09:02:31Z");
+      record.put("ignored_title", "Apache Tika - Apache Tika");
+      record.put("ignored_xmp_creatortool", "Firefox");
+      record.put("text", NON_EMPTY_FIELD);
+      expectedRecordContents.put(file, record);
+    }
+    
+    {
+      String file = path + "/email.eml";
+      Map<String, Object> record = new LinkedHashMap();
+      String name = "Patrick Foo <fo...@cloudera.com>";
+      record.put("ignored__attachment_mimetype", "message/rfc822");
+      record.put("ignored_author", name);
+      //record.put("ignored_content_length", "1068");
+      record.put("ignored_creation_date", "2013-11-27T20:01:23Z");
+      record.put("ignored_message_from", name);
+      record.put("ignored_message_to", name);
+      record.put("ignored_creator", name);
+      record.put("ignored_dc_creator", name);
+      record.put("ignored_dc_title", "Test EML");
+      record.put("ignored_dcterms_created", "2013-11-27T20:01:23Z");
+      record.put("ignored_meta_author", name);
+      record.put("ignored_meta_creation_date", "2013-11-27T20:01:23Z");
+      record.put("ignored_subject", "Test EML");
+      record.put("text", NON_EMPTY_FIELD);
+      expectedRecordContents.put(file, record);
+    }
+
+    {
+      String file = path + "/testEXCEL.xlsx";
+      Map<String, Object> record = new LinkedHashMap();
+      record.put("ignored__attachment_mimetype", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
+      record.put("ignored_author", "Keith Bennett");
+      record.put("ignored_creation_date", "2007-10-01T16:13:56Z");
+      record.put("ignored_title", "Simple Excel document");
+      record.put("text", NON_EMPTY_FIELD);
+      expectedRecordContents.put(file, record);
+    }    
+    
     FileUtils.copyFile(new File(RESOURCES_DIR + "/custom-mimetypes.xml"), new File(tempDir + "/custom-mimetypes.xml"));
   }
   
   @Test
   public void testSolrCellJPGCompressed() throws Exception {
-    
     morphline = createMorphline("test-morphlines/solrCellJPGCompressed");    
     String path = RESOURCES_DIR + "/test-documents";
     String[] files = new String[] {
@@ -79,7 +149,7 @@ public class SolrCellMorphlineTest exten
         path + "/testJPEG_EXIF.jpg.tar.gz",
         //path + "/jpeg2000.jp2",
     };
-    testDocumentTypesInternal(files, expectedRecords);
+    testDocumentTypesInternal(files, expectedRecords, expectedRecordContents);
   }  
 
   @Test
@@ -89,13 +159,14 @@ public class SolrCellMorphlineTest exten
     String[] files = new String[] {
         path + "/testXML2.xml",
     };
-    testDocumentTypesInternal(files, expectedRecords);
+    testDocumentTypesInternal(files, expectedRecords, expectedRecordContents);
   }  
 
   @Test
   public void testSolrCellDocumentTypes() throws Exception {
- 
-    morphline = createMorphline("test-morphlines/solrCellDocumentTypes");    
+    AbstractSolrMorphlineTestBase.setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes", false);
+    
+    morphline = createMorphline(new File(tempDir).getAbsolutePath() +  "/test-morphlines/solrCellDocumentTypes");    
     String path = RESOURCES_DIR + "/test-documents";
     String[] files = new String[] {
         path + "/testBMPfp.txt",
@@ -107,22 +178,26 @@ public class SolrCellMorphlineTest exten
         path + "/testJPEG_EXIF.jpg.gz",
         path + "/testJPEG_EXIF.jpg.tar.gz",
         path + "/testXML.xml",          
-//        path + "/cars.csv",
+        path + "/cars.csv",
 //        path + "/cars.tsv",
 //        path + "/cars.ssv",
-//        path + "/cars.csv.gz",
-//        path + "/cars.tar.gz",
+        path + "/cars.csv.gz",
+        path + "/cars.tar.gz",
         path + "/sample-statuses-20120906-141433.avro",
         path + "/sample-statuses-20120906-141433",
         path + "/sample-statuses-20120906-141433.gz",
         path + "/sample-statuses-20120906-141433.bz2",
+        path + "/email.eml",
     };
-    testDocumentTypesInternal(files, expectedRecords);
+    testDocumentTypesInternal(files, expectedRecords, expectedRecordContents);
   }
   
   @Test
   public void testSolrCellDocumentTypes2() throws Exception {
-    morphline = createMorphline("test-morphlines/solrCellDocumentTypes");    
+
+    AbstractSolrMorphlineTestBase.setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes", false);
+    
+    morphline = createMorphline(new File(tempDir).getAbsolutePath() + "/test-morphlines/solrCellDocumentTypes");    
     String path = RESOURCES_DIR + "/test-documents";
     String[] files = new String[] {
         path + "/testPPT_various.ppt",
@@ -137,7 +212,7 @@ public class SolrCellMorphlineTest exten
         path + "/complex.mbox", 
         path + "/test-outlook.msg", 
         path + "/testEMLX.emlx",
-//        path + "/testRFC822",  
+        path + "/testRFC822",  
         path + "/rsstest.rss", 
 //        path + "/testDITA.dita", 
         
@@ -176,7 +251,7 @@ public class SolrCellMorphlineTest exten
 //        path + "/testWINMAIL.dat", 
 //        path + "/testWMF.wmf", 
     };   
-    testDocumentTypesInternal(files, expectedRecords);
+    testDocumentTypesInternal(files, expectedRecords, expectedRecordContents);
   }
 
   /**

Modified: lucene/dev/trunk/solr/contrib/morphlines-core/src/java/org/apache/solr/morphlines/solr/SolrLocator.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/morphlines-core/src/java/org/apache/solr/morphlines/solr/SolrLocator.java?rev=1552398&r1=1552397&r2=1552398&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/morphlines-core/src/java/org/apache/solr/morphlines/solr/SolrLocator.java (original)
+++ lucene/dev/trunk/solr/contrib/morphlines-core/src/java/org/apache/solr/morphlines/solr/SolrLocator.java Thu Dec 19 19:13:47 2013
@@ -20,11 +20,13 @@ import org.kitesdk.morphline.api.Morphli
 import org.kitesdk.morphline.api.MorphlineContext;
 import org.kitesdk.morphline.api.MorphlineRuntimeException;
 import org.kitesdk.morphline.base.Configs;
+
 import com.google.common.base.Preconditions;
 import com.typesafe.config.Config;
 import com.typesafe.config.ConfigFactory;
 import com.typesafe.config.ConfigRenderOptions;
 import com.typesafe.config.ConfigUtil;
+
 import org.apache.solr.client.solrj.SolrServer;
 import org.apache.solr.client.solrj.impl.CloudSolrServer;
 import org.apache.solr.common.cloud.SolrZkClient;
@@ -39,8 +41,10 @@ import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;
 
 import javax.xml.parsers.ParserConfigurationException;
+
 import java.io.File;
 import java.io.IOException;
+import java.net.MalformedURLException;
 
 /**
  * Set of configuration parameters that identify the location and schema of a Solr server or
@@ -57,8 +61,6 @@ public class SolrLocator {
   private String solrHomeDir;
   private int batchSize = 1000;
   
-  private static final String SOLR_HOME_PROPERTY_NAME = "solr.solr.home";
-
   private static final Logger LOG = LoggerFactory.getLogger(SolrLocator.class);
 
   protected SolrLocator(MorphlineContext context) {
@@ -120,7 +122,6 @@ public class SolrLocator {
     
     // If solrHomeDir isn't defined and zkHost and collectionName are defined 
     // then download schema.xml and solrconfig.xml, etc from zk and use that as solrHomeDir
-    String oldSolrHomeDir = null;
     String mySolrHomeDir = solrHomeDir;
     if (solrHomeDir == null || solrHomeDir.length() == 0) {
       if (zkHost == null || zkHost.length() == 0) {
@@ -150,20 +151,13 @@ public class SolrLocator {
       }
     }
     
-    oldSolrHomeDir = System.setProperty(SOLR_HOME_PROPERTY_NAME, mySolrHomeDir);
+    LOG.debug("SolrLocator loading IndexSchema from dir {}", mySolrHomeDir);
     try {
-      SolrConfig solrConfig = new SolrConfig(); // TODO use SolrResourceLoader ala TikaMapper?
-      // SolrConfig solrConfig = new SolrConfig("solrconfig.xml");
-      // SolrConfig solrConfig = new
-      // SolrConfig("/cloud/apache-solr-4.0.0-BETA/example/solr/collection1",
-      // "solrconfig.xml", null);
-      // SolrConfig solrConfig = new
-      // SolrConfig("/cloud/apache-solr-4.0.0-BETA/example/solr/collection1/conf/solrconfig.xml");
-      SolrResourceLoader loader = solrConfig.getResourceLoader();
-      
+      SolrResourceLoader loader = new SolrResourceLoader(mySolrHomeDir);
+      SolrConfig solrConfig = new SolrConfig(loader, "solrconfig.xml", null);
       InputSource is = new InputSource(loader.openSchema("schema.xml"));
-          is.setSystemId(SystemIdResolver.createSystemIdFromResourceName("schema.xml"));
-        
+      is.setSystemId(SystemIdResolver.createSystemIdFromResourceName("schema.xml"));
+      
       IndexSchema schema = new IndexSchema(solrConfig, "schema.xml", is);
       validateSchema(schema);
       return schema;
@@ -173,14 +167,6 @@ public class SolrLocator {
       throw new MorphlineRuntimeException(e);
     } catch (SAXException e) {
       throw new MorphlineRuntimeException(e);
-    } finally { // restore old global state
-      if (solrHomeDir != null) {
-        if (oldSolrHomeDir == null) {
-          System.clearProperty(SOLR_HOME_PROPERTY_NAME);
-        } else {
-          System.setProperty(SOLR_HOME_PROPERTY_NAME, oldSolrHomeDir);
-        }
-      }
     }
   }
   

Modified: lucene/dev/trunk/solr/contrib/morphlines-core/src/test/org/apache/solr/morphlines/solr/AbstractSolrMorphlineTestBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/morphlines-core/src/test/org/apache/solr/morphlines/solr/AbstractSolrMorphlineTestBase.java?rev=1552398&r1=1552397&r2=1552398&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/morphlines-core/src/test/org/apache/solr/morphlines/solr/AbstractSolrMorphlineTestBase.java (original)
+++ lucene/dev/trunk/solr/contrib/morphlines-core/src/test/org/apache/solr/morphlines/solr/AbstractSolrMorphlineTestBase.java Thu Dec 19 19:13:47 2013
@@ -19,6 +19,7 @@ package org.apache.solr.morphlines.solr;
 import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.IOException;
+import java.util.Arrays;
 import java.util.Collection;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -28,7 +29,6 @@ import java.util.Map.Entry;
 import java.util.concurrent.atomic.AtomicInteger;
 
 import org.apache.commons.io.FileUtils;
-import org.apache.commons.lang.StringEscapeUtils;
 import org.apache.solr.SolrTestCaseJ4;
 import org.apache.solr.client.solrj.SolrQuery;
 import org.apache.solr.client.solrj.SolrServer;
@@ -41,9 +41,6 @@ import org.apache.solr.util.ExternalPath
 import org.junit.After;
 import org.junit.Before;
 import org.junit.BeforeClass;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
 import org.kitesdk.morphline.api.Collector;
 import org.kitesdk.morphline.api.Command;
 import org.kitesdk.morphline.api.MorphlineContext;
@@ -53,6 +50,9 @@ import org.kitesdk.morphline.base.FaultT
 import org.kitesdk.morphline.base.Fields;
 import org.kitesdk.morphline.base.Notifications;
 import org.kitesdk.morphline.stdlib.PipeBuilder;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import com.codahale.metrics.MetricRegistry;
 import com.google.common.io.Files;
 import com.typesafe.config.Config;
@@ -73,6 +73,8 @@ public class AbstractSolrMorphlineTestBa
   protected static final AtomicInteger SEQ_NUM = new AtomicInteger();
   protected static final AtomicInteger SEQ_NUM2 = new AtomicInteger();
   
+  protected static final Object NON_EMPTY_FIELD = new Object();
+  
   private static final Logger LOGGER = LoggerFactory.getLogger(AbstractSolrMorphlineTestBase.class);
   
   protected String tempDir;
@@ -113,7 +115,7 @@ public class AbstractSolrMorphlineTestBa
     testServer = new SolrServerDocumentLoader(solrServer, batchSize);
     deleteAllDocuments();
     
-    tempDir = TEMP_DIR + "/test-morphlines-" + System.currentTimeMillis();
+    tempDir = new File(TEMP_DIR + "/test-morphlines-" + System.currentTimeMillis()).getAbsolutePath();
     new File(tempDir).mkdirs();
   }
   
@@ -124,7 +126,11 @@ public class AbstractSolrMorphlineTestBa
     super.tearDown();
   }
 
-  protected void testDocumentTypesInternal(String[] files, Map<String,Integer> expectedRecords) throws Exception {
+  protected void testDocumentTypesInternal(
+      String[] files, 
+      Map<String,Integer> expectedRecords, 
+      Map<String, Map<String, Object>> expectedRecordContents) throws Exception {
+    
     deleteAllDocuments();
     int numDocs = 0;    
     for (int i = 0; i < 1; i++) {
@@ -137,6 +143,7 @@ public class AbstractSolrMorphlineTestBa
         event.getFields().put(Fields.ATTACHMENT_BODY, new ByteArrayInputStream(body));
         event.getFields().put(Fields.ATTACHMENT_NAME, f.getName());
         event.getFields().put(Fields.BASE_ID, f.getName());        
+        collector.reset();
         load(event);
         Integer count = expectedRecords.get(file);
         if (count != null) {
@@ -145,6 +152,20 @@ public class AbstractSolrMorphlineTestBa
           numDocs++;
         }
         assertEquals("unexpected results in " + file, numDocs, queryResultSetSize("*:*"));
+        Map<String, Object> expectedContents = expectedRecordContents.get(file); // optional per-file field expectations
+        if (expectedContents != null) {
+          Record actual = collector.getFirstRecord();
+          for (Map.Entry<String, Object> entry : expectedContents.entrySet()) {
+            if (entry.getValue() == NON_EMPTY_FIELD) { // sentinel: field only has to be present and non-empty
+              assertNotNull("key:" + entry.getKey(), actual.getFirstValue(entry.getKey())); // check the value, not the map key
+              assertTrue("key:" + entry.getKey(), actual.getFirstValue(entry.getKey()).toString().length() > 0);
+            } else if (entry.getValue() == null) { // null expectation: field must carry no values
+              assertEquals("key:" + entry.getKey(), 0, actual.get(entry.getKey()).size());
+            } else { // otherwise the field must hold exactly the single expected value
+              assertEquals("key:" + entry.getKey(), Arrays.asList(entry.getValue()), actual.get(entry.getKey()));
+            }
+          }
+        }
       }
     }
     assertEquals(numDocs, queryResultSetSize("*:*"));
@@ -180,17 +201,7 @@ public class AbstractSolrMorphlineTestBa
     s.commit();
   }
 
-  
-  public static void setupMorphline(String tempDir, String file) throws IOException {
-    String morphlineText = FileUtils.readFileToString(new File(RESOURCES_DIR + "/" + file + ".conf"), "UTF-8");
-    morphlineText = morphlineText.replace("RESOURCES_DIR", StringEscapeUtils.escapeJavaScript(new File(tempDir).getAbsolutePath()));
-    
-    FileUtils.writeStringToFile(new File(tempDir + "/" + file + ".conf"), morphlineText, "UTF-8");
-  }
-  
   protected Command createMorphline(String file) throws IOException {
-    setupMorphline(tempDir, file);
-    
     return new PipeBuilder().build(parse(file), null, collector, createMorphlineContext());
   }
 
@@ -206,7 +217,13 @@ public class AbstractSolrMorphlineTestBa
   private Config parse(String file) throws IOException {
     SolrLocator locator = new SolrLocator(createMorphlineContext());
     locator.setSolrHomeDir(testSolrHome + "/collection1");
-    Config config = new Compiler().parse(new File(tempDir + "/" + file + ".conf"), locator.toConfig("SOLR_LOCATOR"));
+    File morphlineFile;
+    if (new File(file).isAbsolute()) {
+      morphlineFile = new File(file + ".conf");
+    } else {
+      morphlineFile = new File(RESOURCES_DIR + "/" + file + ".conf");
+    }
+    Config config = new Compiler().parse(morphlineFile, locator.toConfig("SOLR_LOCATOR"));
     config = config.getConfigList("morphlines").get(0);
     return config;
   }
@@ -266,4 +283,15 @@ public class AbstractSolrMorphlineTestBa
     public HashSet<String> getFieldValues() { return fieldValues; }
     public CompareType getCompareType() { return compareType; }
   }
+  
+  public static void setupMorphline(String tempDir, String file, boolean replaceSolrLocator) throws IOException { // copies RESOURCES_DIR/<file>.conf to tempDir with placeholders substituted
+    String morphlineText = FileUtils.readFileToString(new File(RESOURCES_DIR + "/" + file + ".conf"), "UTF-8");
+    morphlineText = morphlineText.replace("RESOURCES_DIR", new File(tempDir).getAbsolutePath()); // literal replace: a path may contain '\' or '$', which replaceAll would interpret
+    if (replaceSolrLocator) {
+      morphlineText = morphlineText.replace("${SOLR_LOCATOR}", // literal placeholder, so no regex escaping is needed
+          "{ collection : collection1 }");
+    }
+    new File(tempDir + "/" + file + ".conf").getParentFile().mkdirs(); // file may contain sub-directories, e.g. "test-morphlines/xyz"
+    FileUtils.writeStringToFile(new File(tempDir + "/" + file + ".conf"), morphlineText, "UTF-8");
+  }
 }

Modified: lucene/dev/trunk/solr/contrib/morphlines-core/src/test/org/apache/solr/morphlines/solr/SolrMorphlineTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/morphlines-core/src/test/org/apache/solr/morphlines/solr/SolrMorphlineTest.java?rev=1552398&r1=1552397&r2=1552398&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/morphlines-core/src/test/org/apache/solr/morphlines/solr/SolrMorphlineTest.java (original)
+++ lucene/dev/trunk/solr/contrib/morphlines-core/src/test/org/apache/solr/morphlines/solr/SolrMorphlineTest.java Thu Dec 19 19:13:47 2013
@@ -57,18 +57,21 @@ public class SolrMorphlineTest extends A
     
   @Test
   public void testTokenizeText() throws Exception {
-    morphline = createMorphline("test-morphlines/tokenizeText");    
-    Record record = new Record();
-    record.put(Fields.MESSAGE, "Hello World!");
-    record.put(Fields.MESSAGE, "\nFoo@Bar.com #%()123");
-    Record expected = record.copy();
-    expected.getFields().putAll("tokens", Arrays.asList("hello", "world", "foo", "bar.com", "123"));
-    startSession();
-    Notifications.notifyBeginTransaction(morphline);
-    assertTrue(morphline.process(record));
-    assertEquals(1, collector.getNumStartEvents());
-    Notifications.notifyCommitTransaction(morphline);
-    assertEquals(expected, collector.getFirstRecord());
+    morphline = createMorphline("test-morphlines/tokenizeText");
+    for (int i = 0; i < 3; i++) {
+      Record record = new Record();
+      record.put(Fields.MESSAGE, "Hello World!");
+      record.put(Fields.MESSAGE, "\nFoo@Bar.com #%()123");
+      Record expected = record.copy();
+      expected.getFields().putAll("tokens", Arrays.asList("hello", "world", "foo", "bar.com", "123"));
+      collector.reset();
+      startSession();
+      Notifications.notifyBeginTransaction(morphline);
+      assertTrue(morphline.process(record));
+      assertEquals(1, collector.getNumStartEvents());
+      Notifications.notifyCommitTransaction(morphline);
+      assertEquals(expected, collector.getFirstRecord());
+    }
   }
     
 }