You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by no...@apache.org on 2015/04/30 16:23:46 UTC

svn commit: r1677004 - in /lucene/dev/branches/branch_5x: ./ solr/ solr/contrib/ solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/ solr/contrib/dataimporthandler-extras/src/test-files/dihextras/ solr/contrib/dataimporth...

Author: noble
Date: Thu Apr 30 14:23:46 2015
New Revision: 1677004

URL: http://svn.apache.org/r1677004
Log:
SOLR-7231: DIH TikaEntityProcessor: create lat-lon field from Metadata

Added:
    lucene/dev/branches/branch_5x/solr/contrib/dataimporthandler-extras/src/test-files/dihextras/test_jpeg.jpg
      - copied unchanged from r1677001, lucene/dev/trunk/solr/contrib/dataimporthandler-extras/src/test-files/dihextras/test_jpeg.jpg
Modified:
    lucene/dev/branches/branch_5x/   (props changed)
    lucene/dev/branches/branch_5x/solr/   (props changed)
    lucene/dev/branches/branch_5x/solr/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_5x/solr/contrib/   (props changed)
    lucene/dev/branches/branch_5x/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/TikaEntityProcessor.java
    lucene/dev/branches/branch_5x/solr/contrib/dataimporthandler-extras/src/test-files/dihextras/solr/collection1/conf/dataimport-schema-no-unique-key.xml
    lucene/dev/branches/branch_5x/solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java

Modified: lucene/dev/branches/branch_5x/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/CHANGES.txt?rev=1677004&r1=1677003&r2=1677004&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/CHANGES.txt (original)
+++ lucene/dev/branches/branch_5x/solr/CHANGES.txt Thu Apr 30 14:23:46 2015
@@ -103,6 +103,9 @@ New Features
   & json.facet={ colors:{type:terms, field:color, excludeTags=COLOR} } 
   (yonik)
 
+* SOLR-7231: DIH TikaEntityProcessor: create lat-lon field from Metadata
+  (Tim Allison via Noble Paul)
+
 
 Bug Fixes
 ----------------------

Modified: lucene/dev/branches/branch_5x/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/TikaEntityProcessor.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/TikaEntityProcessor.java?rev=1677004&r1=1677003&r2=1677004&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/TikaEntityProcessor.java (original)
+++ lucene/dev/branches/branch_5x/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/TikaEntityProcessor.java Thu Apr 30 14:23:46 2015
@@ -45,6 +45,7 @@ import java.io.InputStream;
 import java.io.StringWriter;
 import java.io.Writer;
 import java.util.HashMap;
+import java.util.Locale;
 import java.util.Map;
 
 import static org.apache.solr.handler.dataimport.DataImportHandlerException.SEVERE;
@@ -55,6 +56,10 @@ import static org.apache.solr.handler.da
  * <p>An implementation of {@link EntityProcessor} which reads data from rich docs
  * using <a href="http://tika.apache.org/">Apache Tika</a>
  *
+ * <p>To index latitude/longitude data extracted from a
+ * file's metadata, name the target geo field with the entity
+ * attribute <code>spatialMetadataField</code>; when both values
+ * are present they are added to the row as <code>lat,lon</code>.
  *
  * @since solr 3.1
  */
@@ -67,6 +72,7 @@ public class TikaEntityProcessor extends
   private String parser;
   static final String AUTO_PARSER = "org.apache.tika.parser.AutoDetectParser";
   private String htmlMapper;
+  private String spatialMetadataField;
 
   @Override
   public void init(Context context) {
@@ -113,6 +119,8 @@ public class TikaEntityProcessor extends
     if(parser == null) {
       parser = AUTO_PARSER;
     }
+
+    spatialMetadataField = context.getResolvedEntityAttribute("spatialMetadataField");
   }
 
   @Override
@@ -167,10 +175,20 @@ public class TikaEntityProcessor extends
       if (s != null) row.put(col, s);
     }
     if(!"none".equals(format) ) row.put("text", sw.toString());
+    tryToAddLatLon(metadata, row);
     done = true;
     return row;
   }
 
+  private void tryToAddLatLon(Metadata metadata, Map<String, Object> row) {
+    if (spatialMetadataField == null) return;
+    String latString = metadata.get(Metadata.LATITUDE);
+    String lonString = metadata.get(Metadata.LONGITUDE);
+    if (latString != null && lonString != null) {
+      row.put(spatialMetadataField, String.format(Locale.ROOT, "%s,%s", latString, lonString));
+    }
+  }
+
   private static ContentHandler getHtmlHandler(Writer writer)
           throws TransformerConfigurationException {
     SAXTransformerFactory factory = (SAXTransformerFactory)

Modified: lucene/dev/branches/branch_5x/solr/contrib/dataimporthandler-extras/src/test-files/dihextras/solr/collection1/conf/dataimport-schema-no-unique-key.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/contrib/dataimporthandler-extras/src/test-files/dihextras/solr/collection1/conf/dataimport-schema-no-unique-key.xml?rev=1677004&r1=1677003&r2=1677004&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/contrib/dataimporthandler-extras/src/test-files/dihextras/solr/collection1/conf/dataimport-schema-no-unique-key.xml (original)
+++ lucene/dev/branches/branch_5x/solr/contrib/dataimporthandler-extras/src/test-files/dihextras/solr/collection1/conf/dataimport-schema-no-unique-key.xml Thu Apr 30 14:23:46 2015
@@ -77,6 +77,8 @@
     <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" positionIncrementGap="0"/>
     <fieldType name="long" class="solr.TrieLongField" precisionStep="0" positionIncrementGap="0"/>
     <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" positionIncrementGap="0"/>
+    <fieldType name="latLon" class="solr.LatLonType" subFieldType="double"/>
+
 
     <!--
      Numeric field types that index each value at various levels of precision
@@ -199,7 +201,8 @@
    <field name="title" type="string" indexed="true" stored="true"/>
    <field name="author" type="string" indexed="true" stored="true" />
    <field name="text" type="text" indexed="true" stored="true" />
-   
+   <field name="foo_i" type="int" indexed="true" stored="false" />
+   <field name="home" type="latLon" indexed="true" stored="true" />
  </fields>
  <!-- field for the QueryParser to use when an explicit fieldname is absent -->
  <defaultSearchField>text</defaultSearchField>

Modified: lucene/dev/branches/branch_5x/solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java?rev=1677004&r1=1677003&r2=1677004&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java (original)
+++ lucene/dev/branches/branch_5x/solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java Thu Apr 30 14:23:46 2015
@@ -51,6 +51,18 @@ public class TestTikaEntityProcessor ext
           "  </document>" +
           "</dataConfig>";
 
+  private String spatialConf =
+      "<dataConfig>" +
+          "  <dataSource type=\"BinFileDataSource\"/>" +
+          "  <document>" +
+          "    <entity name=\"Tika\" processor=\"TikaEntityProcessor\" url=\"" +
+          getFile("dihextras/test_jpeg.jpg").getAbsolutePath() + "\" spatialMetadataField=\"home\">" +
+          "      <field column=\"text\"/>" +
+          "     </entity>" +
+          "  </document>" +
+          "</dataConfig>";
+
+
   private String[] tests = {
       "//*[@numFound='1']"
       ,"//str[@name='author'][.='Grant Ingersoll']"
@@ -74,6 +86,10 @@ public class TestTikaEntityProcessor ext
       , "//str[@name='text'][contains(.,'class=\"classAttribute\"')]" //attributes are lower-cased
   };
 
+  private String[] testsSpatial = {
+      "//*[@numFound='1']"
+  };
+
   private String[] testsEmbedded = {
       "//*[@numFound='1']",
       "//str[@name='text'][contains(.,'When in the Course')]"
@@ -121,6 +137,16 @@ public class TestTikaEntityProcessor ext
     assertQ(req("*:*"), testsHTMLIdentity);
   }
 
+  @Test
+  public void testTikaGeoMetadata() throws Exception {
+    runFullImport(spatialConf);
+    String pt = "38.97,-77.018";
+    Double distance = 5.0d;
+    assertQ(req("q", "*:* OR foo_i:" + random().nextInt(100), "fq",
+        "{!geofilt sfield=\"home\"}\"",
+        "pt", pt, "d", String.valueOf(distance)), testsSpatial);
+  }
+
   private String getConfigHTML(String htmlMapper) {
     return
         "<dataConfig>" +