You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by no...@apache.org on 2015/04/30 16:23:46 UTC
svn commit: r1677004 - in /lucene/dev/branches/branch_5x: ./ solr/
solr/contrib/
solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/
solr/contrib/dataimporthandler-extras/src/test-files/dihextras/
solr/contrib/dataimporth...
Author: noble
Date: Thu Apr 30 14:23:46 2015
New Revision: 1677004
URL: http://svn.apache.org/r1677004
Log:
SOLR-7231: DIH-TikaEntityprocessor, create lat-lon field from Metadata
Added:
lucene/dev/branches/branch_5x/solr/contrib/dataimporthandler-extras/src/test-files/dihextras/test_jpeg.jpg
- copied unchanged from r1677001, lucene/dev/trunk/solr/contrib/dataimporthandler-extras/src/test-files/dihextras/test_jpeg.jpg
Modified:
lucene/dev/branches/branch_5x/ (props changed)
lucene/dev/branches/branch_5x/solr/ (props changed)
lucene/dev/branches/branch_5x/solr/CHANGES.txt (contents, props changed)
lucene/dev/branches/branch_5x/solr/contrib/ (props changed)
lucene/dev/branches/branch_5x/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/TikaEntityProcessor.java
lucene/dev/branches/branch_5x/solr/contrib/dataimporthandler-extras/src/test-files/dihextras/solr/collection1/conf/dataimport-schema-no-unique-key.xml
lucene/dev/branches/branch_5x/solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java
Modified: lucene/dev/branches/branch_5x/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/CHANGES.txt?rev=1677004&r1=1677003&r2=1677004&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/CHANGES.txt (original)
+++ lucene/dev/branches/branch_5x/solr/CHANGES.txt Thu Apr 30 14:23:46 2015
@@ -103,6 +103,9 @@ New Features
& json.facet={ colors:{type:terms, field:color, excludeTags=COLOR} }
(yonik)
+* SOLR-7231: DIH-TikaEntityprocessor, create lat-lon field from Metadata
+ (Tim Allison via Noble Paul)
+
Bug Fixes
----------------------
Modified: lucene/dev/branches/branch_5x/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/TikaEntityProcessor.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/TikaEntityProcessor.java?rev=1677004&r1=1677003&r2=1677004&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/TikaEntityProcessor.java (original)
+++ lucene/dev/branches/branch_5x/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/TikaEntityProcessor.java Thu Apr 30 14:23:46 2015
@@ -45,6 +45,7 @@ import java.io.InputStream;
import java.io.StringWriter;
import java.io.Writer;
import java.util.HashMap;
+import java.util.Locale;
import java.util.Map;
import static org.apache.solr.handler.dataimport.DataImportHandlerException.SEVERE;
@@ -55,6 +56,10 @@ import static org.apache.solr.handler.da
* <p>An implementation of {@link EntityProcessor} which reads data from rich docs
* using <a href="http://tika.apache.org/">Apache Tika</a>
*
+ * <p>To index latitude/longitude data extracted from a file's metadata,
+ * set the <code>spatialMetadataField</code> attribute on the entity to the
+ * name of the geo field that should receive the value; it is written
+ * as a single <code>"lat,lon"</code> string.
*
* @since solr 3.1
*/
@@ -67,6 +72,7 @@ public class TikaEntityProcessor extends
private String parser;
static final String AUTO_PARSER = "org.apache.tika.parser.AutoDetectParser";
private String htmlMapper;
+ private String spatialMetadataField;
@Override
public void init(Context context) {
@@ -113,6 +119,8 @@ public class TikaEntityProcessor extends
if(parser == null) {
parser = AUTO_PARSER;
}
+
+ spatialMetadataField = context.getResolvedEntityAttribute("spatialMetadataField");
}
@Override
@@ -167,10 +175,20 @@ public class TikaEntityProcessor extends
if (s != null) row.put(col, s);
}
if(!"none".equals(format) ) row.put("text", sw.toString());
+ tryToAddLatLon(metadata, row);
done = true;
return row;
}
+ /**
+  * Adds a combined "lat,lon" value to the row under the configured
+  * {@code spatialMetadataField}, taken from the latitude and longitude
+  * entries of the given metadata.
+  *
+  * <p>Nothing is added when no spatial field is configured, or when either
+  * coordinate is missing from the metadata.
+  *
+  * @param metadata metadata read for the current document
+  * @param row      the row being built; receives the spatial value if available
+  */
+ private void tryToAddLatLon(Metadata metadata, Map<String, Object> row) {
+   if (spatialMetadataField != null) {
+     String lat = metadata.get(Metadata.LATITUDE);
+     String lon = metadata.get(Metadata.LONGITUDE);
+     boolean bothPresent = lat != null && lon != null;
+     if (bothPresent) {
+       String latLon = String.format(Locale.ROOT, "%s,%s", lat, lon);
+       row.put(spatialMetadataField, latLon);
+     }
+   }
+ }
+
private static ContentHandler getHtmlHandler(Writer writer)
throws TransformerConfigurationException {
SAXTransformerFactory factory = (SAXTransformerFactory)
Modified: lucene/dev/branches/branch_5x/solr/contrib/dataimporthandler-extras/src/test-files/dihextras/solr/collection1/conf/dataimport-schema-no-unique-key.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/contrib/dataimporthandler-extras/src/test-files/dihextras/solr/collection1/conf/dataimport-schema-no-unique-key.xml?rev=1677004&r1=1677003&r2=1677004&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/contrib/dataimporthandler-extras/src/test-files/dihextras/solr/collection1/conf/dataimport-schema-no-unique-key.xml (original)
+++ lucene/dev/branches/branch_5x/solr/contrib/dataimporthandler-extras/src/test-files/dihextras/solr/collection1/conf/dataimport-schema-no-unique-key.xml Thu Apr 30 14:23:46 2015
@@ -77,6 +77,8 @@
<fieldType name="float" class="solr.TrieFloatField" precisionStep="0" positionIncrementGap="0"/>
<fieldType name="long" class="solr.TrieLongField" precisionStep="0" positionIncrementGap="0"/>
<fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" positionIncrementGap="0"/>
+ <fieldType name="latLon" class="solr.LatLonType" subFieldType="double"/>
+
<!--
Numeric field types that index each value at various levels of precision
@@ -199,7 +201,8 @@
<field name="title" type="string" indexed="true" stored="true"/>
<field name="author" type="string" indexed="true" stored="true" />
<field name="text" type="text" indexed="true" stored="true" />
-
+ <field name="foo_i" type="int" indexed="true" stored="false" />
+ <field name="home" type="latLon" indexed="true" stored="true" />
</fields>
<!-- field for the QueryParser to use when an explicit fieldname is absent -->
<defaultSearchField>text</defaultSearchField>
Modified: lucene/dev/branches/branch_5x/solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java?rev=1677004&r1=1677003&r2=1677004&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java (original)
+++ lucene/dev/branches/branch_5x/solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java Thu Apr 30 14:23:46 2015
@@ -51,6 +51,18 @@ public class TestTikaEntityProcessor ext
" </document>" +
"</dataConfig>";
+ private String spatialConf = // DIH config: TikaEntityProcessor over dihextras/test_jpeg.jpg with spatialMetadataField="home"
+ "<dataConfig>" +
+ " <dataSource type=\"BinFileDataSource\"/>" +
+ " <document>" +
+ " <entity name=\"Tika\" processor=\"TikaEntityProcessor\" url=\"" +
+ getFile("dihextras/test_jpeg.jpg").getAbsolutePath() + "\" spatialMetadataField=\"home\">" +
+ " <field column=\"text\"/>" +
+ " </entity>" +
+ " </document>" +
+ "</dataConfig>";
+
+
private String[] tests = {
"//*[@numFound='1']"
,"//str[@name='author'][.='Grant Ingersoll']"
@@ -74,6 +86,10 @@ public class TestTikaEntityProcessor ext
, "//str[@name='text'][contains(.,'class=\"classAttribute\"')]" //attributes are lower-cased
};
+ private String[] testsSpatial = { // XPath checks passed to assertQ in testTikaGeoMetadata
+ "//*[@numFound='1']" // exactly one document must match
+ };
+
private String[] testsEmbedded = {
"//*[@numFound='1']",
"//str[@name='text'][contains(.,'When in the Course')]"
@@ -121,6 +137,16 @@ public class TestTikaEntityProcessor ext
assertQ(req("*:*"), testsHTMLIdentity);
}
+ @Test
+ public void testTikaGeoMetadata() throws Exception {
+ runFullImport(spatialConf);
+ String pt = "38.97,-77.018";
+ Double distance = 5.0d;
+ assertQ(req("q", "*:* OR foo_i:" + random().nextInt(100), "fq",
+ "{!geofilt sfield=\"home\"}\"",
+ "pt", pt, "d", String.valueOf(distance)), testsSpatial);
+ }
+
private String getConfigHTML(String htmlMapper) {
return
"<dataConfig>" +