You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/03/30 13:53:54 UTC
[tika] branch branch_1x updated: TIKA-3078 -- add configurability to GeoParser
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_1x by this push:
new 7565e15 TIKA-3078 -- add configurability to GeoParser
7565e15 is described below
commit 7565e15aeb8eb0f3cfb70045e1ca452ee32db588
Author: tallison <ta...@apache.org>
AuthorDate: Mon Mar 30 09:52:11 2020 -0400
TIKA-3078 -- add configurability to GeoParser
---
.../src/test/java/org/apache/tika/TikaTest.java | 16 +++++++
.../apache/tika/parser/geo/topic/GeoParser.java | 49 +++++++++++++++++-----
.../tika/parser/geo/topic/GeoParserConfig.java | 1 +
.../tika/parser/geo/topic/GeoParserTest.java | 24 ++++++++---
.../tika/parser/ocr/TesseractOCRParserTest.java | 13 ------
.../tika/config/TIKA-3078-geo.topic.GeoParser.xml | 30 +++++++++++++
6 files changed, 105 insertions(+), 28 deletions(-)
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index ac1ef3c..efb93b7 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -43,6 +43,7 @@ import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.RecursiveParserWrapper;
@@ -474,4 +475,19 @@ public abstract class TikaTest {
}
}
}
+
+ public static Parser findParser(Parser parser, Class clazz) {
+ if (parser instanceof CompositeParser) {
+ for (Parser child : ((CompositeParser)parser).getAllComponentParsers()) {
+ Parser found = findParser(child, clazz);
+ if (found != null) {
+ return found;
+ }
+ }
+ } else if (clazz.isInstance(parser)) {
+ return parser;
+ }
+ return null;
+ }
+
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java
index 3803d44..5ca55bf 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java
@@ -19,6 +19,7 @@ package org.apache.tika.parser.geo.topic;
import java.io.IOException;
import java.io.InputStream;
+import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
@@ -27,6 +28,7 @@ import java.util.List;
import java.util.Map;
import java.util.Set;
+import org.apache.tika.config.Field;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
@@ -50,7 +52,7 @@ public class GeoParser extends AbstractParser {
private static final Set<MediaType> SUPPORTED_TYPES =
Collections.singleton(MEDIA_TYPE);
- private GeoParserConfig config = new GeoParserConfig();
+ private GeoParserConfig defaultConfig = new GeoParserConfig();
private GeoGazetteerClient gazetteerClient;
private boolean initialized;
@@ -65,9 +67,9 @@ public class GeoParser extends AbstractParser {
/**
* Initializes this parser
- * @param modelUrl the URL to NER model
+ * @param geoParserConfig config to load the url model from and set the gazetteer client
*/
- public void initialize(URL modelUrl) {
+ public void initialize(GeoParserConfig geoParserConfig) {
try {
if (this.modelUrl != null && this.modelUrl.toURI().equals(modelUrl.toURI())) {
return;
@@ -76,8 +78,8 @@ public class GeoParser extends AbstractParser {
throw new RuntimeException(e1.getMessage());
}
- this.modelUrl = modelUrl;
- gazetteerClient = new GeoGazetteerClient(config);
+ this.modelUrl = geoParserConfig.getNerModelUrl();
+ gazetteerClient = new GeoGazetteerClient(geoParserConfig);
// Check if the NER model is available, and if the
// lucene-geo-gazetteer is available
@@ -102,9 +104,9 @@ public class GeoParser extends AbstractParser {
/*----------------configure this parser by ParseContext Object---------------------*/
- this.config = context.get(GeoParserConfig.class, config);
- initialize(this.config.getNerModelUrl());
- if (!isAvailable()) {
+ GeoParserConfig geoParserConfig = context.get(GeoParserConfig.class, defaultConfig);
+ initialize(geoParserConfig);
+ if (!isAvailable(geoParserConfig)) {
return;
}
NameEntityExtractor extractor = null;
@@ -149,10 +151,37 @@ public class GeoParser extends AbstractParser {
return gazetteerClient.getLocations(locationNameEntities);
}
- public boolean isAvailable() {
+ public boolean isAvailable(GeoParserConfig geoParserConfig) {
if (!initialized) {
- initialize(config.getNerModelUrl());
+ initialize(geoParserConfig);
}
return this.available;
}
+
+ @Field
+ public void setGazetteerRestEndpoint(String gazetteerRestEndpoint) {
+ defaultConfig.setGazetteerRestEndpoint(gazetteerRestEndpoint);
+ }
+
+ /**
+ *
+ * @param nerModelUrl url for the NER model
+ * @throws IllegalArgumentException for a malformed URL
+ */
+ @Field
+ public void setNerModelUrl(String nerModelUrl) {
+ try {
+ defaultConfig.setNerModelUrl(new URL(nerModelUrl));
+ } catch (MalformedURLException e) {
+ throw new IllegalArgumentException("malformed url "+nerModelUrl, e);
+ }
+ }
+
+ public String getGazetteerRestEndpoint() {
+ return defaultConfig.getGazetteerRestEndpoint();
+ }
+
+ public URL getNerModelUrl() {
+ return defaultConfig.getNerModelUrl();
+ }
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java
index b3ee246..df85479 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java
@@ -99,4 +99,5 @@ public class GeoParserConfig implements Serializable {
public void setGazetteerRestEndpoint(String gazetteerRestEndpoint) {
this.gazetteerRestEndpoint = gazetteerRestEndpoint;
}
+
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/geo/topic/GeoParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/geo/topic/GeoParserTest.java
index d642cf2..f936367 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/geo/topic/GeoParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/geo/topic/GeoParserTest.java
@@ -21,11 +21,15 @@ import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
import org.junit.Test;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
+import java.net.URL;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
@@ -34,12 +38,11 @@ import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.SAXException;
-public class GeoParserTest {
+public class GeoParserTest extends TikaTest {
private Parser geoparser = new GeoParser();
@Test
- public void testFunctions() throws UnsupportedEncodingException,
- IOException, SAXException, TikaException {
+ public void testFunctions() throws IOException, SAXException, TikaException {
String text = "The millennial-scale cooling trend that followed the HTM coincides with the decrease in China "
+ "summer insolation driven by slow changes in Earth's orbit. Despite the nearly linear forcing, the transition from the HTM to "
+ "the Little Ice Age (1500-1900 AD) was neither gradual nor uniform. To understand how feedbacks and perturbations result in rapid changes, "
@@ -55,7 +58,7 @@ public class GeoParserTest {
InputStream s = new ByteArrayInputStream(text.getBytes(UTF_8));
/* if it's not available no tests to run */
- if (!((GeoParser) geoparser).isAvailable())
+ if (!((GeoParser) geoparser).isAvailable(config))
return;
geoparser.parse(s, new BodyContentHandler(), metadata, context);
@@ -73,7 +76,7 @@ public class GeoParserTest {
}
@Test
- public void testNulls() throws UnsupportedEncodingException, IOException,
+ public void testNulls() throws IOException,
SAXException, TikaException {
String text = "";
@@ -88,4 +91,15 @@ public class GeoParserTest {
assertNull(metadata.get("Geographic_LATITUDE"));
}
+
+ @Test
+ public void testConfig() throws Exception {
+ TikaConfig config = new TikaConfig(getResourceAsStream(
+ "/org/apache/tika/config/TIKA-3078-geo.topic.GeoParser.xml"));
+ Parser p = config.getParser();
+ GeoParser geoParser = (GeoParser)findParser(p, org.apache.tika.parser.geo.topic.GeoParser.class);
+ assertNotNull(geoParser);
+ assertEquals("http://localhost/gazetteerRestEndpoint", geoParser.getGazetteerRestEndpoint());
+ assertEquals(new URL("file:/ner/model/url"), geoParser.getNerModelUrl());
+ }
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index b9b9504..12419ee 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -319,17 +319,4 @@ public class TesseractOCRParserTest extends TikaTest {
assertContains("myspecial", tesseractOCRConfig.getTesseractPath());
}
- private Parser findParser(Parser parser, Class clazz) {
- if (parser instanceof CompositeParser) {
- for (Parser child : ((CompositeParser)parser).getAllComponentParsers()) {
- Parser found = findParser(child, clazz);
- if (found != null) {
- return found;
- }
- }
- } else if (clazz.isInstance(parser)) {
- return parser;
- }
- return null;
- }
}
diff --git a/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-3078-geo.topic.GeoParser.xml b/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-3078-geo.topic.GeoParser.xml
new file mode 100644
index 0000000..3b9df0b
--- /dev/null
+++ b/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-3078-geo.topic.GeoParser.xml
@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.DefaultParser">
+ <parser-exclude class="org.apache.tika.parser.geo.topic.GeoParser"/>
+ </parser>
+ <parser class="org.apache.tika.parser.geo.topic.GeoParser">
+ <params>
+ <param name="gazetteerRestEndpoint" type="string">http://localhost/gazetteerRestEndpoint</param>
+ <param name="nerModelUrl" type="string">file:/ner/model/url</param>
+ </params>
+ </parser>
+ </parsers>
+</properties>