You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/03/30 13:53:54 UTC

[tika] branch branch_1x updated: TIKA-3078 -- add configurability to GeoParser

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_1x by this push:
     new 7565e15  TIKA-3078 -- add configurability to GeoParser
7565e15 is described below

commit 7565e15aeb8eb0f3cfb70045e1ca452ee32db588
Author: tallison <ta...@apache.org>
AuthorDate: Mon Mar 30 09:52:11 2020 -0400

    TIKA-3078 -- add configurability to GeoParser
---
 .../src/test/java/org/apache/tika/TikaTest.java    | 16 +++++++
 .../apache/tika/parser/geo/topic/GeoParser.java    | 49 +++++++++++++++++-----
 .../tika/parser/geo/topic/GeoParserConfig.java     |  1 +
 .../tika/parser/geo/topic/GeoParserTest.java       | 24 ++++++++---
 .../tika/parser/ocr/TesseractOCRParserTest.java    | 13 ------
 .../tika/config/TIKA-3078-geo.topic.GeoParser.xml  | 30 +++++++++++++
 6 files changed, 105 insertions(+), 28 deletions(-)

diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index ac1ef3c..efb93b7 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -43,6 +43,7 @@ import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.CompositeParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.RecursiveParserWrapper;
@@ -474,4 +475,19 @@ public abstract class TikaTest {
             }
         }
     }
+
+    public static Parser findParser(Parser parser, Class clazz) {
+        if (parser instanceof CompositeParser) {
+            for (Parser child : ((CompositeParser)parser).getAllComponentParsers()) {
+                Parser found = findParser(child, clazz);
+                if (found != null) {
+                    return found;
+                }
+            }
+        } else if (clazz.isInstance(parser)) {
+            return parser;
+        }
+        return null;
+    }
+
 }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java
index 3803d44..5ca55bf 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java
@@ -19,6 +19,7 @@ package org.apache.tika.parser.geo.topic;
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.net.MalformedURLException;
 import java.net.URISyntaxException;
 import java.net.URL;
 import java.util.ArrayList;
@@ -27,6 +28,7 @@ import java.util.List;
 import java.util.Map;
 import java.util.Set;
 
+import org.apache.tika.config.Field;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
@@ -50,7 +52,7 @@ public class GeoParser extends AbstractParser {
     private static final Set<MediaType> SUPPORTED_TYPES = 
                                     Collections.singleton(MEDIA_TYPE);
     
-    private GeoParserConfig config = new GeoParserConfig();
+    private GeoParserConfig defaultConfig = new GeoParserConfig();
     private GeoGazetteerClient gazetteerClient;
     
     private boolean initialized;
@@ -65,9 +67,9 @@ public class GeoParser extends AbstractParser {
 
     /**
      * Initializes this parser
-     * @param modelUrl the URL to NER model
+     * @param geoParserConfig config that supplies the NER model URL and is used to create the gazetteer client
      */
-    public void initialize(URL modelUrl) {
+    public void initialize(GeoParserConfig geoParserConfig) {
         try {
           if (this.modelUrl != null && this.modelUrl.toURI().equals(modelUrl.toURI())) {
               return;
@@ -76,8 +78,8 @@ public class GeoParser extends AbstractParser {
               throw new RuntimeException(e1.getMessage());
         }
         
-        this.modelUrl = modelUrl;
-        gazetteerClient = new GeoGazetteerClient(config);
+        this.modelUrl = geoParserConfig.getNerModelUrl();
+        gazetteerClient = new GeoGazetteerClient(geoParserConfig);
         
         // Check if the NER model is available, and if the
         //  lucene-geo-gazetteer is available
@@ -102,9 +104,9 @@ public class GeoParser extends AbstractParser {
 
         /*----------------configure this parser by ParseContext Object---------------------*/
 
-        this.config = context.get(GeoParserConfig.class, config);
-        initialize(this.config.getNerModelUrl());
-        if (!isAvailable()) {
+        GeoParserConfig geoParserConfig = context.get(GeoParserConfig.class, defaultConfig);
+        initialize(geoParserConfig);
+        if (!isAvailable(geoParserConfig)) {
             return;
         }
         NameEntityExtractor extractor = null;
@@ -149,10 +151,37 @@ public class GeoParser extends AbstractParser {
     	return gazetteerClient.getLocations(locationNameEntities);
     }
 
-    public boolean isAvailable() {
+    public boolean isAvailable(GeoParserConfig geoParserConfig) {
         if (!initialized) {
-            initialize(config.getNerModelUrl());
+            initialize(geoParserConfig);
         }
         return this.available;
     }
+
+    @Field
+    public void setGazetteerRestEndpoint(String gazetteerRestEndpoint) {
+        defaultConfig.setGazetteerRestEndpoint(gazetteerRestEndpoint);
+    }
+
+    /**
+     * Sets the NER model URL on this parser's default config.
+     * @param nerModelUrl url for the NER model
+     * @throws IllegalArgumentException for a malformed URL
+     */
+    @Field
+    public void setNerModelUrl(String nerModelUrl) {
+        try {
+            defaultConfig.setNerModelUrl(new URL(nerModelUrl));
+        } catch (MalformedURLException e) {
+            throw new IllegalArgumentException("malformed url "+nerModelUrl, e);
+        }
+    }
+
+    public String getGazetteerRestEndpoint() {
+        return defaultConfig.getGazetteerRestEndpoint();
+    }
+
+    public URL getNerModelUrl() {
+        return defaultConfig.getNerModelUrl();
+    }
 }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java
index b3ee246..df85479 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java
@@ -99,4 +99,5 @@ public class GeoParserConfig implements Serializable {
     public void setGazetteerRestEndpoint(String gazetteerRestEndpoint) {
 		this.gazetteerRestEndpoint = gazetteerRestEndpoint;
 	}
+
 }
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/geo/topic/GeoParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/geo/topic/GeoParserTest.java
index d642cf2..f936367 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/geo/topic/GeoParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/geo/topic/GeoParserTest.java
@@ -21,11 +21,15 @@ import static java.nio.charset.StandardCharsets.UTF_8;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertNull;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
 import org.junit.Test;
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.UnsupportedEncodingException;
+import java.net.URL;
 
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
@@ -34,12 +38,11 @@ import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.BodyContentHandler;
 import org.xml.sax.SAXException;
 
-public class GeoParserTest {
+public class GeoParserTest extends TikaTest {
 	private Parser geoparser = new GeoParser();
 
 	@Test
-	public void testFunctions() throws UnsupportedEncodingException,
-			IOException, SAXException, TikaException {
+	public void testFunctions() throws IOException, SAXException, TikaException {
 		String text = "The millennial-scale cooling trend that followed the HTM coincides with the decrease in China "
 				+ "summer insolation driven by slow changes in Earth's orbit. Despite the nearly linear forcing, the transition from the HTM to "
 				+ "the Little Ice Age (1500-1900 AD) was neither gradual nor uniform. To understand how feedbacks and perturbations result in rapid changes, "
@@ -55,7 +58,7 @@ public class GeoParserTest {
 
 		InputStream s = new ByteArrayInputStream(text.getBytes(UTF_8));
 		/* if it's not available no tests to run */
-		if (!((GeoParser) geoparser).isAvailable())
+		if (!((GeoParser) geoparser).isAvailable(config))
 			return;
 
 		geoparser.parse(s, new BodyContentHandler(), metadata, context);
@@ -73,7 +76,7 @@ public class GeoParserTest {
 	}
 
 	@Test
-	public void testNulls() throws UnsupportedEncodingException, IOException,
+	public void testNulls() throws IOException,
 			SAXException, TikaException {
 		String text = "";
 
@@ -88,4 +91,15 @@ public class GeoParserTest {
 		assertNull(metadata.get("Geographic_LATITUDE"));
 
 	}
+
+	@Test
+	public void testConfig() throws Exception {
+		TikaConfig config = new TikaConfig(getResourceAsStream(
+				"/org/apache/tika/config/TIKA-3078-geo.topic.GeoParser.xml"));
+		Parser p = config.getParser();
+		GeoParser geoParser = (GeoParser)findParser(p, org.apache.tika.parser.geo.topic.GeoParser.class);
+		assertNotNull(geoParser);
+		assertEquals("http://localhost/gazetteerRestEndpoint", geoParser.getGazetteerRestEndpoint());
+		assertEquals(new URL("file:/ner/model/url"), geoParser.getNerModelUrl());
+	}
 }
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index b9b9504..12419ee 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -319,17 +319,4 @@ public class TesseractOCRParserTest extends TikaTest {
         assertContains("myspecial", tesseractOCRConfig.getTesseractPath());
     }
 
-    private Parser findParser(Parser parser, Class clazz) {
-        if (parser instanceof CompositeParser) {
-            for (Parser child : ((CompositeParser)parser).getAllComponentParsers()) {
-                Parser found = findParser(child, clazz);
-                if (found != null) {
-                    return found;
-                }
-            }
-        } else if (clazz.isInstance(parser)) {
-            return parser;
-        }
-        return null;
-    }
 }
diff --git a/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-3078-geo.topic.GeoParser.xml b/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-3078-geo.topic.GeoParser.xml
new file mode 100644
index 0000000..3b9df0b
--- /dev/null
+++ b/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-3078-geo.topic.GeoParser.xml
@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+  <parsers>
+    <parser class="org.apache.tika.parser.DefaultParser">
+      <parser-exclude class="org.apache.tika.parser.geo.topic.GeoParser"/>
+    </parser>
+    <parser class="org.apache.tika.parser.geo.topic.GeoParser">
+      <params>
+        <param name="gazetteerRestEndpoint" type="string">http://localhost/gazetteerRestEndpoint</param>
+        <param name="nerModelUrl" type="string">file:/ner/model/url</param>
+      </params>
+    </parser>
+  </parsers>
+</properties>