You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2015/11/15 20:53:14 UTC
svn commit: r1714492 - in
/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic:
GeoParser.java GeoParserConfig.java NameEntityExtractor.java
Author: nick
Date: Sun Nov 15 19:53:14 2015
New Revision: 1714492
URL: http://svn.apache.org/viewvc?rev=1714492&view=rev
Log:
TIKA-1791 GeoParser fix for models in a jar file, from Thamme Gowda N. This closes #63 from GitHub
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java?rev=1714492&r1=1714491&r2=1714492&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java Sun Nov 15 19:53:14 2015
@@ -20,6 +20,7 @@ package org.apache.tika.parser.geo.topic
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
+import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
@@ -45,36 +46,64 @@ import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
public class GeoParser extends AbstractParser {
-
private static final long serialVersionUID = -2241391757440215491L;
- private static final MediaType MEDIA_TYPE = MediaType
- .application("geotopic");
- private static final Set<MediaType> SUPPORTED_TYPES = Collections
- .singleton(MEDIA_TYPE);
+ private static final Logger LOG = Logger.getLogger(GeoParser.class.getName());
+ private static final MediaType MEDIA_TYPE =
+ MediaType.application("geotopic");
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.singleton(MEDIA_TYPE);
private GeoParserConfig config = new GeoParserConfig();
- private static final Logger LOG = Logger.getLogger(GeoParser.class
- .getName());
+
+ private boolean initialized;
+ private URL modelUrl;
+ private NameEntityExtractor extractor;
+ private boolean available;
@Override
public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
return SUPPORTED_TYPES;
}
+ /**
+ * Initializes this parser
+ * @param modelUrl the URL of the NER model; if null, the parser is marked unavailable
+ */
+ public void initialize(URL modelUrl) {
+
+ if (this.modelUrl != null && this.modelUrl.equals(modelUrl)) {
+ //previously initialized for the same URL
+ return;
+ }
+ this.modelUrl = modelUrl;
+ //if NER model is available and lucene-geo-gazetteer is available
+ this.available = modelUrl != null &&
+ ExternalParser.check(new String[] { "lucene-geo-gazetteer", "--help" }, -1);
+ if (this.available) {
+ try {
+ this.extractor = new NameEntityExtractor(modelUrl);
+ } catch (Exception e) {
+ e.printStackTrace();
+ this.available = false;
+ }
+ }
+ initialized = true;
+
+ }
+
@Override
public void parse(InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context) throws IOException,
+ Metadata metadata, ParseContext context) throws IOException,
SAXException, TikaException {
/*----------------configure this parser by ParseContext Object---------------------*/
- config = context.get(GeoParserConfig.class, config);
- String nerModelPath = config.getNERPath();
+ this.config = context.get(GeoParserConfig.class, config);
+ initialize(this.config.getNerModelUrl());
if (!isAvailable()) {
return;
}
/*----------------get locationNameEntities and best nameEntity for the input stream---------------------*/
- NameEntityExtractor extractor = new NameEntityExtractor(nerModelPath);
extractor.getAllNameEntitiesfromInput(stream);
extractor.getBestNameEntity();
ArrayList<String> locationNameEntities = extractor.locationNameEntities;
@@ -146,10 +175,10 @@ public class GeoParser extends AbstractP
}
public boolean isAvailable() {
- return ExternalParser.check(new String[] { "lucene-geo-gazetteer",
- "--help" }, -1)
- && config.getNERPath() != null
- && !config.getNERPath().equals("");
+ if (!initialized) {
+ initialize(config.getNerModelUrl());
+ }
+ return this.available;
}
}
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java?rev=1714492&r1=1714491&r2=1714492&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java Sun Nov 15 19:53:14 2015
@@ -19,22 +19,16 @@ package org.apache.tika.parser.geo.topic
import java.io.File;
import java.io.Serializable;
-import java.net.URISyntaxException;
+import java.net.MalformedURLException;
+import java.net.URL;
public class GeoParserConfig implements Serializable {
- private static final long serialVersionUID = 1L;
- private String nerModelPath = null;
+ private static final long serialVersionUID = 2L;
+ private URL nerModelUrl = null;
public GeoParserConfig() {
- try {
- if (GeoParserConfig.class.getResource("en-ner-location.bin") != null) {
- this.nerModelPath = new File(GeoParserConfig.class.getResource(
- "en-ner-location.bin").toURI()).getAbsolutePath();
- }
- } catch (URISyntaxException e) {
- e.printStackTrace();
- }
+ this.nerModelUrl = GeoParserConfig.class.getResource("en-ner-location.bin");
}
public void setNERModelPath(String path) {
@@ -44,11 +38,19 @@ public class GeoParserConfig implements
if (file.isDirectory() || !file.exists()) {
return;
}
- nerModelPath = path;
+ try {
+ this.nerModelUrl = file.toURI().toURL();
+ } catch (MalformedURLException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ public void setNerModelUrl(URL url) {
+ this.nerModelUrl = url;
}
- public String getNERPath() {
- return nerModelPath;
+ public URL getNerModelUrl() {
+ return nerModelUrl;
}
}
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java?rev=1714492&r1=1714491&r2=1714492&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java Sun Nov 15 19:53:14 2015
@@ -17,9 +17,10 @@
package org.apache.tika.parser.geo.topic;
-import java.io.FileInputStream;
+
import java.io.IOException;
import java.io.InputStream;
+import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
@@ -30,25 +31,24 @@ import java.util.Map;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
-import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.Span;
-
import org.apache.commons.io.IOUtils;
import static java.nio.charset.StandardCharsets.UTF_8;
public class NameEntityExtractor {
- private String nerModelPath = null;
+
ArrayList<String> locationNameEntities;
String bestNameEntity;
private HashMap<String, Integer> tf;
+ private final NameFinderME nameFinder;
- public NameEntityExtractor(String nerModelpath) {
+ public NameEntityExtractor(URL modelUrl) throws IOException {
this.locationNameEntities = new ArrayList<String>();
this.bestNameEntity = null;
- this.nerModelPath = nerModelpath;
- tf = new HashMap<String, Integer>();
-
+ TokenNameFinderModel model = new TokenNameFinderModel(modelUrl);
+ this.nameFinder = new NameFinderME(model);
+ this.tf = new HashMap<String, Integer>();
}
/*
@@ -60,18 +60,20 @@ public class NameEntityExtractor {
*/
public void getAllNameEntitiesfromInput(InputStream stream)
- throws InvalidFormatException, IOException {
+ throws IOException {
- InputStream modelIn = new FileInputStream(nerModelPath);
- TokenNameFinderModel model = new TokenNameFinderModel(modelIn);
- NameFinderME nameFinder = new NameFinderME(model);
- String[] in = IOUtils.toString(stream, UTF_8).split(" ");
- Span nameE[] = nameFinder.find(in);
+ String[] in = IOUtils.toString(stream, UTF_8).split(" ");
+ Span nameE[];
+ //name finder is not thread safe https://opennlp.apache.org/documentation/1.5.2-incubating/manual/opennlp.html#tools.namefind
+ synchronized (nameFinder) {
+ nameE = nameFinder.find(in);
+ //the same name finder is reused, so clear adaptive data
+ nameFinder.clearAdaptiveData();
+ }
String spanNames = Arrays.toString(Span.spansToStrings(nameE, in));
spanNames = spanNames.substring(1, spanNames.length() - 1);
- modelIn.close();
String[] tmp = spanNames.split(",");
for (String name : tmp) {
@@ -79,6 +81,7 @@ public class NameEntityExtractor {
this.locationNameEntities.add(name);
}
+
}
/*
@@ -123,5 +126,4 @@ public class NameEntityExtractor {
}
}
}
-
}