You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2015/11/15 20:53:14 UTC

svn commit: r1714492 - in /tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic: GeoParser.java GeoParserConfig.java NameEntityExtractor.java

Author: nick
Date: Sun Nov 15 19:53:14 2015
New Revision: 1714492

URL: http://svn.apache.org/viewvc?rev=1714492&view=rev
Log:
TIKA-1791 GeoParser fix for models in a jar file, from Thamme Gowda N. This closes #63 from GitHub

Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java?rev=1714492&r1=1714491&r2=1714492&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java Sun Nov 15 19:53:14 2015
@@ -20,6 +20,7 @@ package org.apache.tika.parser.geo.topic
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.net.URL;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.HashMap;
@@ -45,36 +46,64 @@ import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
 public class GeoParser extends AbstractParser {
-
 	private static final long serialVersionUID = -2241391757440215491L;
-	private static final MediaType MEDIA_TYPE = MediaType
-			.application("geotopic");
-	private static final Set<MediaType> SUPPORTED_TYPES = Collections
-			.singleton(MEDIA_TYPE);
+        private static final Logger LOG = Logger.getLogger(GeoParser.class.getName());
+	private static final MediaType MEDIA_TYPE = 
+	                            MediaType.application("geotopic");
+	private static final Set<MediaType> SUPPORTED_TYPES = 
+	                            Collections.singleton(MEDIA_TYPE);
 	private GeoParserConfig config = new GeoParserConfig();
-	private static final Logger LOG = Logger.getLogger(GeoParser.class
-			.getName());
+
+	private boolean initialized;
+	private URL modelUrl;
+	private NameEntityExtractor extractor;
+	private boolean available;
 
 	@Override
 	public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
 		return SUPPORTED_TYPES;
 	}
 
+	/**
+	 * Initializes this parser
+	 * @param modelUrl the URL to NER model
+	 */
+	public void initialize(URL modelUrl) {
+
+		if (this.modelUrl != null && this.modelUrl.equals(modelUrl)) {
+			//previously initialized for the same URL
+			return;
+		}
+		this.modelUrl = modelUrl;
+		//if NER model is available and lucene-geo-gazetteer is available
+		this.available = modelUrl != null &&
+				ExternalParser.check(new String[] { "lucene-geo-gazetteer", "--help" }, -1);
+		if (this.available) {
+			try {
+				this.extractor = new NameEntityExtractor(modelUrl);
+			} catch (Exception e) {
+				e.printStackTrace();
+				this.available = false;
+			}
+		}
+		initialized = true;
+
+	}
+
 	@Override
 	public void parse(InputStream stream, ContentHandler handler,
-			Metadata metadata, ParseContext context) throws IOException,
+					  Metadata metadata, ParseContext context) throws IOException,
 			SAXException, TikaException {
 
 		/*----------------configure this parser by ParseContext Object---------------------*/
-		config = context.get(GeoParserConfig.class, config);
-		String nerModelPath = config.getNERPath();
 
+		this.config = context.get(GeoParserConfig.class, config);
+		initialize(this.config.getNerModelUrl());
 		if (!isAvailable()) {
 			return;
 		}
 
 		/*----------------get locationNameEntities and best nameEntity for the input stream---------------------*/
-		NameEntityExtractor extractor = new NameEntityExtractor(nerModelPath);
 		extractor.getAllNameEntitiesfromInput(stream);
 		extractor.getBestNameEntity();
 		ArrayList<String> locationNameEntities = extractor.locationNameEntities;
@@ -146,10 +175,10 @@ public class GeoParser extends AbstractP
 	}
 
 	public boolean isAvailable() {
-		return ExternalParser.check(new String[] { "lucene-geo-gazetteer",
-				"--help" }, -1)
-				&& config.getNERPath() != null
-				&& !config.getNERPath().equals("");
+		if (!initialized) {
+			initialize(config.getNerModelUrl());
+		}
+		return this.available;
 	}
 
 }

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java?rev=1714492&r1=1714491&r2=1714492&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java Sun Nov 15 19:53:14 2015
@@ -19,22 +19,16 @@ package org.apache.tika.parser.geo.topic
 
 import java.io.File;
 import java.io.Serializable;
-import java.net.URISyntaxException;
+import java.net.MalformedURLException;
+import java.net.URL;
 
 public class GeoParserConfig implements Serializable {
 
-	private static final long serialVersionUID = 1L;
-	private String nerModelPath = null;
+	private static final long serialVersionUID = 2L;
+	private URL nerModelUrl = null;
 
 	public GeoParserConfig() {
-		try {
-			if (GeoParserConfig.class.getResource("en-ner-location.bin") != null) {
-				this.nerModelPath = new File(GeoParserConfig.class.getResource(
-						"en-ner-location.bin").toURI()).getAbsolutePath();
-			}
-		} catch (URISyntaxException e) {
-			e.printStackTrace();
-		}
+		this.nerModelUrl = GeoParserConfig.class.getResource("en-ner-location.bin");
 	}
 
 	public void setNERModelPath(String path) {
@@ -44,11 +38,19 @@ public class GeoParserConfig implements
 		if (file.isDirectory() || !file.exists()) {
 			return;
 		}
-		nerModelPath = path;
+		try {
+			this.nerModelUrl = file.toURI().toURL();
+		} catch (MalformedURLException e) {
+			throw new RuntimeException(e);
+		}
+	}
+
+	public void setNerModelUrl(URL url) {
+		this.nerModelUrl = url;
 	}
 
-	public String getNERPath() {
-		return nerModelPath;
+	public URL getNerModelUrl() {
+		return nerModelUrl;
 	}
 
 }

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java?rev=1714492&r1=1714491&r2=1714492&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java Sun Nov 15 19:53:14 2015
@@ -17,9 +17,10 @@
 
 package org.apache.tika.parser.geo.topic;
 
-import java.io.FileInputStream;
+
 import java.io.IOException;
 import java.io.InputStream;
+import java.net.URL;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
@@ -30,25 +31,24 @@ import java.util.Map;
 
 import opennlp.tools.namefind.NameFinderME;
 import opennlp.tools.namefind.TokenNameFinderModel;
-import opennlp.tools.util.InvalidFormatException;
 import opennlp.tools.util.Span;
-
 import org.apache.commons.io.IOUtils;
 
 import static java.nio.charset.StandardCharsets.UTF_8;
 
 public class NameEntityExtractor {
-	private String nerModelPath = null;
+
 	ArrayList<String> locationNameEntities;
 	String bestNameEntity;
 	private HashMap<String, Integer> tf;
+	private final NameFinderME nameFinder;
 
-	public NameEntityExtractor(String nerModelpath) {
+	public NameEntityExtractor(URL modelUrl) throws IOException {
 		this.locationNameEntities = new ArrayList<String>();
 		this.bestNameEntity = null;
-		this.nerModelPath = nerModelpath;
-		tf = new HashMap<String, Integer>();
-
+		TokenNameFinderModel model = new TokenNameFinderModel(modelUrl);
+		this.nameFinder = new NameFinderME(model);
+		this.tf = new HashMap<String, Integer>();
 	}
 
 	/*
@@ -60,18 +60,20 @@ public class NameEntityExtractor {
 	 */
 
 	public void getAllNameEntitiesfromInput(InputStream stream)
-			throws InvalidFormatException, IOException {
+			throws IOException {
 
-		InputStream modelIn = new FileInputStream(nerModelPath);
-		TokenNameFinderModel model = new TokenNameFinderModel(modelIn);
-		NameFinderME nameFinder = new NameFinderME(model);
-		String[] in = IOUtils.toString(stream, UTF_8).split(" ");
 
-		Span nameE[] = nameFinder.find(in);
+		String[] in = IOUtils.toString(stream, UTF_8).split(" ");
+		Span nameE[];
+		//name finder is not thread safe https://opennlp.apache.org/documentation/1.5.2-incubating/manual/opennlp.html#tools.namefind
+		synchronized (nameFinder) {
+			nameE = nameFinder.find(in);
+			//the same name finder is reused, so clear adaptive data
+			nameFinder.clearAdaptiveData();
+		}
 
 		String spanNames = Arrays.toString(Span.spansToStrings(nameE, in));
 		spanNames = spanNames.substring(1, spanNames.length() - 1);
-		modelIn.close();
 		String[] tmp = spanNames.split(",");
 
 		for (String name : tmp) {
@@ -79,6 +81,7 @@ public class NameEntityExtractor {
 			this.locationNameEntities.add(name);
 		}
 
+
 	}
 
 	/*
@@ -123,5 +126,4 @@ public class NameEntityExtractor {
 			}
 		}
 	}
-
 }