You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2015/11/15 20:56:25 UTC
svn commit: r1714493 -
/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java
Author: nick
Date: Sun Nov 15 19:56:25 2015
New Revision: 1714493
URL: http://svn.apache.org/viewvc?rev=1714493&view=rev
Log:
Fix inconsistent whitespace
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java?rev=1714493&r1=1714492&r2=1714493&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java Sun Nov 15 19:56:25 2015
@@ -46,139 +46,139 @@ import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
public class GeoParser extends AbstractParser {
- private static final long serialVersionUID = -2241391757440215491L;
- private static final Logger LOG = Logger.getLogger(GeoParser.class.getName());
- private static final MediaType MEDIA_TYPE =
- MediaType.application("geotopic");
- private static final Set<MediaType> SUPPORTED_TYPES =
- Collections.singleton(MEDIA_TYPE);
- private GeoParserConfig config = new GeoParserConfig();
-
- private boolean initialized;
- private URL modelUrl;
- private NameEntityExtractor extractor;
- private boolean available;
-
- @Override
- public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
- return SUPPORTED_TYPES;
- }
-
- /**
- * Initializes this parser
- * @param modelUrl the URL to NER model
- */
- public void initialize(URL modelUrl) {
-
- if (this.modelUrl != null && this.modelUrl.equals(modelUrl)) {
- //previously initialized for the same URL
- return;
- }
- this.modelUrl = modelUrl;
- //if NER model is available and lucene-geo-gazetteer is available
- this.available = modelUrl != null &&
- ExternalParser.check(new String[] { "lucene-geo-gazetteer", "--help" }, -1);
- if (this.available) {
- try {
- this.extractor = new NameEntityExtractor(modelUrl);
- } catch (Exception e) {
- e.printStackTrace();
- this.available = false;
- }
- }
- initialized = true;
-
- }
-
- @Override
- public void parse(InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context) throws IOException,
- SAXException, TikaException {
-
- /*----------------configure this parser by ParseContext Object---------------------*/
-
- this.config = context.get(GeoParserConfig.class, config);
- initialize(this.config.getNerModelUrl());
- if (!isAvailable()) {
- return;
- }
-
- /*----------------get locationNameEntities and best nameEntity for the input stream---------------------*/
- extractor.getAllNameEntitiesfromInput(stream);
- extractor.getBestNameEntity();
- ArrayList<String> locationNameEntities = extractor.locationNameEntities;
- String bestner = extractor.bestNameEntity;
-
- /*------------------------resolve geonames for each ner, store results in a hashmap---------------------*/
- HashMap<String, ArrayList<String>> resolvedGeonames = searchGeoNames(locationNameEntities);
-
- /*----------------store locationNameEntities and their geonames in a geotag, each input has one geotag---------------------*/
- GeoTag geotag = new GeoTag();
- geotag.toGeoTag(resolvedGeonames, bestner);
-
- /* add resolved entities in metadata */
-
- metadata.add("Geographic_NAME", geotag.Geographic_NAME);
- metadata.add("Geographic_LONGITUDE", geotag.Geographic_LONGTITUDE);
- metadata.add("Geographic_LATITUDE", geotag.Geographic_LATITUDE);
- for (int i = 0; i < geotag.alternatives.size(); ++i) {
- GeoTag alter = (GeoTag) geotag.alternatives.get(i);
- metadata.add("Optional_NAME" + (i + 1), alter.Geographic_NAME);
- metadata.add("Optional_LONGITUDE" + (i + 1),
- alter.Geographic_LONGTITUDE);
- metadata.add("Optional_LATITUDE" + (i + 1),
- alter.Geographic_LATITUDE);
- }
- }
-
- public HashMap<String, ArrayList<String>> searchGeoNames(
- ArrayList<String> locationNameEntities) throws ExecuteException,
- IOException {
- CommandLine cmdLine = new CommandLine("lucene-geo-gazetteer");
- ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
- cmdLine.addArgument("-s");
- for (String name : locationNameEntities) {
- cmdLine.addArgument(name);
- }
-
- LOG.fine("Executing: " + cmdLine);
- DefaultExecutor exec = new DefaultExecutor();
- exec.setExitValue(0);
- ExecuteWatchdog watchdog = new ExecuteWatchdog(60000);
- exec.setWatchdog(watchdog);
- PumpStreamHandler streamHandler = new PumpStreamHandler(outputStream);
- exec.setStreamHandler(streamHandler);
- int exitValue = exec.execute(cmdLine,
- EnvironmentUtils.getProcEnvironment());
- String outputJson = outputStream.toString("UTF-8");
- JSONArray json = (JSONArray) JSONValue.parse(outputJson);
-
- HashMap<String, ArrayList<String>> returnHash = new HashMap<String, ArrayList<String>>();
- for (int i = 0; i < json.size(); i++) {
- JSONObject obj = (JSONObject) json.get(i);
- for (Object key : obj.keySet()) {
- String theKey = (String) key;
- JSONArray vals = (JSONArray) obj.get(theKey);
- ArrayList<String> stringVals = new ArrayList<String>(
- vals.size());
- for (int j = 0; j < vals.size(); j++) {
- String val = (String) vals.get(j);
- stringVals.add(val);
- }
-
- returnHash.put(theKey, stringVals);
- }
- }
-
- return returnHash;
-
- }
-
- public boolean isAvailable() {
- if (!initialized) {
- initialize(config.getNerModelUrl());
- }
- return this.available;
- }
-
+ private static final long serialVersionUID = -2241391757440215491L;
+ private static final Logger LOG = Logger.getLogger(GeoParser.class.getName());
+ private static final MediaType MEDIA_TYPE =
+ MediaType.application("geotopic");
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.singleton(MEDIA_TYPE);
+
+ private GeoParserConfig config = new GeoParserConfig();
+
+ private boolean initialized;
+ private URL modelUrl;
+ private NameEntityExtractor extractor;
+ private boolean available;
+
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
+ return SUPPORTED_TYPES;
+ }
+
+ /**
+ * Initializes this parser
+ * @param modelUrl the URL to NER model
+ */
+ public void initialize(URL modelUrl) {
+ if (this.modelUrl != null && this.modelUrl.equals(modelUrl)) {
+ // Previously initialized for the same URL
+ return;
+ }
+
+ this.modelUrl = modelUrl;
+ //if NER model is available and lucene-geo-gazetteer is available
+ this.available = modelUrl != null &&
+ ExternalParser.check(new String[] { "lucene-geo-gazetteer", "--help" }, -1);
+ if (this.available) {
+ try {
+ this.extractor = new NameEntityExtractor(modelUrl);
+ } catch (Exception e) {
+ e.printStackTrace();
+ this.available = false;
+ }
+ }
+ initialized = true;
+
+ }
+
+ @Override
+ public void parse(InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context) throws IOException,
+ SAXException, TikaException {
+
+ /*----------------configure this parser by ParseContext Object---------------------*/
+
+ this.config = context.get(GeoParserConfig.class, config);
+ initialize(this.config.getNerModelUrl());
+ if (!isAvailable()) {
+ return;
+ }
+
+ /*----------------get locationNameEntities and best nameEntity for the input stream---------------------*/
+ extractor.getAllNameEntitiesfromInput(stream);
+ extractor.getBestNameEntity();
+ ArrayList<String> locationNameEntities = extractor.locationNameEntities;
+ String bestner = extractor.bestNameEntity;
+
+ /*------------------------resolve geonames for each ner, store results in a hashmap---------------------*/
+ HashMap<String, ArrayList<String>> resolvedGeonames = searchGeoNames(locationNameEntities);
+
+ /*----------------store locationNameEntities and their geonames in a geotag, each input has one geotag---------------------*/
+ GeoTag geotag = new GeoTag();
+ geotag.toGeoTag(resolvedGeonames, bestner);
+
+ /* add resolved entities in metadata */
+
+ metadata.add("Geographic_NAME", geotag.Geographic_NAME);
+ metadata.add("Geographic_LONGITUDE", geotag.Geographic_LONGTITUDE);
+ metadata.add("Geographic_LATITUDE", geotag.Geographic_LATITUDE);
+ for (int i = 0; i < geotag.alternatives.size(); ++i) {
+ GeoTag alter = (GeoTag) geotag.alternatives.get(i);
+ metadata.add("Optional_NAME" + (i + 1), alter.Geographic_NAME);
+ metadata.add("Optional_LONGITUDE" + (i + 1),
+ alter.Geographic_LONGTITUDE);
+ metadata.add("Optional_LATITUDE" + (i + 1),
+ alter.Geographic_LATITUDE);
+ }
+ }
+
+ public HashMap<String, ArrayList<String>> searchGeoNames(
+ ArrayList<String> locationNameEntities) throws ExecuteException,
+ IOException {
+ CommandLine cmdLine = new CommandLine("lucene-geo-gazetteer");
+ ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
+ cmdLine.addArgument("-s");
+ for (String name : locationNameEntities) {
+ cmdLine.addArgument(name);
+ }
+
+ LOG.fine("Executing: " + cmdLine);
+ DefaultExecutor exec = new DefaultExecutor();
+ exec.setExitValue(0);
+ ExecuteWatchdog watchdog = new ExecuteWatchdog(60000);
+ exec.setWatchdog(watchdog);
+ PumpStreamHandler streamHandler = new PumpStreamHandler(outputStream);
+ exec.setStreamHandler(streamHandler);
+ int exitValue = exec.execute(cmdLine,
+ EnvironmentUtils.getProcEnvironment());
+ String outputJson = outputStream.toString("UTF-8");
+ JSONArray json = (JSONArray) JSONValue.parse(outputJson);
+
+ HashMap<String, ArrayList<String>> returnHash = new HashMap<String, ArrayList<String>>();
+ for (int i = 0; i < json.size(); i++) {
+ JSONObject obj = (JSONObject) json.get(i);
+ for (Object key : obj.keySet()) {
+ String theKey = (String) key;
+ JSONArray vals = (JSONArray) obj.get(theKey);
+ ArrayList<String> stringVals = new ArrayList<String>(
+ vals.size());
+ for (int j = 0; j < vals.size(); j++) {
+ String val = (String) vals.get(j);
+ stringVals.add(val);
+ }
+
+ returnHash.put(theKey, stringVals);
+ }
+ }
+
+ return returnHash;
+
+ }
+
+ public boolean isAvailable() {
+ if (!initialized) {
+ initialize(config.getNerModelUrl());
+ }
+ return this.available;
+ }
}