You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2016/03/02 06:42:18 UTC
[09/20] tika git commit: created NLTK host server properties
created NLTK host server properties
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/ac4c0b2c
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/ac4c0b2c
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/ac4c0b2c
Branch: refs/heads/master
Commit: ac4c0b2c9321bba395b92214a3504d8346c3e936
Parents: f054bcd
Author: manali <ma...@gmail.com>
Authored: Wed Feb 24 22:23:26 2016 -0800
Committer: manali <ma...@gmail.com>
Committed: Wed Feb 24 22:23:26 2016 -0800
----------------------------------------------------------------------
tika-parsers/pom.xml | 15 ++--
.../tika/parser/ner/nltk/NLTKNERecogniser.java | 72 ++++++++++----------
.../tika/parser/ner/nltk/NLTKServer.properties | 16 +++++
.../parser/ner/nltk/NLTKNERecogniserTest.java | 8 +--
4 files changed, 63 insertions(+), 48 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/ac4c0b2c/tika-parsers/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index 41daf4d..088a6e9 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -88,13 +88,6 @@
<version>2.1.1</version>
</dependency>
-
- <!-- manali added this-->
- <dependency>
- <groupId>org.apache.httpcomponents</groupId>
- <artifactId>httpclient</artifactId>
- <version>4.5.1</version>
- </dependency>
<!-- Optional OSGi dependencies, used only when running within OSGi -->
<dependency>
<groupId>org.apache.felix</groupId>
@@ -366,6 +359,14 @@
<version>3.2.2</version>
<scope>provided</scope>
</dependency>
+
+ <!--Jackson parse String to JSON-->
+ <dependency>
+ <groupId>com.fasterxml.jackson.core</groupId>
+ <artifactId>jackson-core</artifactId>
+ <version>2.7.1</version>
+ </dependency>
+
</dependencies>
<build>
http://git-wip-us.apache.org/repos/asf/tika/blob/ac4c0b2c/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
index 99cde6f..eddddcb 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
@@ -16,38 +16,18 @@
*/
package org.apache.tika.parser.ner.nltk;
-import org.apache.http.client.methods.HttpGet;
import org.apache.tika.parser.ner.NERecogniser;
-import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
-import java.io.BufferedReader;
-import java.io.InputStreamReader;
-import java.util.ArrayList;
-import org.apache.http.HttpResponse;
-import org.apache.http.NameValuePair;
-import org.apache.http.client.HttpClient;
-import org.apache.http.client.entity.UrlEncodedFormEntity;
-import org.apache.http.client.methods.HttpPost;
-import org.apache.http.impl.client.HttpClientBuilder;
-import org.apache.http.message.BasicNameValuePair;
-
-import javax.ws.rs.core.Form;
+import java.io.*;
+import java.util.*;
import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.Response;
import org.apache.cxf.jaxrs.client.WebClient;
-import org.apache.cxf.jaxrs.ext.multipart.ContentDisposition;
-import org.apache.cxf.jaxrs.ext.multipart.MultipartBody;
/**
* This class offers an implementation of {@link NERecogniser} based on
@@ -59,9 +39,8 @@ import org.apache.cxf.jaxrs.ext.multipart.MultipartBody;
public class NLTKNERecogniser implements NERecogniser {
private static final Logger LOG = LoggerFactory.getLogger(NLTKNERecogniser.class);
- private final static String USER_AGENT = "Mozilla/5.0";
private static boolean available = false;
-
+ private static final String NLTK_REST_HOST = "http://localhost:8881";
/**
* some common entities identified by NLTK
*/
@@ -75,12 +54,31 @@ public class NLTKNERecogniser implements NERecogniser {
add(DATE);
add(FACILITY);
add(GPE);
+ add("NAMES");
}};
+ String restHostUrlStr;
public NLTKNERecogniser(){
try {
- String url = "http://localhost:5000/";
- Response response = WebClient.create(url).accept(MediaType.TEXT_HTML).get();
+
+ String restHostUrlStr="";
+ try {
+ restHostUrlStr = readRestUrl();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+
+ if (restHostUrlStr == null
+ || (restHostUrlStr != null && restHostUrlStr.equals(""))) {
+ this.restHostUrlStr = NLTK_REST_HOST;
+ } else {
+ this.restHostUrlStr = restHostUrlStr;
+ }
+
+
+
+
+ Response response = WebClient.create(restHostUrlStr).accept(MediaType.TEXT_HTML).get();
int responseCode = response.getStatus();
if(responseCode == 200){
available = true;
@@ -94,6 +92,13 @@ public class NLTKNERecogniser implements NERecogniser {
}
}
+ private static String readRestUrl() throws IOException {
+ Properties nltkProperties = new Properties();
+ nltkProperties.load(NLTKNERecogniser.class
+ .getResourceAsStream("NLTKServer.properties"));
+
+ return nltkProperties.getProperty("nltk.server.url");
+ }
/**
* @return {@code true} if server endpoint is available.
@@ -119,22 +124,15 @@ public class NLTKNERecogniser implements NERecogniser {
public Map<String, Set<String>> recognise(String text) {
Map<String, Set<String>> entities = new HashMap<>();
try {
- String url = "http://localhost:5000/nltk";
- Response response = WebClient.create(url).accept(MediaType.TEXT_HTML).form(new Form().param("text",text));
+ int port = 8881;
+ String url = restHostUrlStr + "/nltk";
+ Response response = WebClient.create(url).accept(MediaType.TEXT_HTML).post(text);
int responseCode = response.getStatus();
if (responseCode == 200) {
String result = response.readEntity(String.class);
JSONParser parser = new JSONParser();
JSONObject j = (JSONObject) parser.parse(result);
- JSONArray aa = new JSONArray();
- for (Object x : j.keySet()) {
- aa = (JSONArray) j.get(x.toString());
- Set s = new HashSet();
- for (Object y : aa) {
- s.add(y.toString());
- }
- entities.put(x.toString(), s);
- }
+ Set s = entities.put("NAMES", new HashSet((Collection) j.get("names")));
}
}
catch (Exception e) {
http://git-wip-us.apache.org/repos/asf/tika/blob/ac4c0b2c/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties b/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties
new file mode 100644
index 0000000..24f5a2e
--- /dev/null
+++ b/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+nltk.server.url=http://localhost:5000
http://git-wip-us.apache.org/repos/asf/tika/blob/ac4c0b2c/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
index 2861051..a40ec24 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
@@ -42,13 +42,13 @@ public class NLTKNERecogniserTest {
Metadata md = new Metadata();
tika.parse(new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)), md);
- Set<String> gpe = new HashSet<>(Arrays.asList(md.getValues("NER_GPE")));
- if(gpe.size() == 0) {
+ Set<String> names = new HashSet<>(Arrays.asList(md.getValues("NER_NAMES")));
+ if(names.size() == 0) {
return;
}
else {
- assertTrue(gpe.contains("America"));
- assertTrue(gpe.size() == 1); //and nothing else
+ assertTrue(names.contains("America"));
+ assertTrue(names.size() == 1); //and nothing else
}
}
}