You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2016/03/02 06:42:18 UTC

[09/20] tika git commit: created NLTK host server properties

created NLTK host server properties


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/ac4c0b2c
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/ac4c0b2c
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/ac4c0b2c

Branch: refs/heads/master
Commit: ac4c0b2c9321bba395b92214a3504d8346c3e936
Parents: f054bcd
Author: manali <ma...@gmail.com>
Authored: Wed Feb 24 22:23:26 2016 -0800
Committer: manali <ma...@gmail.com>
Committed: Wed Feb 24 22:23:26 2016 -0800

----------------------------------------------------------------------
 tika-parsers/pom.xml                            | 15 ++--
 .../tika/parser/ner/nltk/NLTKNERecogniser.java  | 72 ++++++++++----------
 .../tika/parser/ner/nltk/NLTKServer.properties  | 16 +++++
 .../parser/ner/nltk/NLTKNERecogniserTest.java   |  8 +--
 4 files changed, 63 insertions(+), 48 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/ac4c0b2c/tika-parsers/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index 41daf4d..088a6e9 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -88,13 +88,6 @@
       <version>2.1.1</version>
     </dependency>
 
-
-    <!-- manali added this-->
-    <dependency>
-      <groupId>org.apache.httpcomponents</groupId>
-      <artifactId>httpclient</artifactId>
-      <version>4.5.1</version>
-    </dependency>
     <!-- Optional OSGi dependencies, used only when running within OSGi -->
     <dependency>
       <groupId>org.apache.felix</groupId>
@@ -366,6 +359,14 @@
       <version>3.2.2</version>
       <scope>provided</scope>
     </dependency>
+
+    <!--Jackson parse String to JSON-->
+    <dependency>
+      <groupId>com.fasterxml.jackson.core</groupId>
+      <artifactId>jackson-core</artifactId>
+      <version>2.7.1</version>
+    </dependency>
+
   </dependencies>
 
   <build>

http://git-wip-us.apache.org/repos/asf/tika/blob/ac4c0b2c/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
index 99cde6f..eddddcb 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
@@ -16,38 +16,18 @@
  */
 package org.apache.tika.parser.ner.nltk;
 
-import org.apache.http.client.methods.HttpGet;
 import org.apache.tika.parser.ner.NERecogniser;
-import org.json.simple.JSONArray;
 import org.json.simple.JSONObject;
 import org.json.simple.parser.JSONParser;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
-import java.io.BufferedReader;
-import java.io.InputStreamReader;
-import java.util.ArrayList;
-import org.apache.http.HttpResponse;
-import org.apache.http.NameValuePair;
-import org.apache.http.client.HttpClient;
-import org.apache.http.client.entity.UrlEncodedFormEntity;
-import org.apache.http.client.methods.HttpPost;
-import org.apache.http.impl.client.HttpClientBuilder;
-import org.apache.http.message.BasicNameValuePair;
-
-import javax.ws.rs.core.Form;
+import java.io.*;
+import java.util.*;
 import javax.ws.rs.core.MediaType;
 import javax.ws.rs.core.Response;
 
 import org.apache.cxf.jaxrs.client.WebClient;
-import org.apache.cxf.jaxrs.ext.multipart.ContentDisposition;
-import org.apache.cxf.jaxrs.ext.multipart.MultipartBody;
 
 /**
  *  This class offers an implementation of {@link NERecogniser} based on
@@ -59,9 +39,8 @@ import org.apache.cxf.jaxrs.ext.multipart.MultipartBody;
 public class NLTKNERecogniser implements NERecogniser {
 
     private static final Logger LOG = LoggerFactory.getLogger(NLTKNERecogniser.class);
-    private final static String USER_AGENT = "Mozilla/5.0";
     private static boolean available = false;
-    
+    private static final String NLTK_REST_HOST = "http://localhost:8881";
      /**
      * some common entities identified by NLTK
      */
@@ -75,12 +54,31 @@ public class NLTKNERecogniser implements NERecogniser {
         add(DATE);
         add(FACILITY);
         add(GPE);
+        add("NAMES");
     }};
 
+    String restHostUrlStr;
     public NLTKNERecogniser(){
         try {
-            String url = "http://localhost:5000/";
-            Response response = WebClient.create(url).accept(MediaType.TEXT_HTML).get();
+
+            String restHostUrlStr="";
+            try {
+                restHostUrlStr = readRestUrl();
+            } catch (IOException e) {
+                e.printStackTrace();
+            }
+
+            if (restHostUrlStr == null
+                    || (restHostUrlStr != null && restHostUrlStr.equals(""))) {
+                this.restHostUrlStr = NLTK_REST_HOST;
+            } else {
+                this.restHostUrlStr = restHostUrlStr;
+            }
+
+
+
+
+            Response response = WebClient.create(restHostUrlStr).accept(MediaType.TEXT_HTML).get();
             int responseCode = response.getStatus();
             if(responseCode == 200){
                 available = true;
@@ -94,6 +92,13 @@ public class NLTKNERecogniser implements NERecogniser {
         }
     }
 
+    private static String readRestUrl() throws IOException {
+        Properties nltkProperties = new Properties();
+        nltkProperties.load(NLTKNERecogniser.class
+                .getResourceAsStream("NLTKServer.properties"));
+
+        return nltkProperties.getProperty("nltk.server.url");
+    }
 
     /**
      * @return {@code true} if server endpoint is available.
@@ -119,22 +124,15 @@ public class NLTKNERecogniser implements NERecogniser {
     public Map<String, Set<String>> recognise(String text) {
         Map<String, Set<String>> entities = new HashMap<>();
         try {
-            String url = "http://localhost:5000/nltk";
-            Response response = WebClient.create(url).accept(MediaType.TEXT_HTML).form(new Form().param("text",text));
+            int port = 8881;
+            String url = restHostUrlStr + "/nltk";
+            Response response = WebClient.create(url).accept(MediaType.TEXT_HTML).post(text);
             int responseCode = response.getStatus();
             if (responseCode == 200) {
                 String result = response.readEntity(String.class);
                 JSONParser parser = new JSONParser();
                 JSONObject j = (JSONObject) parser.parse(result);
-                JSONArray aa = new JSONArray();
-                for (Object x : j.keySet()) {
-                    aa = (JSONArray) j.get(x.toString());
-                    Set s = new HashSet();
-                    for (Object y : aa) {
-                        s.add(y.toString());
-                    }
-                    entities.put(x.toString(), s);
-                }
+                Set s = entities.put("NAMES", new HashSet((Collection) j.get("names")));
             }
         }
         catch (Exception e) {

http://git-wip-us.apache.org/repos/asf/tika/blob/ac4c0b2c/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties b/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties
new file mode 100644
index 0000000..24f5a2e
--- /dev/null
+++ b/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties
@@ -0,0 +1,16 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+nltk.server.url=http://localhost:5000

http://git-wip-us.apache.org/repos/asf/tika/blob/ac4c0b2c/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
index 2861051..a40ec24 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
@@ -42,13 +42,13 @@ public class NLTKNERecogniserTest {
         Metadata md = new Metadata();
         tika.parse(new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)), md);
 
-        Set<String> gpe = new HashSet<>(Arrays.asList(md.getValues("NER_GPE")));
-        if(gpe.size() == 0) {
+        Set<String> names = new HashSet<>(Arrays.asList(md.getValues("NER_NAMES")));
+        if(names.size() == 0) {
             return;
         }
         else {
-            assertTrue(gpe.contains("America"));
-            assertTrue(gpe.size() == 1); //and nothing else
+            assertTrue(names.contains("America"));
+            assertTrue(names.size() == 1); //and nothing else
         }
     }
 }