You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/08/15 15:23:53 UTC

[tika] branch main updated: Update Grobid parsers (#1280)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 6d8dbcdd3 Update Grobid parsers (#1280)
6d8dbcdd3 is described below

commit 6d8dbcdd35f448623a0080ec0ecdc62f93dc1359
Author: Luca Foppiano <lf...@users.noreply.github.com>
AuthorDate: Wed Aug 16 00:23:48 2023 +0900

    Update Grobid parsers (#1280)
    
    * Update default values
    
    * improve check mechanism, correct config file
---
 .../tika/parser/journal/GrobidRESTParser.java      |  7 ++---
 .../tika/parser/ner/grobid/GrobidNERecogniser.java | 31 +++++++++++++++-------
 .../tika/parser/journal/GrobidExtractor.properties |  2 +-
 .../tika/parser/ner/grobid/GrobidServer.properties |  4 +--
 4 files changed, 29 insertions(+), 15 deletions(-)

diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java
index cc2841880..d7aedd660 100644
--- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java
+++ b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java
@@ -41,7 +41,7 @@ public class GrobidRESTParser {
 
     private static final Logger LOG = LoggerFactory.getLogger(GrobidRESTParser.class);
 
-    private static final String GROBID_REST_HOST = "http://localhost:8080";
+    private static final String GROBID_REST_HOST = "http://localhost:8070";
     private static final String GROBID_ISALIVE_PATH = "/api/isalive";
     private static final String GROBID_PROCESSHEADER_PATH = "/api/processHeaderDocument";
     private static final String GROBID_LEGACY_ISALIVE_PATH = "/grobid";
@@ -96,8 +96,9 @@ public class GrobidRESTParser {
         try {
             checkMode();
             Response response = WebClient.create(restHostUrlStr +
-                    (legacyMode ? GROBID_LEGACY_PROCESSHEADER_PATH : GROBID_PROCESSHEADER_PATH))
-                    .accept(MediaType.APPLICATION_XML).type(MediaType.MULTIPART_FORM_DATA)
+                            (legacyMode ? GROBID_LEGACY_PROCESSHEADER_PATH : GROBID_PROCESSHEADER_PATH))
+                    .accept(MediaType.APPLICATION_XML)
+                    .type(MediaType.MULTIPART_FORM_DATA)
                     .post(body);
 
 
diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/grobid/GrobidNERecogniser.java b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/grobid/GrobidNERecogniser.java
index 1f173e381..1812b4b82 100644
--- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/grobid/GrobidNERecogniser.java
+++ b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/grobid/GrobidNERecogniser.java
@@ -47,10 +47,11 @@ public class GrobidNERecogniser implements NERecogniser {
             add("NORMALIZED_MEASUREMENTS");
             add("MEASUREMENT_TYPES");
         }
-        };
+    };
     private static final Logger LOG = LoggerFactory.getLogger(GrobidNERecogniser.class);
-    private static final String GROBID_REST_HOST = "http://localhost:8080";
-    private static boolean available = false;
+    private static final String GROBID_REST_HOST = "http://localhost:8060";
+    private static final String ISALIVE_URL = "/service/isalive";
+    private boolean available = false;
     private String restHostUrlStr;
 
 
@@ -62,7 +63,6 @@ public class GrobidNERecogniser implements NERecogniser {
                 restHostUrlStr = readRestUrl();
             } catch (IOException e) {
                 LOG.warn("couldn't read rest url", e);
-
             }
 
             if (restHostUrlStr == null || restHostUrlStr.equals("")) {
@@ -71,18 +71,30 @@ public class GrobidNERecogniser implements NERecogniser {
                 this.restHostUrlStr = restHostUrlStr;
             }
 
+            this.available = isServerAlive(restHostUrlStr);
+
+        } catch (Exception e) {
+            LOG.info(e.getMessage(), e);
+        }
+    }
+
+    private static boolean isServerAlive(String restHostUrlStr) {
+        boolean available = false;
+        try {
             Response response =
-                    WebClient.create(restHostUrlStr).accept(MediaType.APPLICATION_JSON).get();
+                    WebClient.create(restHostUrlStr + ISALIVE_URL)
+                            .get();
             int responseCode = response.getStatus();
             if (responseCode == 200) {
                 available = true;
             } else {
-                LOG.info("Grobid REST Server is not running");
+                LOG.info("Grobid Quantities REST Server is not running");
             }
-
         } catch (Exception e) {
-            LOG.info(e.getMessage(), e);
+            LOG.info("Grobid Quantities REST Server is not running", e);
         }
+
+        return available;
     }
 
     /**
@@ -173,7 +185,8 @@ public class GrobidNERecogniser implements NERecogniser {
         try {
             String url = restHostUrlStr + readRestEndpoint();
             Response response =
-                    WebClient.create(url).accept(MediaType.APPLICATION_JSON).post("text=" + text);
+                    WebClient.create(url).accept(MediaType.APPLICATION_JSON)
+                            .post("text=" + text);
             int responseCode = response.getStatus();
 
             if (responseCode == 200) {
diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/resources/org/apache/tika/parser/journal/GrobidExtractor.properties b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/resources/org/apache/tika/parser/journal/GrobidExtractor.properties
index 488f0c593..002b0430c 100644
--- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/resources/org/apache/tika/parser/journal/GrobidExtractor.properties
+++ b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/resources/org/apache/tika/parser/journal/GrobidExtractor.properties
@@ -13,4 +13,4 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
-grobid.server.url=http://localhost:8080
+grobid.server.url=http://localhost:8070
diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/resources/org/apache/tika/parser/ner/grobid/GrobidServer.properties b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/resources/org/apache/tika/parser/ner/grobid/GrobidServer.properties
index 3fc609af4..0803e38fd 100644
--- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/resources/org/apache/tika/parser/ner/grobid/GrobidServer.properties
+++ b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/resources/org/apache/tika/parser/ner/grobid/GrobidServer.properties
@@ -13,5 +13,5 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
-grobid.server.url=http://localhost:8080
-grobid.endpoint.text=/processQuantityText
+grobid.server.url=http://localhost:8060
+grobid.endpoint.text=/service/processQuantityText