You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by dm...@apache.org on 2020/11/10 00:49:21 UTC

[tika] branch main updated (340c01f -> 1957a60)

This is an automated email from the ASF dual-hosted git repository.

dmeikle pushed a change to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git.


    from 340c01f  Added note about PeterAlfredLee code cleanup work
     new 3650837  Added missing test file for GrobidRESTParser
     new 1957a60  TIKA-3191: Updated GrobidRESTParser to use new API location

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 CHANGES.txt                                        |   2 +
 .../tika/parser/journal/GrobidRESTParser.java      |  51 ++++++++++++++-------
 .../resources/test-documents/testJournalParser.pdf | Bin
 3 files changed, 37 insertions(+), 16 deletions(-)
 copy {tika-parser-modules/tika-parser-pdf-module => tika-parsers-advanced/tika-parser-nlp-module}/src/test/resources/test-documents/testJournalParser.pdf (100%)


[tika] 01/02: Added missing test file for GrobidRESTParser

Posted by dm...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

dmeikle pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 36508370c3c91595288f8a5ef8e1874585a5e5d8
Author: David Meikle <dm...@apache.org>
AuthorDate: Tue Nov 10 00:41:44 2020 +0000

    Added missing test file for GrobidRESTParser
---
 .../resources/test-documents/testJournalParser.pdf     | Bin 0 -> 985125 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)

diff --git a/tika-parsers-advanced/tika-parser-nlp-module/src/test/resources/test-documents/testJournalParser.pdf b/tika-parsers-advanced/tika-parser-nlp-module/src/test/resources/test-documents/testJournalParser.pdf
new file mode 100644
index 0000000..0abda73
Binary files /dev/null and b/tika-parsers-advanced/tika-parser-nlp-module/src/test/resources/test-documents/testJournalParser.pdf differ


[tika] 02/02: TIKA-3191: Updated GrobidRESTParser to use new API location

Posted by dm...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

dmeikle pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 1957a60575075fe60e367a506bdbf0136d653547
Author: David Meikle <dm...@apache.org>
AuthorDate: Tue Nov 10 00:42:18 2020 +0000

    TIKA-3191: Updated GrobidRESTParser to use new API location
---
 CHANGES.txt                                        |  2 +
 .../tika/parser/journal/GrobidRESTParser.java      | 51 +++++++++++++++-------
 2 files changed, 37 insertions(+), 16 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 5e66246..2de6fb2 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -42,6 +42,8 @@ Release 1.25 - ???
 
    * Read hyperlinked images from ODT files (TIKA-3156).
 
+   * Updated GrobidRESTParser to use new API location (TIKA-3191).
+
 Release 1.24.1 - 4/17/2020
 
    * Allow gzip compression of input and output streams for tika-server (TIKA-3073).
diff --git a/tika-parsers-advanced/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java b/tika-parsers-advanced/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java
index 110c504..4eb1e31 100644
--- a/tika-parsers-advanced/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java
+++ b/tika-parsers-advanced/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java
@@ -29,6 +29,7 @@ import org.apache.cxf.jaxrs.client.WebClient;
 import org.apache.cxf.jaxrs.ext.multipart.Attachment;
 import org.apache.cxf.jaxrs.ext.multipart.ContentDisposition;
 import org.apache.cxf.jaxrs.ext.multipart.MultipartBody;
+import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
 import org.slf4j.Logger;
@@ -39,14 +40,15 @@ public class GrobidRESTParser {
 
     private static final Logger LOG = LoggerFactory.getLogger(GrobidRESTParser.class);
 
-
     private static final String GROBID_REST_HOST = "http://localhost:8080";
 
-    private static final String GROBID_ISALIVE_PATH = "/grobid"; // isalive
-    // doesn't work
-    // nfc why
+    private Boolean legacyMode = null;
+
+    private static final String GROBID_ISALIVE_PATH = "/api/isalive";
+    private static final String GROBID_PROCESSHEADER_PATH = "/api/processHeaderDocument";
 
-    private static final String GROBID_PROCESSHEADER_PATH = "/processHeaderDocument";
+    private static final String GROBID_LEGACY_ISALIVE_PATH = "/grobid";
+    private static final String GROBID_LEGACY_PROCESSHEADER_PATH = "/processHeaderDocument";
 
     private String restHostUrlStr;
 
@@ -58,8 +60,7 @@ public class GrobidRESTParser {
             LOG.warn("can't read rest url", e);
         }
 
-        if (restHostUrlStr == null
-                || (restHostUrlStr != null && restHostUrlStr.equals(""))) {
+        if (restHostUrlStr == null || restHostUrlStr.equals("")) {
             this.restHostUrlStr = GROBID_REST_HOST;
         } else {
             this.restHostUrlStr = restHostUrlStr;
@@ -75,12 +76,15 @@ public class GrobidRESTParser {
         Attachment att = new Attachment("input", new FileInputStream(pdfFile), cd);
         MultipartBody body = new MultipartBody(att);
 
-        Response response = WebClient
-                .create(restHostUrlStr + GROBID_PROCESSHEADER_PATH)
-                .accept(MediaType.APPLICATION_XML).type(MediaType.MULTIPART_FORM_DATA)
-                .post(body);
-
         try {
+            checkMode();
+            Response response = WebClient
+                    .create(restHostUrlStr
+                            + (legacyMode ? GROBID_LEGACY_PROCESSHEADER_PATH : GROBID_PROCESSHEADER_PATH))
+                    .accept(MediaType.APPLICATION_XML).type(MediaType.MULTIPART_FORM_DATA)
+                    .post(body);
+
+
             String resp = response.readEntity(String.class);
             Metadata teiMet = new TEIDOMParser().parse(resp, context);
             for (String key : teiMet.names()) {
@@ -99,14 +103,29 @@ public class GrobidRESTParser {
         return grobidProperties.getProperty("grobid.server.url");
     }
 
+    private void checkMode() throws TikaException {
+        if (legacyMode != null) {
+            return;
+        }
+        Response response = WebClient.create(restHostUrlStr + GROBID_ISALIVE_PATH).head();
+        if (response.getStatus() == 200) {
+            legacyMode = false;
+            return;
+        }
+        response = WebClient.create(restHostUrlStr + GROBID_LEGACY_ISALIVE_PATH).head();
+        if (response.getStatus() == 200) {
+            legacyMode = true;
+            return;
+        }
+        throw new TikaException("Cannot connect to Grobid Service");
+    }
+
     protected static boolean canRun() {
         Response response = null;
-
         try {
-            response = WebClient.create(readRestUrl() + GROBID_ISALIVE_PATH)
-                    .accept(MediaType.TEXT_HTML).get();
+            response = WebClient.create(readRestUrl() + GROBID_ISALIVE_PATH).get();
             String resp = response.readEntity(String.class);
-            return resp != null && !resp.equals("") && resp.startsWith("<h4>");
+            return resp != null && !resp.equals("") && resp.startsWith("true");
         } catch (Exception e) {
             //swallow...can't run
             return false;