You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2015/08/17 06:10:12 UTC
svn commit: r1696191 - in /tika/trunk/tika-parsers: ./
src/main/java/org/apache/tika/parser/journal/
src/main/resources/META-INF/services/
src/main/resources/org/apache/tika/parser/journal/
src/test/java/org/apache/tika/parser/journal/ src/test/resourc...
Author: mattmann
Date: Mon Aug 17 04:10:11 2015
New Revision: 1696191
URL: http://svn.apache.org/r1696191
Log:
TIKA-1699: refactored GROBID parser to use GROBID rest API. Only introduced 2 deps, CXF client, and also org.json. very small and works great. Thanks to Sujen Shah for his initial work on the GROBID patch.
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/JournalParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/TEIParser.java
tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/journal/
tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/journal/GrobidExtractor.properties
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/journal/
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/journal/JournalParserTest.java
tika/trunk/tika-parsers/src/test/resources/test-documents/testJournalParser.pdf (with props)
Modified:
tika/trunk/tika-parsers/pom.xml
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
Modified: tika/trunk/tika-parsers/pom.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/pom.xml?rev=1696191&r1=1696190&r2=1696191&view=diff
==============================================================================
--- tika/trunk/tika-parsers/pom.xml (original)
+++ tika/trunk/tika-parsers/pom.xml Mon Aug 17 04:10:11 2015
@@ -45,6 +45,7 @@
<vorbis.version>0.6</vorbis.version>
<pdfbox.version>1.8.10</pdfbox.version>
<netcdf-java.version>4.5.5</netcdf-java.version>
+ <cxf.version>3.0.3</cxf.version>
</properties>
<dependencies>
@@ -231,6 +232,12 @@
<artifactId>junrar</artifactId>
<version>0.7</version>
</dependency>
+ <dependency>
+ <groupId>org.apache.cxf</groupId>
+ <artifactId>cxf-rt-rs-client</artifactId>
+ <version>${cxf.version}</version>
+ </dependency>
+
<!-- Provided dependencies -->
<dependency>
@@ -269,6 +276,13 @@
</exclusion>
</exclusions>
</dependency>
+
+ <dependency>
+ <groupId>org.json</groupId>
+ <artifactId>json</artifactId>
+ <version>20140107</version>
+ </dependency>
+
<!-- Test dependencies -->
<dependency>
Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java?rev=1696191&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java Mon Aug 17 04:10:11 2015
@@ -0,0 +1,93 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.journal;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import javax.ws.rs.core.MediaType;
+import javax.ws.rs.core.Response;
+import org.apache.cxf.jaxrs.client.WebClient;
+import org.apache.cxf.jaxrs.ext.multipart.Attachment;
+import org.apache.cxf.jaxrs.ext.multipart.ContentDisposition;
+import org.apache.cxf.jaxrs.ext.multipart.MultipartBody;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Thin client for a GROBID REST service: posts a PDF to the header-extraction
+ * endpoint and copies the fields parsed from the TEI response into Tika
+ * metadata under the {@code grobid:header_} prefix.
+ */
+public class GrobidRESTParser {
+
+  private static final String GROBID_REST_HOST = "http://localhost:8080";
+
+  // The service landing page is probed; the dedicated isalive endpoint did
+  // not work when this was written.
+  private static final String GROBID_ISALIVE_PATH = "/grobid";
+
+  private static final String GROBID_PROCESSHEADER_PATH = "/processHeaderDocument";
+
+  // Shared by all instances because canRun() is static. Starts at the default
+  // host so that canRun() probes a real URL even before any instance exists.
+  private static String restHostUrlStr = GROBID_REST_HOST;
+
+  /**
+   * @param restHostUrlStr
+   *          base URL of the GROBID server; {@code null} selects the default
+   *          {@code http://localhost:8080}
+   */
+  public GrobidRESTParser(String restHostUrlStr) {
+    GrobidRESTParser.restHostUrlStr =
+        restHostUrlStr == null ? GROBID_REST_HOST : restHostUrlStr;
+  }
+
+  /**
+   * Sends the file to the GROBID header endpoint and adds every value of every
+   * parsed TEI field to {@code metadata}. Failures while reading or parsing
+   * the response are printed and otherwise ignored (best effort).
+   *
+   * @param filePath
+   *          path of the PDF to upload
+   * @param handler
+   *          unused here
+   * @param metadata
+   *          receives the {@code grobid:header_*} entries
+   * @param context
+   *          unused here
+   * @throws FileNotFoundException
+   *           if {@code filePath} cannot be opened
+   */
+  public void parse(String filePath, ContentHandler handler, Metadata metadata,
+      ParseContext context) throws FileNotFoundException {
+
+    File pdfFile = new File(filePath);
+    FileInputStream pdfStream = new FileInputStream(pdfFile);
+    try {
+      ContentDisposition cd = new ContentDisposition(
+          "form-data; name=\"input\"; filename=\"" + pdfFile.getName() + "\"");
+      Attachment att = new Attachment("input", pdfStream, cd);
+      MultipartBody body = new MultipartBody(att);
+
+      Response response = WebClient
+          .create(restHostUrlStr + GROBID_PROCESSHEADER_PATH)
+          .accept(MediaType.APPLICATION_XML).type(MediaType.MULTIPART_FORM_DATA)
+          .post(body);
+
+      try {
+        String resp = response.readEntity(String.class);
+        Metadata teiMet = new TEIParser().parse(resp);
+        for (String key : teiMet.names()) {
+          // Copy all values: some keys (e.g. Keyword) are added repeatedly.
+          for (String value : teiMet.getValues(key)) {
+            metadata.add("grobid:header_" + key, value);
+          }
+        }
+      } catch (Exception e) {
+        e.printStackTrace();
+      }
+    } finally {
+      // The upload stream was opened here, so it is released here.
+      try {
+        pdfStream.close();
+      } catch (java.io.IOException ignored) {
+        // nothing useful can be done if closing the upload stream fails
+      }
+    }
+  }
+
+  /**
+   * Probes the configured GROBID server.
+   *
+   * @return {@code true} if the landing page answers with a body that starts
+   *         with {@code <h4>}; {@code false} on any other body or any error
+   */
+  protected static boolean canRun() {
+    try {
+      Response response = WebClient.create(restHostUrlStr + GROBID_ISALIVE_PATH)
+          .accept(MediaType.TEXT_HTML).get();
+      String resp = response.readEntity(String.class);
+      return resp != null && resp.startsWith("<h4>");
+    } catch (Exception e) {
+      e.printStackTrace();
+      return false;
+    }
+  }
+
+}
Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/JournalParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/JournalParser.java?rev=1696191&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/JournalParser.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/JournalParser.java Mon Aug 17 04:10:11 2015
@@ -0,0 +1,72 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.journal;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Properties;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.pdf.PDFParser;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for PDF journal articles: first asks a GROBID REST server for header
+ * metadata, then delegates to {@link PDFParser} for the regular PDF parse.
+ */
+public class JournalParser extends AbstractParser {
+
+  /**
+   * Generated serial ID
+   */
+  private static final long serialVersionUID = 4664255544154296438L;
+
+  private static final MediaType TYPE = MediaType.application("pdf");
+
+  private static final Set<MediaType> SUPPORTED_TYPES = Collections
+      .singleton(TYPE);
+
+  public Set<MediaType> getSupportedTypes(ParseContext context) {
+    return SUPPORTED_TYPES;
+  }
+
+  /**
+   * Spools the stream to a file, sends that file to GROBID, then runs the
+   * PDF parser over the same file. Temporary resources and the streams opened
+   * here are released before returning; the caller's stream is not closed.
+   */
+  public void parse(InputStream stream, ContentHandler handler,
+      Metadata metadata, ParseContext context) throws IOException,
+      SAXException, TikaException {
+    TemporaryResources tmp = new TemporaryResources();
+    try {
+      TikaInputStream tis = TikaInputStream.get(stream, tmp);
+      File tmpFile = tis.getFile();
+
+      Properties grobidProperties = new Properties();
+      InputStream propStream = JournalParser.class
+          .getResourceAsStream("GrobidExtractor.properties");
+      // A missing resource leaves the URL unset; GrobidRESTParser then
+      // falls back to its default host.
+      if (propStream != null) {
+        try {
+          grobidProperties.load(propStream);
+        } finally {
+          propStream.close();
+        }
+      }
+
+      GrobidRESTParser grobidParser = new GrobidRESTParser(
+          grobidProperties.getProperty("grobid.server.url"));
+      grobidParser.parse(tmpFile.getAbsolutePath(), handler, metadata, context);
+
+      InputStream pdfStream = new FileInputStream(tmpFile);
+      try {
+        new PDFParser().parse(pdfStream, handler, metadata, context);
+      } finally {
+        pdfStream.close();
+      }
+    } finally {
+      // Removes the spooled temporary file, if one was created.
+      tmp.dispose();
+    }
+  }
+}
Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/TEIParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/TEIParser.java?rev=1696191&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/TEIParser.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/journal/TEIParser.java Mon Aug 17 04:10:11 2015
@@ -0,0 +1,893 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.journal;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.tika.metadata.Metadata;
+import org.json.JSONArray;
+import org.json.JSONObject;
+import org.json.XML;
+
+public class TEIParser {
+
+ /** Creates a parser; it takes no configuration. */
+ public TEIParser() {
+ }
+
+ /**
+  * Converts a TEI XML string to JSON and collects the recognised header
+  * fields.
+  *
+  * @param source
+  *          the TEI XML text
+  * @return a new Metadata object filled from the document
+  */
+ public Metadata parse(String source) {
+   final JSONObject teiAsJson = XML.toJSONObject(source);
+   final Metadata collected = new Metadata();
+   createGrobidMetadata(source, teiAsJson, collected);
+   return collected;
+ }
+
+ /**
+  * Copies the recognised sections of the TEI document into {@code metadata}
+  * and then appends the static bookkeeping entries.
+  *
+  * @param source
+  *          the original TEI XML text
+  * @param obj
+  *          JSON form of {@code source}; the section walk is skipped when null
+  * @param metadata
+  *          receives the extracted entries
+  */
+ private void createGrobidMetadata(String source, JSONObject obj,
+     Metadata metadata) {
+   if (obj != null) {
+     JSONObject tei = obj.getJSONObject("TEI");
+     JSONObject teiHeader = tei.getJSONObject("teiHeader");
+
+     // In TEI, <text> is a child of <TEI> next to <teiHeader>. The header is
+     // still checked first so the earlier lookup keeps working.
+     JSONObject text = teiHeader.optJSONObject("text");
+     if (text == null) {
+       text = tei.optJSONObject("text");
+     }
+     if (text != null) {
+       parseText(text, metadata);
+     }
+
+     if (teiHeader.has("fileDesc")) {
+       parseFileDesc(teiHeader.getJSONObject("fileDesc"), metadata);
+     }
+
+     if (teiHeader.has("profileDesc")) {
+       parseProfileDesc(teiHeader.getJSONObject("profileDesc"), metadata);
+     }
+   }
+
+   addStaticMet(source, obj, metadata);
+ }
+
+ /**
+  * Adds the bookkeeping entries: the Metadata class name, the JSON form of
+  * the document and the original XML text.
+  *
+  * @param obj
+  *          JSON form of the document; the JSON entry is skipped when null,
+  *          because the caller allows a null object
+  */
+ private void addStaticMet(String source, JSONObject obj, Metadata metadata) {
+   metadata.add("Class", Metadata.class.getName());
+   if (obj != null) {
+     metadata.add("TEIJSONSource", obj.toString());
+   }
+   metadata.add("TEIXMLSource", source);
+ }
+
+ private void parseText(JSONObject text, Metadata metadata) {
+ if (text.has("xml:lang")) {
+ metadata.add("Language", text.getString("xml:lang"));
+ }
+ }
+
+ private void parseFileDesc(JSONObject fileDesc, Metadata metadata) {
+ if (fileDesc.has("titleStmt")) {
+ parseTitleStmt(fileDesc.getJSONObject("titleStmt"), metadata);
+ }
+
+ if (fileDesc.has("sourceDesc")) {
+ parseSourceDesc(fileDesc.getJSONObject("sourceDesc"), metadata);
+ }
+ }
+
+ /**
+  * Adds the document title as the "Title" entry.
+  *
+  * The XML-to-JSON conversion yields an object with a "content" member when
+  * the title element carries attributes, a bare string when it does not, and
+  * a number when the text looks numeric; all three shapes are accepted so a
+  * differently shaped title no longer aborts the parse with an exception.
+  */
+ private void parseTitleStmt(JSONObject titleStmt, Metadata metadata) {
+   if (!titleStmt.has("title")) {
+     return;
+   }
+
+   Object title = titleStmt.get("title");
+   if (title instanceof JSONObject) {
+     JSONObject titleObj = (JSONObject) title;
+     if (titleObj.has("content")) {
+       metadata.add("Title", String.valueOf(titleObj.get("content")));
+     }
+   } else if (title instanceof String) {
+     // An empty element converts to "", which carries no title.
+     if (((String) title).length() > 0) {
+       metadata.add("Title", (String) title);
+     }
+   } else if (title instanceof Number) {
+     metadata.add("Title", String.valueOf(title));
+   }
+ }
+
+ private void parseSourceDesc(JSONObject sourceDesc, Metadata metadata) {
+ if (sourceDesc.has("biblStruct")) {
+ parseBiblStruct(sourceDesc.getJSONObject("biblStruct"), metadata);
+ }
+ }
+
+ /**
+  * Extracts the authors listed in the analytic section and adds the derived
+  * "Address", "Affiliation", "Authors" and "FullAffiliations" entries.
+  *
+  * The author member is an object for a single author and an array for
+  * several; the entries are now written in both cases (previously only in
+  * the array case). When there is no analytic object an "Error" entry is
+  * added instead.
+  */
+ private void parseBiblStruct(JSONObject biblStruct, Metadata metadata) {
+   if (!biblStruct.has("analytic")
+       || !(biblStruct.get("analytic") instanceof JSONObject)) {
+     metadata.add("Error", "Unable to parse: no analytic section in JSON");
+     return;
+   }
+
+   JSONObject analytic = biblStruct.getJSONObject("analytic");
+   if (!analytic.has("author")) {
+     return;
+   }
+
+   Object authorObj = analytic.get("author");
+   List<Author> authorList = new ArrayList<Author>();
+   if (authorObj instanceof JSONObject) {
+     parseAuthor((JSONObject) authorObj, authorList);
+   } else if (authorObj instanceof JSONArray) {
+     JSONArray authors = (JSONArray) authorObj;
+     for (int i = 0; i < authors.length(); i++) {
+       // Entries that are not objects (e.g. bare strings) carry nothing
+       // this parser understands, so they are skipped.
+       JSONObject author = authors.optJSONObject(i);
+       if (author != null) {
+         parseAuthor(author, authorList);
+       }
+     }
+   }
+
+   if (!authorList.isEmpty()) {
+     metadata.add("Address", getMetadataAddresses(authorList));
+     metadata.add("Affiliation", getMetadataAffiliations(authorList));
+     metadata.add("Authors", getMetadataAuthors(authorList));
+     metadata.add("FullAffiliations",
+         getMetadataFullAffiliations(authorList));
+   }
+ }
+
+ /**
+  * Renders the distinct affiliations of all authors, in first-seen order, as
+  * a bracketed, comma-separated list of their toString() forms.
+  *
+  * The list is built with a leading separator instead of appending the
+  * builder to itself, which used to duplicate the whole content.
+  *
+  * @return e.g. {@code [A,B]}, or {@code []} when there are no affiliations
+  */
+ private String getMetadataFullAffiliations(List<Author> authorList) {
+   List<Affiliation> unique = new ArrayList<Affiliation>();
+   for (Author a : authorList) {
+     for (Affiliation af : a.getAffiliations()) {
+       if (!unique.contains(af)) {
+         unique.add(af);
+       }
+     }
+   }
+
+   StringBuilder metAffils = new StringBuilder("[");
+   for (int i = 0; i < unique.size(); i++) {
+     if (i > 0) {
+       metAffils.append(",");
+     }
+     metAffils.append(unique.get(i).toString());
+   }
+   metAffils.append("]");
+   return metAffils.toString();
+ }
+
+ /**
+  * Builds the author line: for each author the first, middle and surname
+  * (each passed through printOrBlank), followed by the comma-separated
+  * 1-based positions of that author's affiliations within the list of
+  * distinct affiliations, followed by a space.
+  *
+  * Intended output looks like:
+  * Chris A. Mattmann 1, 2 Daniel J. Crichton 1 Nenad Medvidovic 2
+  */
+ private String getMetadataAuthors(List<Author> authorList) {
+   // Distinct affiliations in first-seen order; positions are the markers.
+   List<Affiliation> distinct = new ArrayList<Affiliation>();
+   for (Author author : authorList) {
+     for (Affiliation affil : author.getAffiliations()) {
+       if (distinct.contains(affil)) {
+         continue;
+       }
+       distinct.add(affil);
+     }
+   }
+
+   StringBuilder line = new StringBuilder();
+   for (Author author : authorList) {
+     line.append(printOrBlank(author.getFirstName()))
+         .append(printOrBlank(author.getMiddleName()))
+         .append(printOrBlank(author.getSurName()));
+
+     StringBuilder markers = new StringBuilder();
+     for (int pos = 1; pos <= distinct.size(); pos++) {
+       if (!author.getAffiliations().contains(distinct.get(pos - 1))) {
+         continue;
+       }
+       if (markers.length() > 0) {
+         markers.append(",");
+       }
+       markers.append(pos);
+     }
+
+     line.append(markers.toString()).append(" ");
+   }
+
+   return line.toString();
+ }
+
+ /**
+  * Builds the numbered affiliation line from the distinct affiliations in
+  * first-seen order: "number, space, organisation name", entries separated
+  * by "; ".
+  *
+  * Intended output looks like:
+  * 1 Jet Propulsion Laboratory California Institute of Technology; 2
+  * Computer Science Department University of Southern California
+  */
+ private String getMetadataAffiliations(List<Author> authorList) {
+   List<Affiliation> distinct = new ArrayList<Affiliation>();
+   for (Author author : authorList) {
+     for (Affiliation affil : author.getAffiliations()) {
+       if (distinct.contains(affil)) {
+         continue;
+       }
+       distinct.add(affil);
+     }
+   }
+
+   StringBuilder line = new StringBuilder();
+   int ordinal = 0;
+   for (Affiliation affil : distinct) {
+     ordinal++;
+     line.append(ordinal).append(" ").append(affil.getOrgName().toString());
+     // Drop the last character written so far (the trailing space that
+     // OrgName.toString() leaves after each name part).
+     line.setLength(line.length() - 1);
+     line.append("; ");
+   }
+
+   if (ordinal > 0) {
+     // Remove the separator that follows the final entry.
+     line.setLength(line.length() - 2);
+   }
+
+   return line.toString();
+ }
+
+ /**
+  * Concatenates the distinct affiliation addresses of all authors, in
+  * first-seen order, each followed by a single space.
+  *
+  * Intended output looks like:
+  * "Pasadena, CA 91109, USA Los Angeles, CA 90089, USA"
+  */
+ private String getMetadataAddresses(List<Author> authorList) {
+   List<Address> distinct = new ArrayList<Address>();
+   for (Author author : authorList) {
+     for (Affiliation affil : author.getAffiliations()) {
+       Address candidate = affil.getAddress();
+       if (distinct.contains(candidate)) {
+         continue;
+       }
+       distinct.add(candidate);
+     }
+   }
+
+   StringBuilder joined = new StringBuilder();
+   for (Address address : distinct) {
+     joined.append(address.toString()).append(" ");
+   }
+   return joined.toString();
+ }
+
+ /**
+  * Builds an Author from one JSON author entry and appends it to
+  * {@code authorList}. Forenames, surname and affiliations are only read
+  * when the entry has a persName member; an entry without one still yields
+  * an (empty) Author.
+  */
+ private void parseAuthor(JSONObject authorObj, List<Author> authorList) {
+   Author parsed = new Author();
+
+   if (authorObj.has("persName")) {
+     JSONObject name = authorObj.getJSONObject("persName");
+
+     if (name.has("forename")) {
+       // One forename arrives as an object, several as an array.
+       Object given = name.get("forename");
+       if (given instanceof JSONArray) {
+         JSONArray parts = name.getJSONArray("forename");
+         for (int idx = 0; idx < parts.length(); idx++) {
+           parseNamePart(parts.getJSONObject(idx), parsed);
+         }
+       } else if (given instanceof JSONObject) {
+         parseNamePart((JSONObject) given, parsed);
+       }
+     }
+
+     if (name.has("surname")) {
+       parsed.setSurName(name.getString("surname"));
+     }
+
+     if (authorObj.has("affiliation")) {
+       parseAffiliation(authorObj.get("affiliation"), parsed);
+     }
+   }
+
+   authorList.add(parsed);
+ }
+
+ private void parseNamePart(JSONObject namePart, Author author) {
+ if (namePart.has("type") && namePart.has("content")) {
+ String type = namePart.getString("type");
+ String content = namePart.getString("content");
+
+ if (type.equals("first")) {
+ author.setFirstName(content);
+ }
+
+ if (type.equals("middle")) {
+ author.setMiddleName(content);
+ }
+ }
+ }
+
+ /**
+  * Parses the affiliation member of an author, which is an object for one
+  * affiliation and an array for several. Any other value is ignored.
+  */
+ private void parseAffiliation(Object affiliationJSON, Author author) {
+   if (affiliationJSON instanceof JSONArray) {
+     JSONArray entries = (JSONArray) affiliationJSON;
+     for (int pos = 0; pos < entries.length(); pos++) {
+       parseOneAffiliation(entries.getJSONObject(pos), author);
+     }
+     return;
+   }
+
+   if (affiliationJSON instanceof JSONObject) {
+     parseOneAffiliation((JSONObject) affiliationJSON, author);
+   }
+ }
+
+ /**
+  * Builds one Affiliation (address plus organisation name) from its JSON
+  * form and appends it to the author's affiliations.
+  *
+  * The orgName member is an object for a single name part and an array for
+  * several; the parsed name is now stored on the affiliation in both cases
+  * (previously a single name part was parsed and then dropped).
+  */
+ private void parseOneAffiliation(JSONObject affiliationObj, Author author) {
+
+   Affiliation affiliation = new Affiliation();
+   if (affiliationObj.has("address")) {
+     parseAddress(affiliationObj.getJSONObject("address"), affiliation);
+   }
+
+   if (affiliationObj.has("orgName")) {
+     OrgName orgName = new OrgName();
+     Object orgObject = affiliationObj.get("orgName");
+     if (orgObject instanceof JSONObject) {
+       parseOrgName((JSONObject) orgObject, orgName);
+     } else if (orgObject instanceof JSONArray) {
+       JSONArray orgNames = (JSONArray) orgObject;
+       for (int i = 0; i < orgNames.length(); i++) {
+         parseOrgName(orgNames.getJSONObject(i), orgName);
+       }
+     }
+
+     affiliation.setOrgName(orgName);
+   }
+
+   author.getAffiliations().add(affiliation);
+ }
+
+ /**
+  * Builds an Address from its JSON form and stores it on the affiliation.
+  *
+  * Values are read with get() and String.valueOf() because the XML-to-JSON
+  * conversion turns numeric-looking text (typically post codes) into
+  * numbers, on which getString() throws. String.valueOf() also leaves string
+  * values unquoted, unlike JSONObject.valueToString(), which wrapped
+  * non-numeric post codes in literal double quotes.
+  */
+ private void parseAddress(JSONObject addressObj, Affiliation affiliation) {
+   Address address = new Address();
+
+   if (addressObj.has("region")) {
+     address.setRegion(String.valueOf(addressObj.get("region")));
+   }
+
+   if (addressObj.has("postCode")) {
+     address.setPostCode(String.valueOf(addressObj.get("postCode")));
+   }
+
+   if (addressObj.has("settlement")) {
+     address.setSettlment(String.valueOf(addressObj.get("settlement")));
+   }
+
+   if (addressObj.has("country")) {
+     Country country = new Country();
+     Object countryObj = addressObj.get("country");
+
+     // With attributes the country is an object ("key" attribute plus
+     // "content" text); without them it is a bare string.
+     if (countryObj instanceof JSONObject) {
+       JSONObject countryJson = (JSONObject) countryObj;
+
+       if (countryJson.has("content")) {
+         country.setContent(String.valueOf(countryJson.get("content")));
+       }
+
+       if (countryJson.has("key")) {
+         country.setKey(String.valueOf(countryJson.get("key")));
+       }
+     } else if (countryObj instanceof String) {
+       country.setContent((String) countryObj);
+     }
+     address.setCountry(country);
+   }
+
+   affiliation.setAddress(address);
+ }
+
+ private void parseOrgName(JSONObject orgObj, OrgName orgName) {
+ OrgTypeName typeName = new OrgTypeName();
+ if (orgObj.has("content")) {
+ typeName.setName(orgObj.getString("content"));
+ }
+
+ if (orgObj.has("type")) {
+ typeName.setType(orgObj.getString("type"));
+ }
+
+ orgName.getTypeNames().add(typeName);
+ }
+
+ /**
+  * Extracts the abstract and the keywords from profileDesc.
+  *
+  * The abstract text is read from the "p" member inside "abstract"
+  * (previously "p" was looked up on profileDesc itself, next to "abstract",
+  * where it does not normally occur; that lookup is kept as a fallback).
+  * Keywords are accepted as a bare string, a single term or an array of
+  * terms, and are added unquoted in every case.
+  */
+ private void parseProfileDesc(JSONObject profileDesc, Metadata metadata) {
+   if (profileDesc.has("abstract")) {
+     Object abstractObj = profileDesc.get("abstract");
+     if (abstractObj instanceof JSONObject
+         && ((JSONObject) abstractObj).has("p")) {
+       addAbstractParagraphs(((JSONObject) abstractObj).get("p"), metadata);
+     } else if (abstractObj instanceof String
+         && ((String) abstractObj).length() > 0) {
+       metadata.add("Abstract", (String) abstractObj);
+     } else if (profileDesc.has("p")) {
+       metadata.add("Abstract", profileDesc.getString("p"));
+     }
+   }
+
+   if (profileDesc.has("textClass")) {
+     JSONObject textClass = profileDesc.getJSONObject("textClass");
+
+     if (textClass.has("keywords")) {
+       Object keywordsObj = textClass.get("keywords");
+       if (keywordsObj instanceof String) {
+         metadata.add("Keyword", (String) keywordsObj);
+       } else if (keywordsObj instanceof JSONObject) {
+         JSONObject keywords = (JSONObject) keywordsObj;
+         if (keywords.has("term")) {
+           addKeywordTerms(keywords.get("term"), metadata);
+         }
+       }
+     }
+   }
+ }
+
+ /**
+  * Adds the abstract: several paragraphs (an array) are joined with
+  * newlines into one value, a single paragraph is added as is.
+  */
+ private void addAbstractParagraphs(Object paragraphs, Metadata metadata) {
+   if (paragraphs instanceof JSONArray) {
+     JSONArray parts = (JSONArray) paragraphs;
+     StringBuilder text = new StringBuilder();
+     for (int i = 0; i < parts.length(); i++) {
+       if (i > 0) {
+         text.append("\n");
+       }
+       text.append(String.valueOf(parts.get(i)));
+     }
+     metadata.add("Abstract", text.toString());
+   } else {
+     metadata.add("Abstract", String.valueOf(paragraphs));
+   }
+ }
+
+ /**
+  * Adds one "Keyword" value per term; a lone term arrives as a scalar
+  * rather than as an array.
+  */
+ private void addKeywordTerms(Object terms, Metadata metadata) {
+   if (terms instanceof JSONArray) {
+     JSONArray termArr = (JSONArray) terms;
+     for (int i = 0; i < termArr.length(); i++) {
+       metadata.add("Keyword", String.valueOf(termArr.get(i)));
+     }
+   } else {
+     metadata.add("Keyword", String.valueOf(terms));
+   }
+ }
+
+ /**
+  * Returns the value followed by one space, or a single space when the
+  * value is null or empty.
+  */
+ private String printOrBlank(String val) {
+   if (val == null || val.length() == 0) {
+     return " ";
+   }
+   return val + " ";
+ }
+
+ /**
+  * One author of the document: name parts plus the affiliations listed for
+  * that author. Name parts stay null until set.
+  */
+ class Author {
+
+   private String surName;
+
+   private String middleName;
+
+   private String firstName;
+
+   private List<Affiliation> affiliations;
+
+   public Author() {
+     this.surName = null;
+     this.middleName = null;
+     this.firstName = null;
+     this.affiliations = new ArrayList<Affiliation>();
+   }
+
+   /**
+    * @return the surName
+    */
+   public String getSurName() {
+     return surName;
+   }
+
+   /**
+    * @param surName
+    *          the surName to set
+    */
+   public void setSurName(String surName) {
+     this.surName = surName;
+   }
+
+   /**
+    * @return the middleName
+    */
+   public String getMiddleName() {
+     return middleName;
+   }
+
+   /**
+    * @param middleName
+    *          the middleName to set
+    */
+   public void setMiddleName(String middleName) {
+     this.middleName = middleName;
+   }
+
+   /**
+    * @return the firstName
+    */
+   public String getFirstName() {
+     return firstName;
+   }
+
+   /**
+    * @param firstName
+    *          the firstName to set
+    */
+   public void setFirstName(String firstName) {
+     this.firstName = firstName;
+   }
+
+   /**
+    * @return the affiliations
+    */
+   public List<Affiliation> getAffiliations() {
+     return affiliations;
+   }
+
+   /**
+    * @param affiliations
+    *          the affiliations to set
+    */
+   public void setAffiliations(List<Affiliation> affiliations) {
+     this.affiliations = affiliations;
+   }
+
+   /**
+    * Lists all fields; a null middle name is shown as empty. The conditional
+    * is parenthesised because "+" binds tighter than "!=" and "?:"; without
+    * the parentheses the whole method returned just the middle name.
+    */
+   @Override
+   public String toString() {
+     return "Author [surName=" + surName + ", middleName="
+         + (middleName != null ? middleName : "") + ", firstName=" + firstName
+         + ", affiliations=" + affiliations + "]";
+   }
+
+ }
+
+ class Affiliation {
+
+ private OrgName orgName;
+
+ private Address address;
+
+ public Affiliation() {
+ this.orgName = new OrgName();
+ this.address = new Address();
+ }
+
+ /**
+ * @return the orgName
+ */
+ public OrgName getOrgName() {
+ return orgName;
+ }
+
+ /**
+ * @param orgName
+ * the orgName to set
+ */
+ public void setOrgName(OrgName orgName) {
+ this.orgName = orgName;
+ }
+
+ /**
+ * @return the address
+ */
+ public Address getAddress() {
+ return address;
+ }
+
+ /**
+ * @param address
+ * the address to set
+ */
+ public void setAddress(Address address) {
+ this.address = address;
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see java.lang.Object#equals(java.lang.Object)
+ */
+ @Override
+ public boolean equals(Object obj) {
+ Affiliation otherA = (Affiliation) obj;
+ return this.getAddress().equals(otherA.getAddress())
+ && this.getOrgName().equals(otherA.getOrgName());
+
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see java.lang.Object#toString()
+ */
+ @Override
+ public String toString() {
+ return "Affiliation {orgName=" + orgName + ", address=" + address + "}";
+ }
+
+ }
+
+ class OrgName {
+ private List<OrgTypeName> typeNames;
+
+ public OrgName() {
+ this.typeNames = new ArrayList<OrgTypeName>();
+ }
+
+ /**
+ * @return the typeNames
+ */
+ public List<OrgTypeName> getTypeNames() {
+ return typeNames;
+ }
+
+ /**
+ * @param typeNames
+ * the typeNames to set
+ */
+ public void setTypeNames(List<OrgTypeName> typeNames) {
+ this.typeNames = typeNames;
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see java.lang.Object#toString()
+ */
+
+ @Override
+ public String toString() {
+ StringBuilder builder = new StringBuilder();
+ for (OrgTypeName on : this.typeNames) {
+ builder.append(on.getName());
+ builder.append(" ");
+ }
+ return builder.toString();
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see java.lang.Object#equals(java.lang.Object)
+ */
+ @Override
+ public boolean equals(Object obj) {
+ OrgName otherA = (OrgName) obj;
+
+ if (otherA.getTypeNames() != null) {
+ if (this.typeNames == null) {
+ return false;
+ } else {
+ return this.typeNames.size() == otherA.getTypeNames().size();
+ }
+ } else {
+ if (this.typeNames == null) {
+ return true;
+ } else
+ return false;
+ }
+
+ }
+
+ }
+
+ class OrgTypeName {
+ private String name;
+ private String type;
+
+ public OrgTypeName() {
+ this.name = null;
+ this.type = null;
+ }
+
+ /**
+ * @return the name
+ */
+ public String getName() {
+ return name;
+ }
+
+ /**
+ * @param name
+ * the name to set
+ */
+ public void setName(String name) {
+ this.name = name;
+ }
+
+ /**
+ * @return the type
+ */
+ public String getType() {
+ return type;
+ }
+
+ /**
+ * @param type
+ * the type to set
+ */
+ public void setType(String type) {
+ this.type = type;
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see java.lang.Object#equals(java.lang.Object)
+ */
+ @Override
+ public boolean equals(Object obj) {
+ OrgTypeName otherOrgName = (OrgTypeName) obj;
+ return this.type.equals(otherOrgName.getType())
+ && this.name.equals(otherOrgName.getName());
+ }
+
+ }
+
+ private class Address {
+
+ private String region;
+ private String postCode;
+ private String settlment;
+ private Country country;
+
+ public Address() {
+ this.region = null;
+ this.postCode = null;
+ this.settlment = null;
+ this.country = new Country();
+ }
+
+ /**
+ * @return the region
+ */
+ public String getRegion() {
+ return region;
+ }
+
+ /**
+ * @param region
+ * the region to set
+ */
+ public void setRegion(String region) {
+ this.region = region;
+ }
+
+ /**
+ * @return the postCode
+ */
+ public String getPostCode() {
+ return postCode;
+ }
+
+ /**
+ * @param postCode
+ * the postCode to set
+ */
+ public void setPostCode(String postCode) {
+ this.postCode = postCode;
+ }
+
+ /**
+ * @return the settlment
+ */
+ public String getSettlment() {
+ return settlment;
+ }
+
+ /**
+ * @param settlment
+ * the settlment to set
+ */
+ public void setSettlment(String settlment) {
+ this.settlment = settlment;
+ }
+
+ /**
+ * @return the country
+ */
+ public Country getCountry() {
+ return country;
+ }
+
+ /**
+ * @param country
+ * the country to set
+ */
+ public void setCountry(Country country) {
+ this.country = country;
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see java.lang.Object#equals(java.lang.Object)
+ */
+ @Override
+ public boolean equals(Object obj) {
+ Address otherA = (Address) obj;
+ if (this.settlment == null) {
+ return otherA.getSettlment() == null;
+ } else if (this.country == null) {
+ return otherA.getCountry() == null;
+ } else if (this.postCode == null) {
+ return otherA.getPostCode() == null;
+ } else if (this.region == null) {
+ return otherA.getRegion() == null;
+ }
+
+ return this.settlment.equals(otherA.getSettlment())
+ && this.country.equals(otherA.getCountry())
+ && this.postCode.equals(otherA.getPostCode())
+ && this.region.equals(otherA.getRegion());
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see java.lang.Object#toString()
+ */
+ @Override
+ public String toString() {
+ StringBuilder builder = new StringBuilder();
+ builder.append(settlment);
+ builder.append(", ");
+ builder.append(region);
+ builder.append(" ");
+ builder.append(postCode);
+ builder.append(" ");
+ builder.append(country.getContent());
+ return builder.toString();
+ }
+ }
+
+ private class Country {
+ private String key;
+ private String content;
+
+ public Country() {
+ this.key = null;
+ this.content = null;
+ }
+
+ /**
+ * @return the key
+ */
+ public String getKey() {
+ return key;
+ }
+
+ /**
+ * @param key
+ * the key to set
+ */
+ public void setKey(String key) {
+ this.key = key;
+ }
+
+ /**
+ * @return the content
+ */
+ public String getContent() {
+ return content;
+ }
+
+ /**
+ * @param content
+ * the content to set
+ */
+ public void setContent(String content) {
+ this.content = content;
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see java.lang.Object#equals(java.lang.Object)
+ */
+ @Override
+ public boolean equals(Object obj) {
+ Country otherC = (Country) obj;
+
+ if (this.key == null) {
+ if (otherC.getKey() != null) {
+ return false;
+ } else {
+ if (this.content == null) {
+ if (otherC.getContent() != null) {
+ return false;
+ } else {
+ return true;
+ }
+ } else {
+ return content.equals(otherC.getContent());
+ }
+ }
+ } else {
+ if (this.content == null) {
+ if (otherC.getContent() != null) {
+ return false;
+ } else {
+ return this.key.equals(otherC.getKey());
+ }
+ } else {
+ return this.key.equals(otherC.getKey())
+ && this.content.equals(otherC.getContent());
+ }
+ }
+ }
+
+ }
+}
Modified: tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1696191&r1=1696190&r2=1696191&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (original)
+++ tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Mon Aug 17 04:10:11 2015
@@ -66,3 +66,4 @@ org.apache.tika.parser.isatab.ISArchiveP
org.apache.tika.parser.geoinfo.GeographicInformationParser
org.apache.tika.parser.geo.topic.GeoParser
org.apache.tika.parser.external.CompositeExternalParser
+org.apache.tika.parser.journal.JournalParser
\ No newline at end of file
Added: tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/journal/GrobidExtractor.properties
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/journal/GrobidExtractor.properties?rev=1696191&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/journal/GrobidExtractor.properties (added)
+++ tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/journal/GrobidExtractor.properties Mon Aug 17 04:10:11 2015
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+grobid.server.url=http://localhost:8080
Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/journal/JournalParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/journal/JournalParserTest.java?rev=1696191&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/journal/JournalParserTest.java (added)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/journal/JournalParserTest.java Mon Aug 17 04:10:11 2015
@@ -0,0 +1,52 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.journal;
+
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.fail;
+import static org.junit.Assume.assumeTrue;
+import static org.apache.tika.parser.journal.GrobidRESTParser.canRun;
+import java.io.InputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Runs JournalParser against a sample PDF. The test is skipped (not failed)
+ * when no GROBID server is reachable.
+ */
+public class JournalParserTest {
+
+  @Test
+  public void testJournalParser() {
+    // Must name the document that actually ships with the test resources.
+    String path = "/test-documents/testJournalParser.pdf";
+    ContentHandler handler = new BodyContentHandler();
+    Metadata metadata = new Metadata();
+
+    assumeTrue(canRun());
+
+    InputStream stream = JournalParserTest.class.getResourceAsStream(path);
+    // Fail with a clear message rather than deep inside the parser.
+    assertNotNull(stream);
+
+    JournalParser jParser = new JournalParser();
+    try {
+      jParser.parse(stream, handler, metadata, new ParseContext());
+    } catch (Exception e) {
+      e.printStackTrace();
+      fail(e.getMessage());
+    }
+
+    assertNotNull(metadata.get("grobid:header_Title"));
+  }
+}
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testJournalParser.pdf
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testJournalParser.pdf?rev=1696191&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testJournalParser.pdf
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream