You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by bo...@apache.org on 2016/01/06 04:50:57 UTC
svn commit: r1723223 [4/32] - in /tika/branches/2.x:
tika-core/src/test/resources/META-INF/
tika-core/src/test/resources/META-INF/services/ tika-parser-modules/
tika-parser-modules/tika-advanced-module/
tika-parser-modules/tika-advanced-module/src/ tik...
Added: tika/branches/2.x/tika-parser-modules/tika-database-module/src/main/java/org/apache/tika/parser/jdbc/SQLite3Parser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-database-module/src/main/java/org/apache/tika/parser/jdbc/SQLite3Parser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-database-module/src/main/java/org/apache/tika/parser/jdbc/SQLite3Parser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-database-module/src/main/java/org/apache/tika/parser/jdbc/SQLite3Parser.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,80 @@
+package org.apache.tika.parser.jdbc;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * This is the main class for parsing SQLite3 files. When {@link #parse} is called,
+ * this creates a new {@link org.apache.tika.parser.jdbc.SQLite3DBParser} and
+ * delegates the parse to it.
+ * <p>
+ * Given potential conflicts of native libraries in web servers, users will
+ * need to add org.xerial's sqlite-jdbc jar to the class path for this parser
+ * to work. For development and testing, this jar is specified in tika-parsers'
+ * pom.xml, but it is currently set to "provided."
+ * <p>
+ * Note that this family of jdbc parsers is designed to treat each CLOB and each BLOB
+ * as embedded documents.
+ */
+public class SQLite3Parser extends AbstractParser {
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = -752276948656079347L;
+
+    private static final MediaType MEDIA_TYPE = MediaType.application("x-sqlite3");
+
+    /**
+     * Types returned by {@link #getSupportedTypes(ParseContext)}; decided once,
+     * in the constructor. The name is kept as-is because it is part of the
+     * serialized form of this class.
+     */
+    private final Set<MediaType> SUPPORTED_TYPES;
+
+    /**
+     * Checks to see if class is available for org.sqlite.JDBC.
+     * <p>
+     * If not, this class will return an empty set for getSupportedTypes()
+     */
+    public SQLite3Parser() {
+        Set<MediaType> tmp;
+        try {
+            Class.forName(SQLite3DBParser.SQLITE_CLASS_NAME);
+            tmp = Collections.singleton(MEDIA_TYPE);
+        } catch (ClassNotFoundException e) {
+            //driver is not on the class path: advertise no types.
+            //emptySet() is the type-safe equivalent of the raw EMPTY_SET constant.
+            tmp = Collections.emptySet();
+        }
+        SUPPORTED_TYPES = tmp;
+    }
+
+    /**
+     * @param context parse context (not consulted)
+     * @return the SQLite3 media type if the JDBC driver class could be loaded
+     *         when this parser was constructed, otherwise an empty set
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Delegates to a newly created {@link SQLite3DBParser}; a new instance is
+     * created on every call.
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
+        SQLite3DBParser p = new SQLite3DBParser();
+        p.parse(stream, handler, metadata, context);
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-database-module/src/main/java/org/apache/tika/parser/jdbc/SQLite3TableReader.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-database-module/src/main/java/org/apache/tika/parser/jdbc/SQLite3TableReader.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-database-module/src/main/java/org/apache/tika/parser/jdbc/SQLite3TableReader.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-database-module/src/main/java/org/apache/tika/parser/jdbc/SQLite3TableReader.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,109 @@
+package org.apache.tika.parser.jdbc;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.sql.Blob;
+import java.sql.Connection;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.text.DateFormat;
+import java.text.SimpleDateFormat;
+import java.util.Locale;
+
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+
+/**
+ * Concrete class for SQLite table parsing. This overrides
+ * column type handling from {@link JDBCTableReader}.
+ * <p>
+ * This class is not designed to be thread safe (because of DateFormat)!
+ * Need to call a new instance for each parse, as AbstractDBParser does.
+ * <p>
+ * For now, this silently skips cells of type CLOB, because xerial's jdbc connector
+ * does not currently support them.
+ */
+class SQLite3TableReader extends JDBCTableReader {
+
+
+    //SimpleDateFormat is not thread safe; see the class comment.
+    //No time zone is set, so SimpleDateFormat falls back to the JVM default zone.
+    DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd", Locale.ROOT);
+
+    public SQLite3TableReader(Connection connection, String tableName, ParseContext context) {
+        super(connection, tableName, context);
+    }
+
+
+    /**
+     * No-op for now in {@link SQLite3TableReader}: CLOB cells are skipped
+     * and nothing is written to the handler.
+     *
+     * @param tableName   name of the table being read (unused)
+     * @param fieldName   name of the CLOB column (unused)
+     * @param rowNum      row number of the cell (unused)
+     * @param resultSet   result set positioned on the current row (unused)
+     * @param columnIndex index of the CLOB column (unused)
+     * @param handler     content handler (unused)
+     * @param context     parse context (unused)
+     * @throws java.sql.SQLException    declared by the overridden method; never thrown here
+     * @throws java.io.IOException      declared by the overridden method; never thrown here
+     * @throws org.xml.sax.SAXException declared by the overridden method; never thrown here
+     */
+    @Override
+    protected void handleClob(String tableName, String fieldName, int rowNum,
+                              ResultSet resultSet, int columnIndex,
+                              ContentHandler handler, ParseContext context) throws SQLException, IOException, SAXException {
+        //no-op for now.
+    }
+
+    /**
+     * The jdbc connection to Sqlite does not yet implement blob, have to getBytes().
+     *
+     * @param resultSet   resultSet
+     * @param columnIndex columnIndex for blob
+     * @param blob        ignored; the bytes are read from the result set instead
+     * @param m           metadata handed to {@link TikaInputStream#get(byte[], Metadata)}
+     * @return a stream over the bytes of the given column
+     * @throws java.sql.SQLException if the bytes cannot be read from the result set
+     */
+    @Override
+    protected InputStream getInputStreamFromBlob(ResultSet resultSet, int columnIndex, Blob blob, Metadata m) throws SQLException {
+        return TikaInputStream.get(resultSet.getBytes(columnIndex), m);
+    }
+
+    /**
+     * Writes an integer cell, formatting TIMESTAMP-typed columns as a
+     * <code>yyyy-MM-dd</code> date.
+     *
+     * @param columnTypeName column type name as reported by the driver
+     * @param rs             result set positioned on the current row
+     * @param columnIndex    index of the column to read
+     * @param handler        handler that receives the cell text
+     */
+    @Override
+    protected void handleInteger(String columnTypeName, ResultSet rs, int columnIndex,
+                                 ContentHandler handler) throws SQLException, SAXException {
+        //As of this writing, with xerial's sqlite jdbc connector, a timestamp is
+        //stored as a column of type Integer, but the columnTypeName is TIMESTAMP, and the
+        //value is a string representing a Long.
+        if ("TIMESTAMP".equals(columnTypeName)) {
+            String raw = rs.getString(columnIndex);
+            //SQL NULL: leave the cell empty instead of failing in Long.parseLong
+            if (raw != null) {
+                addAllCharacters(parseDateFromLongString(raw), handler);
+            }
+        } else {
+            addAllCharacters(Integer.toString(rs.getInt(columnIndex)), handler);
+        }
+
+    }
+
+    /**
+     * Formats a string holding epoch milliseconds as <code>yyyy-MM-dd</code>.
+     * A value that is not a parseable long is returned unchanged, so that one
+     * unexpected cell does not abort the whole parse with an unchecked exception.
+     */
+    private String parseDateFromLongString(String longString) {
+        try {
+            java.sql.Date d = new java.sql.Date(Long.parseLong(longString));
+            return dateFormat.format(d);
+        } catch (NumberFormatException e) {
+            return longString;
+        }
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-database-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-database-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-database-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (added)
+++ tika/branches/2.x/tika-parser-modules/tika-database-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Wed Jan 6 03:50:50 2016
@@ -0,0 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+#org.apache.tika.parser.jdbc.SQLite3DBParser
+org.apache.tika.parser.jdbc.SQLite3Parser
Added: tika/branches/2.x/tika-parser-modules/tika-database-module/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-database-module/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-database-module/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-database-module/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,356 @@
+package org.apache.tika.parser.jdbc;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.TikaTest;
+import org.apache.tika.extractor.EmbeddedResourceHandler;
+import org.apache.tika.extractor.ParserContainerExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Database;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.sax.BasicContentHandlerFactory;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.ToXMLContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class SQLite3ParserTest extends TikaTest {
+ private final static String TEST_FILE_NAME = "testSqlite3b.db";
+ private final static String TEST_FILE1 = "/test-documents/" + TEST_FILE_NAME;
+
+ @Test
+ public void testBasic() throws Exception {
+     Parser p = new AutoDetectParser();
+
+     //test different types of input streams
+     //actual inputstream, memory buffered bytearray and literal file
+     InputStream[] streams = new InputStream[3];
+     streams[0] = getResourceAsStream(TEST_FILE1);
+     ByteArrayOutputStream bos = new ByteArrayOutputStream();
+     //this stream only feeds the in-memory copy; close it once copied
+     try (InputStream src = getResourceAsStream(TEST_FILE1)) {
+         IOUtils.copy(src, bos);
+     }
+     streams[1] = new ByteArrayInputStream(bos.toByteArray());
+     streams[2] = TikaInputStream.get(getResourceAsFile(TEST_FILE1));
+     int tests = 0;
+     for (InputStream stream : streams) {
+         Metadata metadata = new Metadata();
+         metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME);
+         //1) getXML closes the stream
+         //2) getXML runs recursively on the contents, so the embedded docs should show up
+         XMLResult result = getXML(stream, p, metadata);
+         String x = result.xml;
+         //first table name
+         assertContains("<table name=\"my_table1\"><thead><tr>\t<th>INT_COL</th>", x);
+         //non-ascii
+         assertContains("<td>普林斯顿大学</td>", x);
+         //boolean
+         assertContains("<td>true</td>\t<td>2015-01-02</td>", x);
+         //date test
+         assertContains("2015-01-04", x);
+         //timestamp test
+         assertContains("2015-01-03 15:17:03", x);
+         //first embedded doc's image tag
+         assertContains("alt=\"image1.png\"", x);
+         //second embedded doc's image tag
+         assertContains("alt=\"A description...\"", x);
+         //second table name
+         assertContains("<table name=\"my_table2\"><thead><tr>\t<th>INT_COL2</th>", x);
+
+         Metadata post = result.metadata;
+         String[] tableNames = post.getValues(Database.TABLE_NAME);
+         assertEquals(2, tableNames.length);
+         assertEquals("my_table1", tableNames[0]);
+         assertEquals("my_table2", tableNames[1]);
+         tests++;
+     }
+     //guards against the loop silently running fewer times than expected
+     assertEquals(3, tests);
+ }
+
+ //table cells and rows have to be marked up so that a body
+ //handler emits \t between cells and \n at the end of rows
+ @Test
+ public void testSpacesInBodyContentHandler() throws Exception {
+     Parser autoDetect = new AutoDetectParser();
+
+     Metadata meta = new Metadata();
+     meta.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME);
+
+     ContentHandler bodyHandler = new BodyContentHandler(-1);
+
+     //register the parser so that embedded documents get parsed too
+     ParseContext parseContext = new ParseContext();
+     parseContext.set(Parser.class, autoDetect);
+
+     try (InputStream in = getResourceAsStream(TEST_FILE1)) {
+         autoDetect.parse(in, bodyHandler, meta, parseContext);
+     }
+
+     String text = bodyHandler.toString();
+     //tab between cells
+     assertContains("0\t2.3\t2.4\tlorem", text);
+     //newline after a row
+     assertContains("tempor\n", text);
+ }
+
+ //test what happens if the user forgets to pass in a parser via context
+ //to handle embedded documents
+ @Test
+ public void testNotAddingEmbeddedParserToParseContext() throws Exception {
+     Parser p = new AutoDetectParser();
+
+     Metadata metadata = new Metadata();
+     metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME);
+     ContentHandler handler = new ToXMLContentHandler();
+     //deliberately an empty ParseContext: no Parser is registered for embedded docs
+     try (InputStream is = getResourceAsStream(TEST_FILE1)) {
+         p.parse(is, handler, metadata, new ParseContext());
+     }
+     String xml = handler.toString();
+     //just includes headers for embedded documents
+     assertContains("<table name=\"my_table1\"><thead><tr>", xml);
+     assertContains("<td><span type=\"blob\" column_name=\"BYTES_COL\" row_number=\"0\"><div class=\"package-entry\"><h1>BYTES_COL_0.doc</h1>", xml);
+     //but no other content
+     assertNotContained("dog", xml);
+     assertNotContained("alt=\"image1.png\"", xml);
+     //second embedded doc's image tag
+     assertNotContained("alt=\"A description...\"", xml);
+ }
+
+ @Test
+ public void testRecursiveParserWrapper() throws Exception {
+     Parser p = new AutoDetectParser();
+
+     RecursiveParserWrapper wrapper =
+             new RecursiveParserWrapper(p, new BasicContentHandlerFactory(
+                     BasicContentHandlerFactory.HANDLER_TYPE.BODY, -1));
+     Metadata metadata = new Metadata();
+     metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME);
+     try (InputStream is = getResourceAsStream(TEST_FILE1)) {
+         wrapper.parse(is, new BodyContentHandler(-1), metadata, new ParseContext());
+     }
+     List<Metadata> metadataList = wrapper.getMetadata();
+     assertEquals(5, metadataList.size());
+     //make sure the \t are inserted in a body handler
+
+     String table = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
+     assertContains("0\t2.3\t2.4\tlorem", table);
+     assertContains("普林斯顿大学", table);
+
+     //make sure the \n is inserted
+     //(both checks read the content stored under index 0)
+     String table2 = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
+     assertContains("do eiusmod tempor\n", table2);
+
+     assertContains("The quick brown fox", metadataList.get(2).get(RecursiveParserWrapper.TIKA_CONTENT));
+     assertContains("The quick brown fox", metadataList.get(4).get(RecursiveParserWrapper.TIKA_CONTENT));
+
+     //confirm .doc was added to blob
+     assertEquals("/BYTES_COL_0.doc/image1.png", metadataList.get(1).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH));
+ }
+
+ @Test
+ public void testParserContainerExtractor() throws Exception {
+ //There should be 6 embedded documents:
+ //2x tables -- UTF-8 csv representations of the tables
+ //2x word files, one doc and one docx
+ //2x png files, the same image embedded in each of the doc and docx
+
+ ParserContainerExtractor ex = new ParserContainerExtractor();
+ ByteCopyingHandler byteCopier = new ByteCopyingHandler();
+ InputStream is = getResourceAsStream(TEST_FILE1);
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME);
+ ex.extract(TikaInputStream.get(is), ex, byteCopier);
+
+ assertEquals(4, byteCopier.bytes.size());
+ String[] strings = new String[4];
+ for (int i = 1; i < byteCopier.bytes.size(); i++) {
+ byte[] byteArr = byteCopier.bytes.get(i);
+ String s = new String(byteArr, 0, Math.min(byteArr.length, 1000), UTF_8);
+ strings[i] = s;
+ }
+ byte[] oleBytes = new byte[]{
+ (byte) -48,
+ (byte) -49,
+ (byte) 17,
+ (byte) -32,
+ (byte) -95,
+ (byte) -79,
+ (byte) 26,
+ (byte) -31,
+ (byte) 0,
+ (byte) 0,
+ };
+ //test OLE
+ for (int i = 0; i < 10; i++) {
+ assertEquals(oleBytes[i], byteCopier.bytes.get(0)[i]);
+ }
+ assertContains("PNG", strings[1]);
+ assertContains("PK", strings[2]);
+ assertContains("PNG", strings[3]);
+ }
+
+ //This confirms that reading the stream twice is not
+ //quadrupling the number of attachments.
+ @Test
+ public void testInputStreamReset() throws Exception {
+ //There should be 8 embedded documents:
+ //4x word files, two docs and two docxs
+ //4x png files, the same image embedded in each of the doc and docx
+ //(InputStreamResettingHandler copies every stream it is handed twice,
+ //adding one entry to its byte list per copy)
+
+ ParserContainerExtractor ex = new ParserContainerExtractor();
+ InputStreamResettingHandler byteCopier = new InputStreamResettingHandler();
+ InputStream is = getResourceAsStream(TEST_FILE1);
+ //NOTE(review): metadata is populated but never passed to extract() -- confirm whether it is needed
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME);
+ ex.extract(TikaInputStream.get(is), ex, byteCopier);
+ //NOTE(review): reset() is called on the raw resource stream, which this test never marks -- confirm intent
+ is.reset();
+ assertEquals(8, byteCopier.bytes.size());
+ }
+
+
+ public static class InputStreamResettingHandler implements EmbeddedResourceHandler {
+
+     public List<byte[]> bytes = new ArrayList<byte[]>();
+
+     /**
+      * Copies the full content of the embedded stream twice, resetting the
+      * stream after each pass, and stores one byte array per pass in
+      * {@link #bytes}. An IOException ends the handling quietly.
+      */
+     @Override
+     public void handle(String filename, MediaType mediaType,
+                        InputStream stream) {
+         //wrap streams that cannot be marked so that reset() is possible
+         InputStream resettable = stream.markSupported() ? stream : TikaInputStream.get(stream);
+         resettable.mark(1000000);
+         try {
+             //two identical passes: read everything, record it, rewind
+             for (int pass = 0; pass < 2; pass++) {
+                 ByteArrayOutputStream copy = new ByteArrayOutputStream();
+                 IOUtils.copy(resettable, copy);
+                 bytes.add(copy.toByteArray());
+                 resettable.reset();
+             }
+         } catch (IOException e) {
+             //swallow
+         }
+     }
+ }
+
+ //code used for creating the test file
+/*
+ private Connection getConnection(String dbFileName) throws Exception {
+ File testDirectory = new File(this.getClass().getResource("/test-documents").toURI());
+ System.out.println("Writing to: " + testDirectory.getAbsolutePath());
+ File testDB = new File(testDirectory, dbFileName);
+ Connection c = null;
+ try {
+ Class.forName("org.sqlite.JDBC");
+ c = DriverManager.getConnection("jdbc:sqlite:" + testDB.getAbsolutePath());
+ } catch ( Exception e ) {
+ System.err.println( e.getClass().getName() + ": " + e.getMessage() );
+ System.exit(0);
+ }
+ return c;
+ }
+
+ @Test
+ public void testCreateDB() throws Exception {
+ Connection c = getConnection("testSQLLite3b.db");
+ Statement st = c.createStatement();
+ String sql = "DROP TABLE if exists my_table1";
+ st.execute(sql);
+ sql = "CREATE TABLE my_table1 (" +
+ "INT_COL INT PRIMARY KEY, "+
+ "FLOAT_COL FLOAT, " +
+ "DOUBLE_COL DOUBLE, " +
+ "CHAR_COL CHAR(30), "+
+ "VARCHAR_COL VARCHAR(30), "+
+ "BOOLEAN_COL BOOLEAN,"+
+ "DATE_COL DATE,"+
+ "TIME_STAMP_COL TIMESTAMP,"+
+ "BYTES_COL BYTES" +
+ ")";
+ st.execute(sql);
+ sql = "insert into my_table1 (INT_COL, FLOAT_COL, DOUBLE_COL, CHAR_COL, " +
+ "VARCHAR_COL, BOOLEAN_COL, DATE_COL, TIME_STAMP_COL, BYTES_COL) " +
+ "values (?,?,?,?,?,?,?,?,?)";
+ SimpleDateFormat f = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ java.util.Date d = f.parse("2015-01-03 15:17:03");
+ System.out.println(d.getTime());
+ long d1Long = 1420229823000L;// 2015-01-02 15:17:03
+ long d2Long = 1420316223000L;// 2015-01-03 15:17:03
+ PreparedStatement ps = c.prepareStatement(sql);
+ ps.setInt(1, 0);
+ ps.setFloat(2, 2.3f);
+ ps.setDouble(3, 2.4d);
+ ps.setString(4, "lorem");
+ ps.setString(5, "普林斯顿大学");
+ ps.setBoolean(6, true);
+ ps.setString(7, "2015-01-02");
+ ps.setString(8, "2015-01-03 15:17:03");
+// ps.setClob(9, new StringReader(clobString));
+ ps.setBytes(9, getByteArray(this.getClass().getResourceAsStream("/test-documents/testWORD_1img.doc")));//contains "quick brown fox"
+ ps.executeUpdate();
+ ps.clearParameters();
+
+ ps.setInt(1, 1);
+ ps.setFloat(2, 4.6f);
+ ps.setDouble(3, 4.8d);
+ ps.setString(4, "dolor");
+ ps.setString(5, "sit");
+ ps.setBoolean(6, false);
+ ps.setString(7, "2015-01-04");
+ ps.setString(8, "2015-01-03 15:17:03");
+ //ps.setClob(9, new StringReader("consectetur adipiscing elit"));
+ ps.setBytes(9, getByteArray(this.getClass().getResourceAsStream("/test-documents/testWORD_1img.docx")));//contains "The end!"
+
+ ps.executeUpdate();
+
+ //build table2
+ sql = "DROP TABLE if exists my_table2";
+ st.execute(sql);
+
+ sql = "CREATE TABLE my_table2 (" +
+ "INT_COL2 INT PRIMARY KEY, "+
+ "VARCHAR_COL2 VARCHAR(64))";
+ st.execute(sql);
+ sql = "INSERT INTO my_table2 values(0,'sed, do eiusmod tempor')";
+ st.execute(sql);
+ sql = "INSERT INTO my_table2 values(1,'incididunt \nut labore')";
+ st.execute(sql);
+
+ c.close();
+ }
+
+ private byte[] getByteArray(InputStream is) throws IOException {
+ ByteArrayOutputStream bos = new ByteArrayOutputStream();
+ byte[] buff = new byte[1024];
+ for (int bytesRead; (bytesRead = is.read(buff)) != -1;) {
+ bos.write(buff, 0, bytesRead);
+ }
+ return bos.toByteArray();
+ }
+
+*/
+
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-ebook-module/pom.xml
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-ebook-module/pom.xml?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-ebook-module/pom.xml (added)
+++ tika/branches/2.x/tika-parser-modules/tika-ebook-module/pom.xml Wed Jan 6 03:50:50 2016
@@ -0,0 +1,56 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-modules</artifactId>
+ <version>2.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>tika-ebook-module</artifactId>
+ <name>Apache Tika e-Book Module</name>
+ <url>http://tika.apache.org/</url>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-text-module</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
+</project>
\ No newline at end of file
Added: tika/branches/2.x/tika-parser-modules/tika-ebook-module/src/main/java/org/apache/tika/parser/epub/EpubContentParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-ebook-module/src/main/java/org/apache/tika/parser/epub/EpubContentParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-ebook-module/src/main/java/org/apache/tika/parser/epub/EpubContentParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-ebook-module/src/main/java/org/apache/tika/parser/epub/EpubContentParser.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.epub;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import javax.xml.XMLConstants;
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.OfflineContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.SAXNotRecognizedException;
+
+/**
+ * Parser for EPUB OPS <code>*.html</code> files.
+ *
+ * For the time being, assume XHTML (TODO: DTBook)
+ */
+public class EpubContentParser extends AbstractParser {
+
+    /**
+     * @param context parse context (not consulted)
+     * @return always an empty set
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return Collections.emptySet(); // not a top-level parser
+    }
+
+    /**
+     * Runs a non-validating, namespace-aware SAX parse of the stream and
+     * forwards the events to an {@link XHTMLContentHandler} wrapped around
+     * the given handler. The stream is shielded from being closed by the
+     * SAX parser.
+     *
+     * @throws TikaException if the SAX parser cannot be configured
+     */
+    @Override
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        final XHTMLContentHandler xhtml =
+                new XHTMLContentHandler(handler, metadata);
+
+        try {
+            SAXParserFactory factory = SAXParserFactory.newInstance();
+            factory.setValidating(false);
+            factory.setNamespaceAware(true);
+            try {
+                factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
+            } catch (SAXNotRecognizedException e) {
+                // TIKA-329: Some XML parsers do not support the secure-processing
+                // feature, even though it's required by JAXP in Java 5. Ignoring
+                // the exception is fine here, deployments without this feature
+                // are inherently vulnerable to XML denial-of-service attacks.
+            }
+            SAXParser parser = factory.newSAXParser();
+            parser.parse(
+                    new CloseShieldInputStream(stream),
+                    new OfflineContentHandler(xhtml));
+        } catch (ParserConfigurationException e) {
+            throw new TikaException("XML parser configuration error", e);
+        }
+    }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-ebook-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-ebook-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-ebook-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-ebook-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.epub;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipInputStream;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.xml.DcXMLParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * Parser for EPUB (and iBooks) containers.
+ *
+ * The archive is read entry by entry: the <code>mimetype</code> entry sets
+ * the content type, <code>metadata.xml</code> and <code>*.opf</code> entries
+ * are handed to the metadata parser, and <code>*.html</code> /
+ * <code>*.xhtml</code> entries are handed to the content parser.
+ */
+public class EpubParser extends AbstractParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = 215176772484050550L;
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                    MediaType.application("epub+zip"),
+                    MediaType.application("x-ibooks+zip")
+            )));
+
+    /** Parser applied to the package metadata entries. */
+    private Parser meta = new DcXMLParser();
+
+    /** Parser applied to each (X)HTML content entry. */
+    private Parser content = new EpubContentParser();
+
+    public Parser getMetaParser() {
+        return meta;
+    }
+
+    public void setMetaParser(Parser meta) {
+        this.meta = meta;
+    }
+
+    public Parser getContentParser() {
+        return content;
+    }
+
+    public void setContentParser(Parser content) {
+        this.content = content;
+    }
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Walks the zip entries of the container in archive order, collecting
+     * metadata and emitting the body content of every (X)HTML entry into a
+     * single XHTML document.
+     *
+     * The zip stream wrapping <code>stream</code> is intentionally not
+     * closed here, since closing it would also close the caller's stream.
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        // Because an EPub file is often made up of multiple XHTML files,
+        // we need explicit control over the start and end of the document
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        ContentHandler childHandler = new EmbeddedContentHandler(
+                new BodyContentHandler(xhtml));
+
+        ZipInputStream zip = new ZipInputStream(stream);
+        ZipEntry entry;
+        while ((entry = zip.getNextEntry()) != null) {
+            String name = entry.getName();
+            if (name.equals("mimetype")) {
+                // The mimetype entry frequently carries a trailing newline
+                // or padding; strip it so the content type is a clean
+                // media type string.
+                String type = IOUtils.toString(zip, UTF_8).trim();
+                metadata.set(Metadata.CONTENT_TYPE, type);
+            } else if (name.equals("metadata.xml") || name.endsWith(".opf")) {
+                meta.parse(zip, new DefaultHandler(), metadata, context);
+            } else if (name.endsWith(".html") || name.endsWith(".xhtml")) {
+                content.parse(zip, childHandler, metadata, context);
+            }
+        }
+
+        // Finish everything
+        xhtml.endDocument();
+    }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-ebook-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-ebook-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-ebook-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (added)
+++ tika/branches/2.x/tika-parser-modules/tika-ebook-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Wed Jan 6 03:50:50 2016
@@ -0,0 +1,17 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+org.apache.tika.parser.epub.EpubParser
Added: tika/branches/2.x/tika-parser-modules/tika-ebook-module/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-ebook-module/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-ebook-module/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-ebook-module/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.epub;
+
+import static org.junit.Assert.assertEquals;
+import static org.apache.tika.TikaTest.assertContains;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Checks metadata and text extraction for the sample EPUB document.
+ */
+public class EpubParserTest {
+
+    private static final String SAMPLE = "/test-documents/testEPUB.epub";
+
+    @Test
+    public void testXMLParser() throws Exception {
+        try (InputStream epub = EpubParserTest.class.getResourceAsStream(SAMPLE)) {
+            Metadata extracted = new Metadata();
+            ContentHandler body = new BodyContentHandler();
+            EpubParser parser = new EpubParser();
+            parser.parse(epub, body, extracted, new ParseContext());
+
+            // Metadata taken from the mimetype entry and the package file
+            assertEquals("application/epub+zip",
+                    extracted.get(Metadata.CONTENT_TYPE));
+            assertEquals("en",
+                    extracted.get(TikaCoreProperties.LANGUAGE));
+            assertEquals("This is an ePub test publication for Tika.",
+                    extracted.get(TikaCoreProperties.DESCRIPTION));
+            assertEquals("Apache",
+                    extracted.get(TikaCoreProperties.PUBLISHER));
+
+            // Text taken from the XHTML content entries
+            String text = body.toString();
+            assertContains("Plus a simple div", text);
+            assertContains("First item", text);
+            assertContains("The previous headings were subchapters", text);
+            assertContains("Table data", text);
+        }
+    }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-ebook-module/src/test/java/org/apache/tika/parser/ibooks/iBooksParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-ebook-module/src/test/java/org/apache/tika/parser/ibooks/iBooksParserTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-ebook-module/src/test/java/org/apache/tika/parser/ibooks/iBooksParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-ebook-module/src/test/java/org/apache/tika/parser/ibooks/iBooksParserTest.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,62 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ibooks;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.epub.EpubParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Checks metadata extraction for the sample iBooks document, which is
+ * handled by {@link EpubParser}.
+ */
+public class iBooksParserTest {
+
+    private static final String SAMPLE = "/test-documents/testiBooks.ibooks";
+
+    @Test
+    public void testiBooksParser() throws Exception {
+        try (InputStream ibooks = iBooksParserTest.class.getResourceAsStream(SAMPLE)) {
+            Metadata extracted = new Metadata();
+            ContentHandler body = new BodyContentHandler();
+            EpubParser parser = new EpubParser();
+            parser.parse(ibooks, body, extracted, new ParseContext());
+
+            assertEquals("application/x-ibooks+zip",
+                    extracted.get(Metadata.CONTENT_TYPE));
+            assertEquals("en-GB",
+                    extracted.get(TikaCoreProperties.LANGUAGE));
+            assertEquals("iBooks Author v1.0",
+                    extracted.get(TikaCoreProperties.CONTRIBUTOR));
+            assertEquals("Apache",
+                    extracted.get(TikaCoreProperties.CREATOR));
+
+            /* TODO: For some reason, the xhtml files in an iBooks-style ePub
+               are not parsed properly, and the content comes back empty.
+               The content checks below stay disabled until that is fixed.
+            String content = handler.toString();
+            System.out.println("content="+content);
+            assertContains("Plus a simple div", content);
+            assertContains("First item", content);
+            assertContains("The previous headings were subchapters", content);
+            assertContains("Table data", content);
+            assertContains("Lorem ipsum dolor rutur amet", content);
+            */
+        }
+    }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-journal-module/pom.xml
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-journal-module/pom.xml?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-journal-module/pom.xml (added)
+++ tika/branches/2.x/tika-parser-modules/tika-journal-module/pom.xml Wed Jan 6 03:50:50 2016
@@ -0,0 +1,76 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-modules</artifactId>
+ <version>2.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>tika-journal-module</artifactId>
+ <name>Apache Tika Journal Module</name>
+ <url>http://tika.apache.org/</url>
+
+ <properties>
+ <cxf.version>3.0.3</cxf.version>
+ </properties>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.cxf</groupId>
+ <artifactId>cxf-rt-rs-client</artifactId>
+ <version>${cxf.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.json</groupId>
+ <artifactId>json</artifactId>
+ <version>20140107</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-pdf-module</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-text-module</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
+</project>
\ No newline at end of file
Added: tika/branches/2.x/tika-parser-modules/tika-journal-module/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-journal-module/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-journal-module/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-journal-module/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,112 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.journal;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.Properties;
+
+import javax.ws.rs.core.MediaType;
+import javax.ws.rs.core.Response;
+
+import org.apache.cxf.jaxrs.client.WebClient;
+import org.apache.cxf.jaxrs.ext.multipart.Attachment;
+import org.apache.cxf.jaxrs.ext.multipart.ContentDisposition;
+import org.apache.cxf.jaxrs.ext.multipart.MultipartBody;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.xml.sax.ContentHandler;
+
+public class GrobidRESTParser {
+
+ private static final String GROBID_REST_HOST = "http://localhost:8080";
+
+ private static final String GROBID_ISALIVE_PATH = "/grobid"; // isalive
+ // doesn't work
+ // nfc why
+
+ private static final String GROBID_PROCESSHEADER_PATH = "/processHeaderDocument";
+
+ private String restHostUrlStr;
+
+ public GrobidRESTParser() {
+ String restHostUrlStr = null;
+ try {
+ restHostUrlStr = readRestUrl();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+
+ if (restHostUrlStr == null
+ || (restHostUrlStr != null && restHostUrlStr.equals(""))) {
+ this.restHostUrlStr = GROBID_REST_HOST;
+ } else {
+ this.restHostUrlStr = restHostUrlStr;
+ }
+ }
+
+ public void parse(String filePath, ContentHandler handler, Metadata metadata,
+ ParseContext context) throws FileNotFoundException {
+
+ File pdfFile = new File(filePath);
+ ContentDisposition cd = new ContentDisposition(
+ "form-data; name=\"input\"; filename=\"" + pdfFile.getName() + "\"");
+ Attachment att = new Attachment("input", new FileInputStream(pdfFile), cd);
+ MultipartBody body = new MultipartBody(att);
+
+ Response response = WebClient
+ .create(restHostUrlStr + GROBID_PROCESSHEADER_PATH)
+ .accept(MediaType.APPLICATION_XML).type(MediaType.MULTIPART_FORM_DATA)
+ .post(body);
+
+ try {
+ String resp = response.readEntity(String.class);
+ Metadata teiMet = new TEIParser().parse(resp);
+ for (String key : teiMet.names()) {
+ metadata.add("grobid:header_" + key, teiMet.get(key));
+ }
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+ }
+
+ private static String readRestUrl() throws IOException {
+ Properties grobidProperties = new Properties();
+ grobidProperties.load(GrobidRESTParser.class
+ .getResourceAsStream("GrobidExtractor.properties"));
+
+ return grobidProperties.getProperty("grobid.server.url");
+ }
+
+ protected static boolean canRun() {
+ Response response = null;
+
+ try {
+ response = WebClient.create(readRestUrl() + GROBID_ISALIVE_PATH)
+ .accept(MediaType.TEXT_HTML).get();
+ String resp = response.readEntity(String.class);
+ return resp != null && !resp.equals("") && resp.startsWith("<h4>");
+ } catch (Exception e) {
+ e.printStackTrace();
+ return false;
+ }
+ }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-journal-module/src/main/java/org/apache/tika/parser/journal/JournalParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-journal-module/src/main/java/org/apache/tika/parser/journal/JournalParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-journal-module/src/main/java/org/apache/tika/parser/journal/JournalParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-journal-module/src/main/java/org/apache/tika/parser/journal/JournalParser.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,65 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.journal;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.pdf.PDFParser;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for journal articles in PDF form: first asks a GROBID service for
+ * header metadata (via {@link GrobidRESTParser}) and then runs the regular
+ * {@link PDFParser} over the same document.
+ */
+public class JournalParser extends AbstractParser {
+
+  /**
+   * Generated serial ID
+   */
+  private static final long serialVersionUID = 4664255544154296438L;
+
+  private static final MediaType TYPE = MediaType.application("pdf");
+
+  private static final Set<MediaType> SUPPORTED_TYPES = Collections
+      .singleton(TYPE);
+
+  public Set<MediaType> getSupportedTypes(ParseContext context) {
+    return SUPPORTED_TYPES;
+  }
+
+  /**
+   * Obtains a file for the input (spooling the stream to a temporary file
+   * when needed), sends it to GROBID, then parses it as a PDF. Temporary
+   * resources created here are disposed of before returning, and the file
+   * stream given to the PDF parser is closed.
+   */
+  public void parse(InputStream stream, ContentHandler handler,
+      Metadata metadata, ParseContext context) throws IOException,
+      SAXException, TikaException {
+    TemporaryResources tmp = new TemporaryResources();
+    try {
+      TikaInputStream tis = TikaInputStream.get(stream, tmp);
+      File tmpFile = tis.getFile();
+
+      GrobidRESTParser grobidParser = new GrobidRESTParser();
+      grobidParser.parse(tmpFile.getAbsolutePath(), handler, metadata, context);
+
+      PDFParser parser = new PDFParser();
+      try (InputStream pdfStream = new FileInputStream(tmpFile)) {
+        parser.parse(pdfStream, handler, metadata, context);
+      }
+    } finally {
+      // Removes any temporary file registered with tmp above.
+      tmp.dispose();
+    }
+  }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-journal-module/src/main/java/org/apache/tika/parser/journal/TEIParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-journal-module/src/main/java/org/apache/tika/parser/journal/TEIParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-journal-module/src/main/java/org/apache/tika/parser/journal/TEIParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-journal-module/src/main/java/org/apache/tika/parser/journal/TEIParser.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,893 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.journal;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.tika.metadata.Metadata;
+import org.json.JSONArray;
+import org.json.JSONObject;
+import org.json.XML;
+
+public class TEIParser {
+
+ /** Creates a parser; no state is set up by this constructor. */
+ public TEIParser() {
+ }
+
+ /**
+  * Converts a TEI XML string to JSON and collects the recognised header
+  * fields into a fresh {@link Metadata} object.
+  *
+  * @param source the TEI XML document as text
+  * @return the metadata extracted from the document
+  */
+ public Metadata parse(String source) {
+   Metadata result = new Metadata();
+   JSONObject teiJson = XML.toJSONObject(source);
+   createGrobidMetadata(source, teiJson, result);
+   return result;
+ }
+
+ /**
+  * Fills <code>metadata</code> from the <code>TEI/teiHeader</code> section
+  * of the JSON form of the document, then adds the static entries (see
+  * <code>addStaticMet</code>).
+  *
+  * Sections that are missing, or that are not JSON objects, are skipped.
+  * When <code>obj</code> is <code>null</code> nothing is added at all,
+  * because the static entries need the JSON object.
+  */
+ private void createGrobidMetadata(String source, JSONObject obj,
+     Metadata metadata) {
+   if (obj == null) {
+     return;
+   }
+
+   JSONObject tei = obj.optJSONObject("TEI");
+   JSONObject teiHeader = (tei == null) ? null : tei.optJSONObject("teiHeader");
+   if (teiHeader != null) {
+     JSONObject text = teiHeader.optJSONObject("text");
+     if (text != null) {
+       parseText(text, metadata);
+     }
+
+     JSONObject fileDesc = teiHeader.optJSONObject("fileDesc");
+     if (fileDesc != null) {
+       parseFileDesc(fileDesc, metadata);
+     }
+
+     JSONObject profileDesc = teiHeader.optJSONObject("profileDesc");
+     if (profileDesc != null) {
+       parseProfileDesc(profileDesc, metadata);
+     }
+   }
+
+   addStaticMet(source, obj, metadata);
+ }
+
+ /**
+  * Adds the entries that do not depend on the document structure: the
+  * {@link Metadata} class name plus the JSON and XML forms of the source.
+  */
+ private void addStaticMet(String source, JSONObject obj, Metadata metadata) {
+   String metadataClassName = Metadata.class.getName();
+   String jsonText = obj.toString();
+   metadata.add("Class", metadataClassName);
+   metadata.add("TEIJSONSource", jsonText);
+   metadata.add("TEIXMLSource", source);
+ }
+
+ /** Records the <code>xml:lang</code> attribute, when present, as "Language". */
+ private void parseText(JSONObject text, Metadata metadata) {
+   if (!text.has("xml:lang")) {
+     return;
+   }
+   String language = text.getString("xml:lang");
+   metadata.add("Language", language);
+ }
+
+ /**
+  * Dispatches the <code>titleStmt</code> and <code>sourceDesc</code>
+  * children of the file description, in that order, when they exist.
+  */
+ private void parseFileDesc(JSONObject fileDesc, Metadata metadata) {
+   boolean hasTitleStmt = fileDesc.has("titleStmt");
+   if (hasTitleStmt) {
+     JSONObject titleStmt = fileDesc.getJSONObject("titleStmt");
+     parseTitleStmt(titleStmt, metadata);
+   }
+
+   boolean hasSourceDesc = fileDesc.has("sourceDesc");
+   if (hasSourceDesc) {
+     JSONObject sourceDesc = fileDesc.getJSONObject("sourceDesc");
+     parseSourceDesc(sourceDesc, metadata);
+   }
+ }
+
+ /** Records the text content of the <code>title</code> element as "Title". */
+ private void parseTitleStmt(JSONObject titleStmt, Metadata metadata) {
+   if (!titleStmt.has("title")) {
+     return;
+   }
+   JSONObject titleJson = titleStmt.getJSONObject("title");
+   if (!titleJson.has("content")) {
+     return;
+   }
+   metadata.add("Title", titleJson.getString("content"));
+ }
+
+ /** Hands the <code>biblStruct</code> child, when present, to its parser. */
+ private void parseSourceDesc(JSONObject sourceDesc, Metadata metadata) {
+   if (!sourceDesc.has("biblStruct")) {
+     return;
+   }
+   JSONObject biblStruct = sourceDesc.getJSONObject("biblStruct");
+   parseBiblStruct(biblStruct, metadata);
+ }
+
+ /**
+  * Extracts the authors from the <code>analytic</code> section and records
+  * the derived "Address", "Affiliation", "Authors" and "FullAffiliations"
+  * entries.
+  *
+  * The <code>author</code> value is a single JSON object when the document
+  * has one author and a JSON array when it has several; both forms
+  * produce the four entries. When there is no <code>analytic</code> object
+  * an "Error" entry is recorded instead.
+  */
+ private void parseBiblStruct(JSONObject biblStruct, Metadata metadata) {
+   if (!biblStruct.has("analytic")
+       || !(biblStruct.get("analytic") instanceof JSONObject)) {
+     metadata.add("Error", "Unable to parse: no analytic section in JSON");
+     return;
+   }
+
+   JSONObject analytic = biblStruct.getJSONObject("analytic");
+   if (!analytic.has("author")) {
+     return;
+   }
+
+   Object authorObj = analytic.get("author");
+   List<Author> authorList = new ArrayList<Author>();
+   if (authorObj instanceof JSONObject) {
+     parseAuthor((JSONObject) authorObj, authorList);
+   } else if (authorObj instanceof JSONArray) {
+     JSONArray authors = (JSONArray) authorObj;
+     for (int i = 0; i < authors.length(); i++) {
+       parseAuthor(authors.getJSONObject(i), authorList);
+     }
+   } else {
+     // Neither an object nor an array: nothing that can be interpreted.
+     return;
+   }
+
+   metadata.add("Address", getMetadataAddresses(authorList));
+   metadata.add("Affiliation", getMetadataAffiliations(authorList));
+   metadata.add("Authors", getMetadataAuthors(authorList));
+   metadata.add("FullAffiliations",
+       getMetadataFullAffiliations(authorList));
+ }
+
+ /**
+  * Renders the distinct affiliations of all authors, in order of first
+  * appearance, as <code>[a,b,c]</code> using each affiliation's
+  * <code>toString()</code>. An empty author list yields <code>[]</code>.
+  */
+ private String getMetadataFullAffiliations(List<Author> authorList) {
+   List<Affiliation> unique = new ArrayList<Affiliation>();
+   for (Author a : authorList) {
+     for (Affiliation af : a.getAffiliations()) {
+       if (!unique.contains(af)) {
+         unique.add(af);
+       }
+     }
+   }
+
+   StringBuilder metAffils = new StringBuilder("[");
+   for (int i = 0; i < unique.size(); i++) {
+     // Separator goes before every element but the first, so there is no
+     // trailing comma to remove afterwards.
+     if (i > 0) {
+       metAffils.append(",");
+     }
+     metAffils.append(unique.get(i).toString());
+   }
+   metAffils.append("]");
+   return metAffils.toString();
+ }
+
+ /**
+  * Renders every author as first, middle and surname (each passed through
+  * <code>printOrBlank</code>) followed by the comma-separated 1-based
+  * indexes of that author's affiliations and a single space.
+  *
+  * Example: Chris A. Mattmann 1, 2 Daniel J. Crichton 1 Nenad Medvidovic 2
+  * Steve Hughes 1
+  */
+ private String getMetadataAuthors(List<Author> authorList) {
+   // Distinct affiliations in order of first appearance; an affiliation's
+   // position in this list (plus one) is its printed index.
+   List<Affiliation> distinct = new ArrayList<Affiliation>();
+   for (Author author : authorList) {
+     for (Affiliation candidate : author.getAffiliations()) {
+       if (!distinct.contains(candidate)) {
+         distinct.add(candidate);
+       }
+     }
+   }
+
+   StringBuilder out = new StringBuilder();
+   for (Author author : authorList) {
+     out.append(printOrBlank(author.getFirstName()));
+     out.append(printOrBlank(author.getMiddleName()));
+     out.append(printOrBlank(author.getSurName()));
+
+     StringBuilder indexes = new StringBuilder();
+     int position = 0;
+     for (Affiliation known : distinct) {
+       position++;
+       if (!author.getAffiliations().contains(known)) {
+         continue;
+       }
+       if (indexes.length() > 0) {
+         indexes.append(",");
+       }
+       indexes.append(position);
+     }
+
+     out.append(indexes.toString());
+     out.append(" ");
+   }
+
+   return out.toString();
+ }
+
+ /**
+  * Renders the distinct affiliations as a numbered, semicolon-separated
+  * list of organisation names.
+  *
+  * Example: 1 Jet Propulsion Laboratory California Institute of Technology
+  * ; 2 Computer Science Department University of Southern California
+  */
+ private String getMetadataAffiliations(List<Author> authorList) {
+   List<Affiliation> distinct = new ArrayList<Affiliation>();
+   for (Author author : authorList) {
+     for (Affiliation candidate : author.getAffiliations()) {
+       if (!distinct.contains(candidate)) {
+         distinct.add(candidate);
+       }
+     }
+   }
+
+   StringBuilder out = new StringBuilder();
+   for (int i = 0; i < distinct.size(); i++) {
+     out.append(i + 1);
+     out.append(" ");
+     out.append(distinct.get(i).getOrgName().toString());
+     // Drop the last character written so far before adding the separator.
+     out.setLength(out.length() - 1);
+     out.append("; ");
+   }
+
+   if (!distinct.isEmpty()) {
+     // Remove the separator that follows the final entry.
+     out.setLength(out.length() - 2);
+   }
+
+   return out.toString();
+ }
+
+ /**
+  * Renders the distinct affiliation addresses, in order of first
+  * appearance, each followed by a single space.
+  *
+  * Example: "Pasadena, CA 91109, USA Los Angeles, CA 90089, USA",
+  */
+ private String getMetadataAddresses(List<Author> authorList) {
+   List<Address> distinct = new ArrayList<Address>();
+   for (Author author : authorList) {
+     for (Affiliation affiliation : author.getAffiliations()) {
+       Address candidate = affiliation.getAddress();
+       if (distinct.contains(candidate)) {
+         continue;
+       }
+       distinct.add(affiliation.getAddress());
+     }
+   }
+
+   StringBuilder out = new StringBuilder();
+   for (Address address : distinct) {
+     out.append(address.toString()).append(" ");
+   }
+   return out.toString();
+ }
+
+ /**
+  * Builds an {@link Author} from one <code>author</code> JSON object and
+  * appends it to <code>authorList</code>. Name parts, surname and
+  * affiliations are only read when a <code>persName</code> child exists;
+  * otherwise an empty author is appended.
+  */
+ private void parseAuthor(JSONObject authorObj, List<Author> authorList) {
+   Author parsed = new Author();
+
+   if (authorObj.has("persName")) {
+     JSONObject persName = authorObj.getJSONObject("persName");
+
+     if (persName.has("forename")) {
+       // A single forename is an object, several forenames are an array.
+       Object forename = persName.get("forename");
+       if (forename instanceof JSONObject) {
+         parseNamePart((JSONObject) forename, parsed);
+       } else if (forename instanceof JSONArray) {
+         JSONArray parts = persName.getJSONArray("forename");
+         int partCount = parts.length();
+         for (int idx = 0; idx < partCount; idx++) {
+           parseNamePart(parts.getJSONObject(idx), parsed);
+         }
+       }
+     }
+
+     if (persName.has("surname")) {
+       String surname = persName.getString("surname");
+       parsed.setSurName(surname);
+     }
+
+     if (authorObj.has("affiliation")) {
+       Object affiliationJson = authorObj.get("affiliation");
+       parseAffiliation(affiliationJson, parsed);
+     }
+   }
+
+   authorList.add(parsed);
+ }
+
+ /**
+  * Applies one forename part to the author: type "first" sets the first
+  * name and type "middle" sets the middle name. Parts lacking either a
+  * type or content, and parts of any other type, are ignored.
+  */
+ private void parseNamePart(JSONObject namePart, Author author) {
+   if (!namePart.has("type") || !namePart.has("content")) {
+     return;
+   }
+
+   String kind = namePart.getString("type");
+   String value = namePart.getString("content");
+
+   if ("first".equals(kind)) {
+     author.setFirstName(value);
+   } else if ("middle".equals(kind)) {
+     author.setMiddleName(value);
+   }
+ }
+
+ /**
+  * Parses the affiliation value of an author, which is either one JSON
+  * object or a JSON array of objects; any other value is ignored.
+  */
+ private void parseAffiliation(Object affiliationJSON, Author author) {
+   if (affiliationJSON instanceof JSONObject) {
+     parseOneAffiliation((JSONObject) affiliationJSON, author);
+     return;
+   }
+
+   if (affiliationJSON instanceof JSONArray) {
+     JSONArray entries = (JSONArray) affiliationJSON;
+     int total = entries.length();
+     for (int idx = 0; idx < total; idx++) {
+       parseOneAffiliation(entries.getJSONObject(idx), author);
+     }
+   }
+ }
+
+ /**
+  * Builds one {@link Affiliation} (address and organisation name) from
+  * its JSON object and appends it to the author's affiliations.
+  *
+  * The <code>orgName</code> value is a single JSON object when there is
+  * one organisation name and a JSON array when there are several; in both
+  * cases the collected name is stored on the affiliation.
+  */
+ private void parseOneAffiliation(JSONObject affiliationObj, Author author) {
+
+   Affiliation affiliation = new Affiliation();
+   if (affiliationObj.has("address")) {
+     parseAddress(affiliationObj.getJSONObject("address"), affiliation);
+   }
+
+   if (affiliationObj.has("orgName")) {
+     OrgName orgName = new OrgName();
+     Object orgObject = affiliationObj.get("orgName");
+     if (orgObject instanceof JSONObject) {
+       parseOrgName((JSONObject) orgObject, orgName);
+     } else if (orgObject instanceof JSONArray) {
+       JSONArray orgNames = (JSONArray) orgObject;
+       for (int i = 0; i < orgNames.length(); i++) {
+         parseOrgName(orgNames.getJSONObject(i), orgName);
+       }
+     }
+
+     // Store the name for the single-object form as well as the array form.
+     affiliation.setOrgName(orgName);
+   }
+
+   author.getAffiliations().add(affiliation);
+ }
+
+ /**
+  * Reads the optional "region", "postCode", "settlement" and "country"
+  * entries of an address object into a new {@code Address} and stores it on
+  * the affiliation.
+  * <p>
+  * The post code is stored in its JSON text form. The country may be either
+  * an object with optional "content" and "key" entries or a plain string,
+  * which is used as the content.
+  *
+  * @param addressObj  JSON object describing the address
+  * @param affiliation affiliation that receives the parsed address
+  */
+ private void parseAddress(JSONObject addressObj, Affiliation affiliation) {
+     Address parsed = new Address();
+
+     if (addressObj.has("region")) {
+         String region = addressObj.getString("region");
+         parsed.setRegion(region);
+     }
+
+     if (addressObj.has("postCode")) {
+         Object rawPostCode = addressObj.get("postCode");
+         parsed.setPostCode(JSONObject.valueToString(rawPostCode));
+     }
+
+     if (addressObj.has("settlement")) {
+         String settlement = addressObj.getString("settlement");
+         parsed.setSettlment(settlement);
+     }
+
+     if (addressObj.has("country")) {
+         Object rawCountry = addressObj.get("country");
+         Country parsedCountry = new Country();
+
+         if (rawCountry instanceof String) {
+             parsedCountry.setContent((String) rawCountry);
+         } else if (rawCountry instanceof JSONObject) {
+             JSONObject countryFields = (JSONObject) rawCountry;
+             if (countryFields.has("content")) {
+                 parsedCountry.setContent(countryFields.getString("content"));
+             }
+             if (countryFields.has("key")) {
+                 parsedCountry.setKey(countryFields.getString("key"));
+             }
+         }
+
+         parsed.setCountry(parsedCountry);
+     }
+
+     affiliation.setAddress(parsed);
+ }
+
+ /**
+  * Converts one organisation-name object into an {@code OrgTypeName} and
+  * appends it to the given {@code OrgName}. The "content" entry becomes the
+  * name and the "type" entry becomes the type; either may be absent.
+  *
+  * @param orgObj  JSON object describing one organisation name
+  * @param orgName accumulator that receives the new entry
+  */
+ private void parseOrgName(JSONObject orgObj, OrgName orgName) {
+     OrgTypeName entry = new OrgTypeName();
+
+     if (orgObj.has("content")) {
+         String label = orgObj.getString("content");
+         entry.setName(label);
+     }
+     if (orgObj.has("type")) {
+         String kind = orgObj.getString("type");
+         entry.setType(kind);
+     }
+
+     List<OrgTypeName> parts = orgName.getTypeNames();
+     parts.add(entry);
+ }
+
+ /**
+  * Copies the abstract and the keywords of a profile description into the
+  * metadata, under the keys "Abstract" and "Keyword".
+  * <p>
+  * The abstract paragraph is read from a top-level "p" entry when present;
+  * otherwise from the "p" entry nested inside the "abstract" object.
+  * Keywords may be a plain string, or an object whose "term" entry is either
+  * an array of terms or a single term.
+  *
+  * @param profileDesc JSON object describing the profile
+  * @param metadata    metadata that receives the extracted values
+  */
+ private void parseProfileDesc(JSONObject profileDesc, Metadata metadata) {
+     if (profileDesc.has("abstract")) {
+         if (profileDesc.has("p")) {
+             // Original lookup, kept so input that worked before still does.
+             metadata.add("Abstract", profileDesc.getString("p"));
+         } else {
+             // The guard is on "abstract", so the paragraph is expected
+             // inside that object; the original never looked there.
+             Object abstractObj = profileDesc.get("abstract");
+             if (abstractObj instanceof JSONObject) {
+                 Object paragraph = ((JSONObject) abstractObj).opt("p");
+                 if (paragraph instanceof String) {
+                     metadata.add("Abstract", (String) paragraph);
+                 }
+             }
+         }
+     }
+
+     if (profileDesc.has("textClass")) {
+         JSONObject textClass = profileDesc.getJSONObject("textClass");
+
+         if (textClass.has("keywords")) {
+             Object keywordsObj = textClass.get("keywords");
+             // test AJ15.pdf
+             if (keywordsObj instanceof String) {
+                 metadata.add("Keyword", (String) keywordsObj);
+             } else if (keywordsObj instanceof JSONObject) {
+                 JSONObject keywords = (JSONObject) keywordsObj;
+                 if (keywords.has("term")) {
+                     Object termObj = keywords.get("term");
+                     if (termObj instanceof JSONArray) {
+                         JSONArray termArr = (JSONArray) termObj;
+                         for (int i = 0; i < termArr.length(); i++) {
+                             metadata.add("Keyword", JSONObject.valueToString(termArr.get(i)));
+                         }
+                     } else {
+                         // A lone term is not wrapped in an array; emit it in
+                         // the same text form instead of throwing.
+                         metadata.add("Keyword", JSONObject.valueToString(termObj));
+                     }
+                 }
+             }
+
+         }
+     }
+
+ }
+
+ /**
+  * Returns the value followed by one space, or a single space when the
+  * value is {@code null} or empty.
+  *
+  * @param val text to format, may be {@code null}
+  * @return {@code val + " "} for non-empty input, otherwise {@code " "}
+  */
+ private String printOrBlank(String val) {
+     boolean hasText = val != null && !val.equals("");
+     return hasText ? val + " " : " ";
+ }
+
+ /**
+  * Mutable holder for one author: surname, middle name, first name and the
+  * list of affiliations.
+  */
+ class Author {
+
+     private String surName;
+
+     private String middleName;
+
+     private String firstName;
+
+     private List<Affiliation> affiliations;
+
+     /** Creates an author with no name parts and an empty affiliation list. */
+     public Author() {
+         this.surName = null;
+         this.middleName = null;
+         this.firstName = null;
+         this.affiliations = new ArrayList<Affiliation>();
+     }
+
+     /**
+      * @return the surName
+      */
+     public String getSurName() {
+         return surName;
+     }
+
+     /**
+      * @param surName
+      *          the surName to set
+      */
+     public void setSurName(String surName) {
+         this.surName = surName;
+     }
+
+     /**
+      * @return the middleName
+      */
+     public String getMiddleName() {
+         return middleName;
+     }
+
+     /**
+      * @param middleName
+      *          the middleName to set
+      */
+     public void setMiddleName(String middleName) {
+         this.middleName = middleName;
+     }
+
+     /**
+      * @return the firstName
+      */
+     public String getFirstName() {
+         return firstName;
+     }
+
+     /**
+      * @param firstName
+      *          the firstName to set
+      */
+     public void setFirstName(String firstName) {
+         this.firstName = firstName;
+     }
+
+     /**
+      * @return the affiliations
+      */
+     public List<Affiliation> getAffiliations() {
+         return affiliations;
+     }
+
+     /**
+      * @param affiliations
+      *          the affiliations to set
+      */
+     public void setAffiliations(List<Affiliation> affiliations) {
+         this.affiliations = affiliations;
+     }
+
+     /**
+      * Renders all fields; a {@code null} middle name is shown as empty.
+      *
+      * @see java.lang.Object#toString()
+      */
+     @Override
+     public String toString() {
+         // The conditional must be parenthesised: '+' binds tighter than
+         // '!=' and '?:', so without parentheses the whole concatenation
+         // became the (always true) condition and only middleName was
+         // returned.
+         return "Author [surName=" + surName
+                 + ", middleName=" + (middleName != null ? middleName : "")
+                 + ", firstName=" + firstName
+                 + ", affiliations=" + affiliations
+                 + "]";
+     }
+
+ }
+
+ class Affiliation {
+
+ private OrgName orgName;
+
+ private Address address;
+
+ public Affiliation() {
+ this.orgName = new OrgName();
+ this.address = new Address();
+ }
+
+ /**
+ * @return the orgName
+ */
+ public OrgName getOrgName() {
+ return orgName;
+ }
+
+ /**
+ * @param orgName
+ * the orgName to set
+ */
+ public void setOrgName(OrgName orgName) {
+ this.orgName = orgName;
+ }
+
+ /**
+ * @return the address
+ */
+ public Address getAddress() {
+ return address;
+ }
+
+ /**
+ * @param address
+ * the address to set
+ */
+ public void setAddress(Address address) {
+ this.address = address;
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see java.lang.Object#equals(java.lang.Object)
+ */
+ @Override
+ public boolean equals(Object obj) {
+ Affiliation otherA = (Affiliation) obj;
+ return this.getAddress().equals(otherA.getAddress())
+ && this.getOrgName().equals(otherA.getOrgName());
+
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see java.lang.Object#toString()
+ */
+ @Override
+ public String toString() {
+ return "Affiliation {orgName=" + orgName + ", address=" + address + "}";
+ }
+
+ }
+
+ /**
+  * An organisation name made of an ordered list of typed name parts. Two
+  * organisation names are equal when their lists hold the same name/type
+  * pairs in the same order.
+  */
+ class OrgName {
+     private List<OrgTypeName> typeNames;
+
+     /** Creates an organisation name with no parts. */
+     public OrgName() {
+         this.typeNames = new ArrayList<OrgTypeName>();
+     }
+
+     /**
+      * @return the typeNames
+      */
+     public List<OrgTypeName> getTypeNames() {
+         return typeNames;
+     }
+
+     /**
+      * @param typeNames
+      *          the typeNames to set
+      */
+     public void setTypeNames(List<OrgTypeName> typeNames) {
+         this.typeNames = typeNames;
+     }
+
+     /**
+      * Joins the names of all parts, each followed by a space. Returns an
+      * empty string when the list has been set to {@code null}.
+      *
+      * @see java.lang.Object#toString()
+      */
+     @Override
+     public String toString() {
+         StringBuilder builder = new StringBuilder();
+         if (this.typeNames == null) {
+             return builder.toString();
+         }
+         for (OrgTypeName on : this.typeNames) {
+             builder.append(on.getName());
+             builder.append(" ");
+         }
+         return builder.toString();
+     }
+
+     /**
+      * Compares the parts pairwise by name and type. The previous version
+      * only compared list sizes, so unrelated organisations with the same
+      * number of parts were reported as equal. Returns {@code false} for
+      * {@code null} or for an object of another type.
+      *
+      * @see java.lang.Object#equals(java.lang.Object)
+      */
+     @Override
+     public boolean equals(Object obj) {
+         if (this == obj) {
+             return true;
+         }
+         if (!(obj instanceof OrgName)) {
+             return false;
+         }
+         List<OrgTypeName> mine = this.typeNames;
+         List<OrgTypeName> theirs = ((OrgName) obj).getTypeNames();
+
+         if (mine == null || theirs == null) {
+             return mine == theirs;
+         }
+         if (mine.size() != theirs.size()) {
+             return false;
+         }
+         for (int i = 0; i < mine.size(); i++) {
+             OrgTypeName left = mine.get(i);
+             OrgTypeName right = theirs.get(i);
+             if (left == null || right == null) {
+                 if (left != right) {
+                     return false;
+                 }
+                 continue;
+             }
+             // Compare the fields directly so null names/types are safe.
+             if (!java.util.Objects.equals(left.getName(), right.getName())
+                     || !java.util.Objects.equals(left.getType(), right.getType())) {
+                 return false;
+             }
+         }
+         return true;
+     }
+
+     /**
+      * Hash built from the name and type of every part, in order, matching
+      * what {@link #equals(Object)} compares.
+      *
+      * @see java.lang.Object#hashCode()
+      */
+     @Override
+     public int hashCode() {
+         if (this.typeNames == null) {
+             return 0;
+         }
+         int result = 1;
+         for (OrgTypeName part : this.typeNames) {
+             int partHash = (part == null)
+                     ? 0
+                     : java.util.Objects.hash(part.getName(), part.getType());
+             result = 31 * result + partHash;
+         }
+         return result;
+     }
+
+ }
+
+ /**
+  * One part of an organisation name: the name text and its type. Both
+  * fields start out {@code null}.
+  */
+ class OrgTypeName {
+     private String name;
+     private String type;
+
+     /** Creates a part with no name and no type. */
+     public OrgTypeName() {
+         this.name = null;
+         this.type = null;
+     }
+
+     /**
+      * @return the name
+      */
+     public String getName() {
+         return name;
+     }
+
+     /**
+      * @param name
+      *          the name to set
+      */
+     public void setName(String name) {
+         this.name = name;
+     }
+
+     /**
+      * @return the type
+      */
+     public String getType() {
+         return type;
+     }
+
+     /**
+      * @param type
+      *          the type to set
+      */
+     public void setType(String type) {
+         this.type = type;
+     }
+
+     /**
+      * Two parts are equal when both type and name match. {@code null}
+      * fields are allowed (they are the constructor defaults) and only match
+      * {@code null}. Returns {@code false} for {@code null} or for an object
+      * of another type.
+      *
+      * @see java.lang.Object#equals(java.lang.Object)
+      */
+     @Override
+     public boolean equals(Object obj) {
+         if (this == obj) {
+             return true;
+         }
+         if (!(obj instanceof OrgTypeName)) {
+             return false;
+         }
+         OrgTypeName otherOrgName = (OrgTypeName) obj;
+         return java.util.Objects.equals(this.type, otherOrgName.getType())
+                 && java.util.Objects.equals(this.name, otherOrgName.getName());
+     }
+
+     /**
+      * Hash of type and name, consistent with {@link #equals(Object)}.
+      *
+      * @see java.lang.Object#hashCode()
+      */
+     @Override
+     public int hashCode() {
+         return java.util.Objects.hash(type, name);
+     }
+
+ }
+
+ private class Address {
+
+ private String region;
+ private String postCode;
+ private String settlment;
+ private Country country;
+
+ public Address() {
+ this.region = null;
+ this.postCode = null;
+ this.settlment = null;
+ this.country = new Country();
+ }
+
+ /**
+ * @return the region
+ */
+ public String getRegion() {
+ return region;
+ }
+
+ /**
+ * @param region
+ * the region to set
+ */
+ public void setRegion(String region) {
+ this.region = region;
+ }
+
+ /**
+ * @return the postCode
+ */
+ public String getPostCode() {
+ return postCode;
+ }
+
+ /**
+ * @param postCode
+ * the postCode to set
+ */
+ public void setPostCode(String postCode) {
+ this.postCode = postCode;
+ }
+
+ /**
+ * @return the settlment
+ */
+ public String getSettlment() {
+ return settlment;
+ }
+
+ /**
+ * @param settlment
+ * the settlment to set
+ */
+ public void setSettlment(String settlment) {
+ this.settlment = settlment;
+ }
+
+ /**
+ * @return the country
+ */
+ public Country getCountry() {
+ return country;
+ }
+
+ /**
+ * @param country
+ * the country to set
+ */
+ public void setCountry(Country country) {
+ this.country = country;
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see java.lang.Object#equals(java.lang.Object)
+ */
+ @Override
+ public boolean equals(Object obj) {
+ Address otherA = (Address) obj;
+ if (this.settlment == null) {
+ return otherA.getSettlment() == null;
+ } else if (this.country == null) {
+ return otherA.getCountry() == null;
+ } else if (this.postCode == null) {
+ return otherA.getPostCode() == null;
+ } else if (this.region == null) {
+ return otherA.getRegion() == null;
+ }
+
+ return this.settlment.equals(otherA.getSettlment())
+ && this.country.equals(otherA.getCountry())
+ && this.postCode.equals(otherA.getPostCode())
+ && this.region.equals(otherA.getRegion());
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see java.lang.Object#toString()
+ */
+ @Override
+ public String toString() {
+ StringBuilder builder = new StringBuilder();
+ builder.append(settlment);
+ builder.append(", ");
+ builder.append(region);
+ builder.append(" ");
+ builder.append(postCode);
+ builder.append(" ");
+ builder.append(country.getContent());
+ return builder.toString();
+ }
+ }
+
+ /**
+  * Mutable country value with a key and a textual content. Both fields
+  * start out {@code null}.
+  */
+ private class Country {
+     private String key;
+     private String content;
+
+     /** Creates a country with no key and no content. */
+     public Country() {
+         this.key = null;
+         this.content = null;
+     }
+
+     /**
+      * @return the key
+      */
+     public String getKey() {
+         return key;
+     }
+
+     /**
+      * @param key
+      *          the key to set
+      */
+     public void setKey(String key) {
+         this.key = key;
+     }
+
+     /**
+      * @return the content
+      */
+     public String getContent() {
+         return content;
+     }
+
+     /**
+      * @param content
+      *          the content to set
+      */
+     public void setContent(String content) {
+         this.content = content;
+     }
+
+     /**
+      * Two countries are equal when key and content both match; a
+      * {@code null} field only matches {@code null}. Returns {@code false}
+      * for {@code null} or for an object of another type instead of
+      * throwing.
+      *
+      * @see java.lang.Object#equals(java.lang.Object)
+      */
+     @Override
+     public boolean equals(Object obj) {
+         if (this == obj) {
+             return true;
+         }
+         if (!(obj instanceof Country)) {
+             return false;
+         }
+         Country otherC = (Country) obj;
+         return java.util.Objects.equals(this.key, otherC.getKey())
+                 && java.util.Objects.equals(this.content, otherC.getContent());
+     }
+
+     /**
+      * Hash of key and content, consistent with {@link #equals(Object)}.
+      *
+      * @see java.lang.Object#hashCode()
+      */
+     @Override
+     public int hashCode() {
+         return java.util.Objects.hash(key, content);
+     }
+
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-journal-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-journal-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-journal-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (added)
+++ tika/branches/2.x/tika-parser-modules/tika-journal-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Wed Jan 6 03:50:50 2016
@@ -0,0 +1,19 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+#org.apache.tika.parser.journal.GrobidRESTParser
+org.apache.tika.parser.journal.JournalParser
+#org.apache.tika.parser.journal.TEIParser
Added: tika/branches/2.x/tika-parser-modules/tika-journal-module/src/main/resources/org/apache/tika/parser/journal/GrobidExtractor.properties
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-journal-module/src/main/resources/org/apache/tika/parser/journal/GrobidExtractor.properties?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-journal-module/src/main/resources/org/apache/tika/parser/journal/GrobidExtractor.properties (added)
+++ tika/branches/2.x/tika-parser-modules/tika-journal-module/src/main/resources/org/apache/tika/parser/journal/GrobidExtractor.properties Wed Jan 6 03:50:50 2016
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+grobid.server.url=http://localhost:8080
Added: tika/branches/2.x/tika-parser-modules/tika-journal-module/src/test/java/org/apache/tika/parser/journal/JournalParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-journal-module/src/test/java/org/apache/tika/parser/journal/JournalParserTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-journal-module/src/test/java/org/apache/tika/parser/journal/JournalParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-journal-module/src/test/java/org/apache/tika/parser/journal/JournalParserTest.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,52 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.journal;
+
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.fail;
+import static org.junit.Assume.assumeTrue;
+import static org.apache.tika.parser.journal.GrobidRESTParser.canRun;
+import java.io.InputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Runs {@code JournalParser} over a sample PDF and checks that a title was
+ * written to the metadata. The test is skipped when
+ * {@code GrobidRESTParser.canRun()} reports false.
+ */
+public class JournalParserTest {
+
+    @Test
+    public void testJournalParser() throws Exception {
+        // Skip before any resource is opened.
+        assumeTrue(canRun());
+
+        String path = "/test-documents/testJournalParser.pdf";
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+        JournalParser jParser = new JournalParser();
+
+        // try-with-resources closes the stream even when parsing fails; any
+        // exception propagates so the report keeps the full cause.
+        try (InputStream stream = JournalParserTest.class.getResourceAsStream(path)) {
+            assertNotNull(stream);
+            jParser.parse(stream, handler, metadata, new ParseContext());
+        }
+
+        assertNotNull(metadata.get("grobid:header_Title"));
+    }
+}
Modified: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/pom.xml
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/pom.xml?rev=1723223&r1=1723222&r2=1723223&view=diff
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/pom.xml (original)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/pom.xml Wed Jan 6 03:50:50 2016
@@ -94,10 +94,33 @@
<version>${pdfbox.version}</version>
</dependency>
<dependency>
+ <groupId>org.apache.pdfbox</groupId>
+ <artifactId>fontbox</artifactId>
+ <version>${pdfbox.version}</version>
+ </dependency>
+ <dependency>
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
<version>${commons.logging.version}</version>
</dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-web-module</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-pdf-module</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-office-module</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
</dependencies>
<build>