You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/05/30 17:43:06 UTC

[tika] 03/05: TIKA-1804 -- convert json parsing to SAX in TEIParser, step 1: test current output.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit b290cd79d652741a0c8249f3860583b4169f6455
Author: tballison <ta...@mitre.org>
AuthorDate: Thu May 25 09:41:38 2017 -0400

    TIKA-1804 -- convert json parsing to SAX in TEIParser, step 1: test current output.
---
 .../org/apache/tika/parser/journal/TEITest.java    |  67 +++++++++++
 .../src/test/resources/test-documents/testTEI.xml  | 134 +++++++++++++++++++++
 2 files changed, 201 insertions(+)

diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/journal/TEITest.java b/tika-parsers/src/test/java/org/apache/tika/parser/journal/TEITest.java
new file mode 100644
index 0000000..0c456de
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/journal/TEITest.java
@@ -0,0 +1,67 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.journal;
+
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+
+import java.io.ByteArrayOutputStream;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.metadata.Metadata;
+import org.junit.Test;
+
+public class TEITest extends TikaTest { // baseline test for the metadata produced by TEIParser
+    /** Pins the metadata that TEIParser.parse currently returns for the testTEI.xml fixture. */
+    @Test
+    public void testCurrent() throws Exception {
+        TEIParser teiParser = new TEIParser();
+        ByteArrayOutputStream bos = new ByteArrayOutputStream();
+        try (InputStream is = getResourceAsStream("/test-documents/testTEI.xml")) {
+            IOUtils.copy(is, bos); // buffer the whole fixture in memory
+        }
+        String xml = new String (bos.toByteArray(), StandardCharsets.UTF_8);
+        Metadata metadata = teiParser.parse(xml); // parse is handed the document text as a String
+        assertEquals("Montbonnot Saint-Martin, Montbonnot Saint-Martin, Montbonnot Saint-Martin, " +
+                "Montbonnot Saint-Martin, null \"38330, 38330, 38330, 38330\" " +
+                "France, France, France, France ", metadata.get("Address"));
+        String[] keywords = new String[]{ // expected values keep their surrounding double quotes and a literal backslash-u sequence
+                "\"F22 [Analysis of Algorithms and Problem Complexity]: Nonnumerical Algorithms and Problems\\u2014Sequencing\"",
+                "\"and scheduling; D41 [Operating Systems]: Process management\\u2014Scheduling, Concurrency\"",
+                "\"Keywords\"",
+                "\"Parallel Computing, Algorithms, Scheduling, Parallel Tasks,\"",
+                "\"Moldable Tasks, Bi-criteria\""
+        };
+        assertArrayEquals(keywords, metadata.getValues("Keyword")); // order-sensitive comparison of all Keyword values
+        assertEquals("Pierre-François  Dutot 1 Lionel  Eyraud 1 Grégory  Gr´ 1 Grégory  Mouní 1 Denis  Trystram 1 ",
+                metadata.get("Authors"));
+        assertEquals("Bi-criteria Algorithm for Scheduling Jobs on Cluster Platforms *",
+                metadata.get("Title"));
+        assertEquals("1 ID-IMAG ID-IMAG ID-IMAG ID-IMAG", metadata.get("Affiliation"));
+        assertEquals("[Affiliation {orgName=ID-IMAG ID-IMAG ID-IMAG ID-IMAG , " +
+                        "address=Montbonnot Saint-Martin, Montbonnot Saint-Martin, Montbonnot Saint-Martin, Montbonnot Saint-Martin, " +
+                        "null \"38330, 38330, 38330, 38330\" France, France, France, France}" +
+                        "[Affiliation {orgName=ID-IMAG ID-IMAG ID-IMAG ID-IMAG , " +
+                        "address=Montbonnot Saint-Martin, Montbonnot Saint-Martin, Montbonnot Saint-Martin, Montbonnot Saint-Martin, " +
+                        "null \"38330, 38330, 38330, 38330\" France, France, France, France}]",
+                metadata.get("FullAffiliations")); // NOTE(review): first '[' above is never closed - presumably the parser's output verbatim; confirm before "fixing"
+    }
+}
diff --git a/tika-parsers/src/test/resources/test-documents/testTEI.xml b/tika-parsers/src/test/resources/test-documents/testTEI.xml
new file mode 100644
index 0000000..dcba596
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testTEI.xml
@@ -0,0 +1,134 @@
+<TEI xmlns="http://www.tei-c.org/ns/1.0"
+     xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+     xsi:schemaLocation="http://www.tei-c.org/ns/1.0 C:\grobid\grobid-home/schemas/xsd/Grobid.xsd"
+     xmlns:xlink="http://www.w3.org/1999/xlink">
+    <teiHeader xml:lang="en">
+        <encodingDesc>
+            <appInfo>
+                <application version="0.4.2-SNAPSHOT" ident="GROBID" when="2017-05-25T13:02+0000">
+                    <ref target="https://github.com/kermitt2/grobid">GROBID - A machine learning software for extracting information from scholarly documents</ref>
+                </application>
+            </appInfo>
+        </encodingDesc>
+        <fileDesc>
+            <titleStmt>
+                <title level="a" type="main">Bi-criteria Algorithm for Scheduling Jobs on Cluster Platforms *</title>
+            </titleStmt>
+            <publicationStmt>
+                <publisher/>
+                <availability status="unknown"><licence/></availability>
+                <date>June 27–30, 2004</date>
+            </publicationStmt>
+            <sourceDesc>
+                <biblStruct>
+                    <analytic>
+                        <author>
+                            <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">Pierre-François</forename><surname>Dutot</surname></persName>
+                            <email>pfdutot@imag.fr</email>
+                            <affiliation key="aff0">
+                                <orgName type="department" key="dep1">ID-IMAG</orgName>
+                                <orgName type="department" key="dep2">ID-IMAG</orgName>
+                                <orgName type="department" key="dep3">ID-IMAG</orgName>
+                                <orgName type="institution">ID-IMAG</orgName>
+                                <address>
+                                    <addrLine>51 avenue Jean Kuntzmann, 51 avenue Jean Kuntzmann, 51 avenue Jean Kuntzmann, 51 avenue Jean Kuntzmann</addrLine>
+                                    <postCode>38330, 38330, 38330, 38330</postCode>
+                                    <settlement>Montbonnot Saint-Martin, Montbonnot Saint-Martin, Montbonnot Saint-Martin, Montbonnot Saint-Martin</settlement>
+                                    <country>France, France, France, France</country>
+                                </address>
+                            </affiliation>
+                        </author>
+                        <author>
+                            <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">Lionel</forename><surname>Eyraud</surname></persName>
+                            <email>Lionel.Eyraud@imag.fr</email>
+                            <affiliation key="aff0">
+                                <orgName type="department" key="dep1">ID-IMAG</orgName>
+                                <orgName type="department" key="dep2">ID-IMAG</orgName>
+                                <orgName type="department" key="dep3">ID-IMAG</orgName>
+                                <orgName type="institution">ID-IMAG</orgName>
+                                <address>
+                                    <addrLine>51 avenue Jean Kuntzmann, 51 avenue Jean Kuntzmann, 51 avenue Jean Kuntzmann, 51 avenue Jean Kuntzmann</addrLine>
+                                    <postCode>38330, 38330, 38330, 38330</postCode>
+                                    <settlement>Montbonnot Saint-Martin, Montbonnot Saint-Martin, Montbonnot Saint-Martin, Montbonnot Saint-Martin</settlement>
+                                    <country>France, France, France, France</country>
+                                </address>
+                            </affiliation>
+                        </author>
+                        <author>
+                            <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">Grégory</forename><surname>Gr´</surname></persName>
+                            <affiliation key="aff0">
+                                <orgName type="department" key="dep1">ID-IMAG</orgName>
+                                <orgName type="department" key="dep2">ID-IMAG</orgName>
+                                <orgName type="department" key="dep3">ID-IMAG</orgName>
+                                <orgName type="institution">ID-IMAG</orgName>
+                                <address>
+                                    <addrLine>51 avenue Jean Kuntzmann, 51 avenue Jean Kuntzmann, 51 avenue Jean Kuntzmann, 51 avenue Jean Kuntzmann</addrLine>
+                                    <postCode>38330, 38330, 38330, 38330</postCode>
+                                    <settlement>Montbonnot Saint-Martin, Montbonnot Saint-Martin, Montbonnot Saint-Martin, Montbonnot Saint-Martin</settlement>
+                                    <country>France, France, France, France</country>
+                                </address>
+                            </affiliation>
+                        </author>
+                        <author>
+                            <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">Grégory</forename><surname>Mouní</surname></persName>
+                            <email>Gregory.Mounie@imag.fr</email>
+                            <affiliation key="aff0">
+                                <orgName type="department" key="dep1">ID-IMAG</orgName>
+                                <orgName type="department" key="dep2">ID-IMAG</orgName>
+                                <orgName type="department" key="dep3">ID-IMAG</orgName>
+                                <orgName type="institution">ID-IMAG</orgName>
+                                <address>
+                                    <addrLine>51 avenue Jean Kuntzmann, 51 avenue Jean Kuntzmann, 51 avenue Jean Kuntzmann, 51 avenue Jean Kuntzmann</addrLine>
+                                    <postCode>38330, 38330, 38330, 38330</postCode>
+                                    <settlement>Montbonnot Saint-Martin, Montbonnot Saint-Martin, Montbonnot Saint-Martin, Montbonnot Saint-Martin</settlement>
+                                    <country>France, France, France, France</country>
+                                </address>
+                            </affiliation>
+                        </author>
+                        <author>
+                            <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">Denis</forename><surname>Trystram</surname></persName>
+                            <email>Denis.Trystram@imag.fr</email>
+                            <affiliation key="aff0">
+                                <orgName type="department" key="dep1">ID-IMAG</orgName>
+                                <orgName type="department" key="dep2">ID-IMAG</orgName>
+                                <orgName type="department" key="dep3">ID-IMAG</orgName>
+                                <orgName type="institution">ID-IMAG</orgName>
+                                <address>
+                                    <addrLine>51 avenue Jean Kuntzmann, 51 avenue Jean Kuntzmann, 51 avenue Jean Kuntzmann, 51 avenue Jean Kuntzmann</addrLine>
+                                    <postCode>38330, 38330, 38330, 38330</postCode>
+                                    <settlement>Montbonnot Saint-Martin, Montbonnot Saint-Martin, Montbonnot Saint-Martin, Montbonnot Saint-Martin</settlement>
+                                    <country>France, France, France, France</country>
+                                </address>
+                            </affiliation>
+                        </author>
+                        <title level="a" type="main">Bi-criteria Algorithm for Scheduling Jobs on Cluster Platforms *</title>
+                    </analytic>
+                    <monogr>
+                        <meeting> <address><addrLine>Barcelona, Spain</addrLine></address>
+                        </meeting>
+                        <imprint>
+                            <date type="published">June 27–30, 2004</date>
+                        </imprint>
+                    </monogr>
+                    <note>Permission to make digital or hard copies of all or part of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. To copy otherwise, to republish, to post on servers or to redistribute to lists, requires prior specific permission and/or a fee. ACM 1-58113-840-7/04/0006 ...$5.00.</note>
+                </biblStruct>
+            </sourceDesc>
+        </fileDesc>
+        <profileDesc>
+            <textClass>
+                <keywords>
+                    <term>F22 [Analysis of Algorithms and Problem Complexity]: Nonnumerical Algorithms and Problems—Sequencing</term>
+                    <term>and scheduling; D41 [Operating Systems]: Process management—Scheduling, Concurrency</term>
+                    <term>Keywords</term>
+                    <term>Parallel Computing, Algorithms, Scheduling, Parallel Tasks,</term>
+                    <term>Moldable Tasks, Bi-criteria</term>
+                </keywords>
+            </textClass>
+            <abstract>
+                <p>We describe in this paper a new method for building an efficient algorithm for scheduling jobs in a cluster. Jobs are considered as parallel tasks (PT) which can be scheduled on any number of processors. The main feature is to consider two criteria that are optimized together. These criteria are the makespan and the weighted minimal average completion time (minsum). They are chosen for their complementarity, to be able to represent both user-oriented objectives and sys [...]</p>
+            </abstract>
+        </profileDesc>
+    </teiHeader>
+    <text xml:lang="en">
+    </text>
+</TEI>
\ No newline at end of file

-- 
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.