You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by dm...@apache.org on 2019/12/19 00:14:47 UTC

[tika] branch branch_1x updated (f8b9ca2 -> 40560e9)

This is an automated email from the ASF dual-hosted git repository.

dmeikle pushed a change to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git.


    from f8b9ca2  TIKA-3012 -- fix unit test for 1.x branch
     new 833d0c7  TIKA-3014: Update to fix XLIFF12Parser failures with ToXMLHandler
     new 40560e9  TIKA-3014: Updated CHANGES.txt

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 CHANGES.txt                                        |  2 ++
 .../tika/parser/xliff/XLIFF12ContentHandler.java   | 25 ++++++++++++++++++----
 .../apache/tika/parser/xliff/XLIFF12Parser.java    |  2 ++
 .../tika/parser/xliff/XLIFF12ParserTest.java       | 10 +++++++--
 4 files changed, 33 insertions(+), 6 deletions(-)


[tika] 02/02: TIKA-3014: Updated CHANGES.txt

Posted by dm...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

dmeikle pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 40560e9c6e43edf751e03ac0289c3adbb17397da
Author: David Meikle <dm...@apache.org>
AuthorDate: Wed Dec 18 23:52:19 2019 +0000

    TIKA-3014: Updated CHANGES.txt
    
    (cherry picked from commit 3bdd64b126d867b414b00a62744a69f2e57ad89d)
---
 CHANGES.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGES.txt b/CHANGES.txt
index 8fb14c8..e62bd64 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -4,6 +4,8 @@ Release 1.24 - ???
    
    * Upgrade to java-libpst 0.9.3 (TIKA-2546).
 
+   * Fixed XLIFF12Parser failures with ToXMLHandler (TIKA-3014).
+
 Release 1.23 - 12/02/2019
 
    * NOTE: The PDFParser now relies on OCRDPI to render page images when


[tika] 01/02: TIKA-3014: Update to fix XLIFF12Parser failures with ToXMLHandler

Posted by dm...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

dmeikle pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 833d0c738acacfc8ca295bfa7451bf427ee5c2ee
Author: David Meikle <da...@meikle.io>
AuthorDate: Wed Dec 18 23:21:28 2019 +0000

    TIKA-3014: Update to fix XLIFF12Parser failures with ToXMLHandler
    
    (cherry picked from commit e9ab0942ac7196429c4a297df9212792729e33f0)
---
 .../tika/parser/xliff/XLIFF12ContentHandler.java   | 25 ++++++++++++++++++----
 .../apache/tika/parser/xliff/XLIFF12Parser.java    |  2 ++
 .../tika/parser/xliff/XLIFF12ParserTest.java       | 10 +++++++--
 3 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLIFF12ContentHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLIFF12ContentHandler.java
index 954c217..95ea20e 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLIFF12ContentHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLIFF12ContentHandler.java
@@ -40,6 +40,11 @@ public class XLIFF12ContentHandler extends DefaultHandler {
     }
 
     @Override
+    public void startDocument() throws SAXException {
+        xhtml.startDocument();
+    }
+
+    @Override
     public void startElement(
             String uri, String localName, String qName, Attributes attributes)
             throws SAXException {
@@ -58,9 +63,11 @@ public class XLIFF12ContentHandler extends DefaultHandler {
             xhtml.characters(attributes.getValue("original"));
             xhtml.endElement("h1");
 
-            // Add the files source and target languages
+            // Add the files source (mandatory) and target (optional) languages
             metadata.add("source-language", attributes.getValue("source-language"));
-            metadata.add("target-language", attributes.getValue("target-language"));
+            if (null != attributes.getValue("target-language")) {
+                metadata.add("target-language", attributes.getValue("target-language"));
+            }
         }
 
         if ("trans-unit".equals(localName)) {
@@ -70,12 +77,22 @@ public class XLIFF12ContentHandler extends DefaultHandler {
         }
 
         if ("source".equals(localName)) {
-            xhtml.startElement("p", attributeVals);
+            AttributesImpl attrs = extractAttributes(attributes);
+            xhtml.startElement("p", attrs);
         }
 
         if ("target".equals(localName)) {
-            xhtml.startElement("p", attributeVals);
+            AttributesImpl attrs = extractAttributes(attributes);
+            xhtml.startElement("p", attrs);
+        }
+    }
+
+    private AttributesImpl extractAttributes(Attributes attributes) {
+        AttributesImpl attrs = new AttributesImpl();
+        if (null != attributes.getValue("xml:lang")) {
+            attrs.addAttribute("", "lang", "lang", "", attributes.getValue("xml:lang"));
         }
+        return attrs;
     }
 
     @Override
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLIFF12Parser.java b/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLIFF12Parser.java
index 40218b0..d65a09c 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLIFF12Parser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLIFF12Parser.java
@@ -23,6 +23,7 @@ import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.OfflineContentHandler;
+import org.apache.tika.sax.TaggedContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.apache.tika.utils.XMLReaderUtils;
 import org.xml.sax.ContentHandler;
@@ -67,6 +68,7 @@ public class XLIFF12Parser extends AbstractParser {
         metadata.set(Metadata.CONTENT_TYPE, XLF_CONTENT_TYPE.toString());
 
         final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+
         XMLReaderUtils.parseSAX(
                 new CloseShieldInputStream(stream),
                 new OfflineContentHandler(new XLIFF12ContentHandler(xhtml, metadata)),
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/xliff/XLIFF12ParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/xliff/XLIFF12ParserTest.java
index 9f69ea5..d5e231b 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/xliff/XLIFF12ParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/xliff/XLIFF12ParserTest.java
@@ -16,18 +16,18 @@
  */
 package org.apache.tika.parser.xliff;
 
-import static org.apache.tika.TikaTest.assertContains;
 import static org.junit.Assert.assertEquals;
 
 import java.io.InputStream;
 
+import org.apache.tika.TikaTest;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.BodyContentHandler;
 import org.junit.Test;
 import org.xml.sax.ContentHandler;
 
-public class XLIFF12ParserTest {
+public class XLIFF12ParserTest extends TikaTest {
 
     @Test
     public void testXLIFF12() throws Exception {
@@ -44,5 +44,11 @@ public class XLIFF12ParserTest {
         }
     }
 
+    @Test
+    public void testXLIFF12ToXMLHandler() throws Exception {
+        String xml = getXML("testXLIFF12.xlf").xml;
+        assertContains("<p lang=\"en\">Another trans-unit</p>", xml);
+        assertContains("<p lang=\"fr\">Un autre trans-unit</p>", xml);
+    }
 
 }