You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by dm...@apache.org on 2019/12/18 23:23:09 UTC

[tika] branch master updated: TIKA-3014: Update to fix XLIFF12Parser failures with ToXMLHandler

This is an automated email from the ASF dual-hosted git repository.

dmeikle pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new e9ab094  TIKA-3014: Update to fix XLIFF12Parser failures with ToXMLHandler
     new bdc797a  Merge pull request #306 from dameikle/master
e9ab094 is described below

commit e9ab0942ac7196429c4a297df9212792729e33f0
Author: David Meikle <da...@meikle.io>
AuthorDate: Wed Dec 18 23:21:28 2019 +0000

    TIKA-3014: Update to fix XLIFF12Parser failures with ToXMLHandler
---
 .../tika/parser/xliff/XLIFF12ContentHandler.java   | 25 ++++++++++++++++++----
 .../apache/tika/parser/xliff/XLIFF12Parser.java    |  2 ++
 .../tika/parser/xliff/XLIFF12ParserTest.java       | 10 +++++++--
 3 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLIFF12ContentHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLIFF12ContentHandler.java
index 954c217..95ea20e 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLIFF12ContentHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLIFF12ContentHandler.java
@@ -40,6 +40,11 @@ public class XLIFF12ContentHandler extends DefaultHandler {
     }
 
     @Override
+    public void startDocument() throws SAXException {
+        xhtml.startDocument();
+    }
+
+    @Override
     public void startElement(
             String uri, String localName, String qName, Attributes attributes)
             throws SAXException {
@@ -58,9 +63,11 @@ public class XLIFF12ContentHandler extends DefaultHandler {
             xhtml.characters(attributes.getValue("original"));
             xhtml.endElement("h1");
 
-            // Add the files source and target languages
+            // Add the file's source (mandatory) and target (optional) languages
             metadata.add("source-language", attributes.getValue("source-language"));
-            metadata.add("target-language", attributes.getValue("target-language"));
+            if (null != attributes.getValue("target-language")) {
+                metadata.add("target-language", attributes.getValue("target-language"));
+            }
         }
 
         if ("trans-unit".equals(localName)) {
@@ -70,12 +77,22 @@ public class XLIFF12ContentHandler extends DefaultHandler {
         }
 
         if ("source".equals(localName)) {
-            xhtml.startElement("p", attributeVals);
+            AttributesImpl attrs = extractAttributes(attributes);
+            xhtml.startElement("p", attrs);
         }
 
         if ("target".equals(localName)) {
-            xhtml.startElement("p", attributeVals);
+            AttributesImpl attrs = extractAttributes(attributes);
+            xhtml.startElement("p", attrs);
+        }
+    }
+
+    private AttributesImpl extractAttributes(Attributes attributes) {
+        AttributesImpl attrs = new AttributesImpl();
+        if (null != attributes.getValue("xml:lang")) {
+            attrs.addAttribute("", "lang", "lang", "", attributes.getValue("xml:lang"));
         }
+        return attrs;
     }
 
     @Override
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLIFF12Parser.java b/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLIFF12Parser.java
index 40218b0..d65a09c 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLIFF12Parser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLIFF12Parser.java
@@ -23,6 +23,7 @@ import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.OfflineContentHandler;
+import org.apache.tika.sax.TaggedContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.apache.tika.utils.XMLReaderUtils;
 import org.xml.sax.ContentHandler;
@@ -67,6 +68,7 @@ public class XLIFF12Parser extends AbstractParser {
         metadata.set(Metadata.CONTENT_TYPE, XLF_CONTENT_TYPE.toString());
 
         final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+
         XMLReaderUtils.parseSAX(
                 new CloseShieldInputStream(stream),
                 new OfflineContentHandler(new XLIFF12ContentHandler(xhtml, metadata)),
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/xliff/XLIFF12ParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/xliff/XLIFF12ParserTest.java
index 9f69ea5..d5e231b 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/xliff/XLIFF12ParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/xliff/XLIFF12ParserTest.java
@@ -16,18 +16,18 @@
  */
 package org.apache.tika.parser.xliff;
 
-import static org.apache.tika.TikaTest.assertContains;
 import static org.junit.Assert.assertEquals;
 
 import java.io.InputStream;
 
+import org.apache.tika.TikaTest;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.BodyContentHandler;
 import org.junit.Test;
 import org.xml.sax.ContentHandler;
 
-public class XLIFF12ParserTest {
+public class XLIFF12ParserTest extends TikaTest {
 
     @Test
     public void testXLIFF12() throws Exception {
@@ -44,5 +44,11 @@ public class XLIFF12ParserTest {
         }
     }
 
+    @Test
+    public void testXLIFF12ToXMLHandler() throws Exception {
+        String xml = getXML("testXLIFF12.xlf").xml;
+        assertContains("<p lang=\"en\">Another trans-unit</p>", xml);
+        assertContains("<p lang=\"fr\">Un autre trans-unit</p>", xml);
+    }
 
 }