You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by dm...@apache.org on 2019/12/18 23:23:09 UTC
[tika] branch master updated: TIKA-3014: Update to fix XLIFF12Parser failures with ToXMLHandler
This is an automated email from the ASF dual-hosted git repository.
dmeikle pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new e9ab094 TIKA-3014: Update to fix XLIFF12Parser failures with ToXMLHandler
new bdc797a Merge pull request #306 from dameikle/master
e9ab094 is described below
commit e9ab0942ac7196429c4a297df9212792729e33f0
Author: David Meikle <da...@meikle.io>
AuthorDate: Wed Dec 18 23:21:28 2019 +0000
TIKA-3014: Update to fix XLIFF12Parser failures with ToXMLHandler
---
.../tika/parser/xliff/XLIFF12ContentHandler.java | 25 ++++++++++++++++++----
.../apache/tika/parser/xliff/XLIFF12Parser.java | 2 ++
.../tika/parser/xliff/XLIFF12ParserTest.java | 10 +++++++--
3 files changed, 31 insertions(+), 6 deletions(-)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLIFF12ContentHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLIFF12ContentHandler.java
index 954c217..95ea20e 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLIFF12ContentHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLIFF12ContentHandler.java
@@ -40,6 +40,11 @@ public class XLIFF12ContentHandler extends DefaultHandler {
}
@Override
+ public void startDocument() throws SAXException {
+ xhtml.startDocument();
+ }
+
+ @Override
public void startElement(
String uri, String localName, String qName, Attributes attributes)
throws SAXException {
@@ -58,9 +63,11 @@ public class XLIFF12ContentHandler extends DefaultHandler {
xhtml.characters(attributes.getValue("original"));
xhtml.endElement("h1");
- // Add the files source and target languages
+ // Add the files source (mandatory) and target (optional) languages
metadata.add("source-language", attributes.getValue("source-language"));
- metadata.add("target-language", attributes.getValue("target-language"));
+ if (null != attributes.getValue("target-language")) {
+ metadata.add("target-language", attributes.getValue("target-language"));
+ }
}
if ("trans-unit".equals(localName)) {
@@ -70,12 +77,22 @@ public class XLIFF12ContentHandler extends DefaultHandler {
}
if ("source".equals(localName)) {
- xhtml.startElement("p", attributeVals);
+ AttributesImpl attrs = extractAttributes(attributes);
+ xhtml.startElement("p", attrs);
}
if ("target".equals(localName)) {
- xhtml.startElement("p", attributeVals);
+ AttributesImpl attrs = extractAttributes(attributes);
+ xhtml.startElement("p", attrs);
+ }
+ }
+
+ private AttributesImpl extractAttributes(Attributes attributes) {
+ AttributesImpl attrs = new AttributesImpl();
+ if (null != attributes.getValue("xml:lang")) {
+ attrs.addAttribute("", "lang", "lang", "", attributes.getValue("xml:lang"));
}
+ return attrs;
}
@Override
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLIFF12Parser.java b/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLIFF12Parser.java
index 40218b0..d65a09c 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLIFF12Parser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLIFF12Parser.java
@@ -23,6 +23,7 @@ import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.OfflineContentHandler;
+import org.apache.tika.sax.TaggedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.XMLReaderUtils;
import org.xml.sax.ContentHandler;
@@ -67,6 +68,7 @@ public class XLIFF12Parser extends AbstractParser {
metadata.set(Metadata.CONTENT_TYPE, XLF_CONTENT_TYPE.toString());
final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+
XMLReaderUtils.parseSAX(
new CloseShieldInputStream(stream),
new OfflineContentHandler(new XLIFF12ContentHandler(xhtml, metadata)),
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/xliff/XLIFF12ParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/xliff/XLIFF12ParserTest.java
index 9f69ea5..d5e231b 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/xliff/XLIFF12ParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/xliff/XLIFF12ParserTest.java
@@ -16,18 +16,18 @@
*/
package org.apache.tika.parser.xliff;
-import static org.apache.tika.TikaTest.assertContains;
import static org.junit.Assert.assertEquals;
import java.io.InputStream;
+import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
import org.xml.sax.ContentHandler;
-public class XLIFF12ParserTest {
+public class XLIFF12ParserTest extends TikaTest {
@Test
public void testXLIFF12() throws Exception {
@@ -44,5 +44,11 @@ public class XLIFF12ParserTest {
}
}
+ @Test
+ public void testXLIFF12ToXMLHandler() throws Exception {
+ String xml = getXML("testXLIFF12.xlf").xml;
+ assertContains("<p lang=\"en\">Another trans-unit</p>", xml);
+ assertContains("<p lang=\"fr\">Un autre trans-unit</p>", xml);
+ }
}