You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@sling.apache.org by je...@apache.org on 2019/07/08 17:52:34 UTC

[sling-org-apache-sling-commons-html] 01/01: SLING-8566 support processing instruction and xml declaration

This is an automated email from the ASF dual-hosted git repository.

jeb pushed a commit to branch SLING-8566
in repository https://gitbox.apache.org/repos/asf/sling-org-apache-sling-commons-html.git

commit a8a3ef02a5c12d4cf2f019c83594dc0d412b5717
Author: JE Bailey <je...@apache.org>
AuthorDate: Mon Jul 8 13:34:12 2019 -0400

    SLING-8566 support processing instruction and xml declaration
---
 pom.xml                                            |  2 +-
 .../sling/commons/html/impl/parser/TagParser.java  |  7 ++-
 .../commons/html/internal/TagstreamHtmlParser.java |  5 +-
 .../sling/commons/html/util/HtmlSAXSupport.java    | 56 ++++++++++++++++------
 .../sling/commons/html/util/package-info.java      |  2 +-
 src/main/javacc/htmlParser.jj                      |  7 ++-
 .../sling/commons/html/TagstreamHtmlParseTest.java | 26 ++++++----
 7 files changed, 75 insertions(+), 30 deletions(-)

diff --git a/pom.xml b/pom.xml
index b886200..0388733 100644
--- a/pom.xml
+++ b/pom.xml
@@ -28,7 +28,7 @@
     </parent>
 
     <artifactId>org.apache.sling.commons.html</artifactId>
-    <version>1.1.1-SNAPSHOT</version>
+    <version>1.2.0-SNAPSHOT</version>
 
     <name>Apache Sling Commons HTML Utilities</name>
     <description>
diff --git a/src/main/java/org/apache/sling/commons/html/impl/parser/TagParser.java b/src/main/java/org/apache/sling/commons/html/impl/parser/TagParser.java
index 7e36805..541070d 100644
--- a/src/main/java/org/apache/sling/commons/html/impl/parser/TagParser.java
+++ b/src/main/java/org/apache/sling/commons/html/impl/parser/TagParser.java
@@ -14,10 +14,13 @@ public class TagParser implements TagParserConstants {
     for (t=first; t != cur.next; t = t.next) {
       if (t.specialToken != null) {
         Token tt=t.specialToken;
-        while (tt.specialToken != null)
+        while (tt.specialToken != null) {
           tt = tt.specialToken;
-        for (; tt != null; tt = tt.next)
+        }
+        while (tt != null) {
           sb.append(tt.image);
+          tt = tt.next;
+        }
       };
       sb.append(t.image);
     };
diff --git a/src/main/java/org/apache/sling/commons/html/internal/TagstreamHtmlParser.java b/src/main/java/org/apache/sling/commons/html/internal/TagstreamHtmlParser.java
index aeea507..f2f2101 100644
--- a/src/main/java/org/apache/sling/commons/html/internal/TagstreamHtmlParser.java
+++ b/src/main/java/org/apache/sling/commons/html/internal/TagstreamHtmlParser.java
@@ -52,7 +52,10 @@ public class TagstreamHtmlParser implements HtmlParser {
     @Override
     public Document parse(String systemId, InputStream stream, String encoding) throws IOException {
         final DOMBuilder builder = new DOMBuilder();
-        Html.stream(stream, encoding).forEach(new HtmlSAXSupport(builder, builder));
+        HtmlSAXSupport support = new HtmlSAXSupport(builder, builder);
+        support.startDocument();
+        Html.stream(stream, encoding).forEach(support);
+        support.endDocument();
         return builder.getDocument();
     }
 
diff --git a/src/main/java/org/apache/sling/commons/html/util/HtmlSAXSupport.java b/src/main/java/org/apache/sling/commons/html/util/HtmlSAXSupport.java
index 810a929..32182d2 100644
--- a/src/main/java/org/apache/sling/commons/html/util/HtmlSAXSupport.java
+++ b/src/main/java/org/apache/sling/commons/html/util/HtmlSAXSupport.java
@@ -13,6 +13,7 @@
  */
 package org.apache.sling.commons.html.util;
 
+import java.io.IOException;
 import java.util.Map;
 import java.util.function.Consumer;
 
@@ -25,19 +26,23 @@ import org.xml.sax.ext.Attributes2Impl;
 import org.xml.sax.ext.DefaultHandler2;
 import org.xml.sax.ext.LexicalHandler;
 
+/**
+ * Utility class used by the TagstreamHtmlParser to generate SAX events
+ * 
+ *
+ */
 public class HtmlSAXSupport implements Consumer<HtmlElement> {
-    
+
     private static final DefaultHandler2 handler = new DefaultHandler2();
-    
+
     private ContentHandler contentHandler = handler;
     private LexicalHandler lexicalHandler = handler;
-    private boolean initialized;
 
     public HtmlSAXSupport(ContentHandler ch, final LexicalHandler lh) {
         if (ch != null) {
             contentHandler = ch;
         }
-        if (lh != null ) {
+        if (lh != null) {
             lexicalHandler = lh;
         }
     }
@@ -45,10 +50,6 @@ public class HtmlSAXSupport implements Consumer<HtmlElement> {
     @Override
     public void accept(HtmlElement element) {
         try {
-            if (!initialized) {
-                contentHandler.startDocument();
-                initialized = true;
-            }
             String value = element.getValue();
             switch (element.getType()) {
             case COMMENT:
@@ -64,6 +65,12 @@ public class HtmlSAXSupport implements Consumer<HtmlElement> {
                 contentHandler.endDocument();
                 break;
             case START_TAG:
+                if (value.startsWith("?")) {
+                    if (!value.equalsIgnoreCase("?xml")) {
+                        contentHandler.processingInstruction(value, attrsToString(element.getAttributes()));
+                    }
+                    break;
+                }
                 lexicalHandler.startEntity(value);
                 contentHandler.startElement("", value, value, HtmlSAXSupport.convert(element.getAttributes()));
                 break;
@@ -74,17 +81,38 @@ public class HtmlSAXSupport implements Consumer<HtmlElement> {
                 break;
             }
         } catch (SAXException se) {
-            //log message
+            //se.printStackTrace();
         }
 
     }
-    
-    public static Attributes convert(Map<String,AttrValue> attributes) {
+
+    public static Attributes convert(Map<String, AttrValue> attributes) {
         Attributes2Impl response = new Attributes2Impl();
-        attributes.entrySet().forEach(attr ->
-            response.addAttribute("",attr.getKey(), attr.getKey(), "xsi:String", attr.getValue().toString())
-        );
+        attributes.entrySet().forEach(attr -> response.addAttribute("", attr.getKey(), attr.getKey(), "xsi:String",
+                attr.getValue().toString()));
         return response;
     }
 
+    public void startDocument() throws IOException {
+        try {
+            contentHandler.startDocument();
+        } catch (SAXException e) {
+            throw new IOException(e);
+        }
+    }
+
+    public void endDocument() throws IOException {
+        try {
+            contentHandler.endDocument();
+        } catch (SAXException e) {
+            throw new IOException(e);
+        }
+    }
+    
+    private String attrsToString(Map<String, AttrValue> attributes) {
+        StringBuilder sb = new StringBuilder();
+        attributes.entrySet().forEach(attr -> sb.append(attr.toString()));
+        return sb.toString();
+    }
+
 }
diff --git a/src/main/java/org/apache/sling/commons/html/util/package-info.java b/src/main/java/org/apache/sling/commons/html/util/package-info.java
index 23efc4f..7aeb4ff 100644
--- a/src/main/java/org/apache/sling/commons/html/util/package-info.java
+++ b/src/main/java/org/apache/sling/commons/html/util/package-info.java
@@ -16,7 +16,7 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-@Version("1.1.0")
+@Version("1.2.0")
 package org.apache.sling.commons.html.util;
 
 import org.osgi.annotation.versioning.Version;
diff --git a/src/main/javacc/htmlParser.jj b/src/main/javacc/htmlParser.jj
index 8a74ab7..f868e2b 100644
--- a/src/main/javacc/htmlParser.jj
+++ b/src/main/javacc/htmlParser.jj
@@ -34,10 +34,13 @@ public class TagParser {
     for (t=first; t != cur.next; t = t.next) {
       if (t.specialToken != null) {
         Token tt=t.specialToken;
-        while (tt.specialToken != null) 
+        while (tt.specialToken != null) {
           tt = tt.specialToken;
-        for (; tt != null; tt = tt.next) 
+        }
+        while (tt != null) { 
           sb.append(tt.image);
+          tt = tt.next;
+        }
       };
       sb.append(t.image);
     };
diff --git a/src/test/java/org/apache/sling/commons/html/TagstreamHtmlParseTest.java b/src/test/java/org/apache/sling/commons/html/TagstreamHtmlParseTest.java
index 04b8183..a3579df 100644
--- a/src/test/java/org/apache/sling/commons/html/TagstreamHtmlParseTest.java
+++ b/src/test/java/org/apache/sling/commons/html/TagstreamHtmlParseTest.java
@@ -18,8 +18,10 @@
 package org.apache.sling.commons.html;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotEquals;
 import static org.junit.Assert.assertTrue;
 
+import java.io.IOException;
 import java.io.InputStream;
 import java.text.ParseException;
 import java.util.function.Function;
@@ -29,6 +31,7 @@ import org.apache.sling.commons.html.internal.TagstreamHtmlParser;
 import org.apache.sling.commons.html.util.HtmlSAXSupport;
 import org.junit.Before;
 import org.junit.Test;
+import org.w3c.dom.Document;
 import org.xml.sax.Attributes;
 import org.xml.sax.SAXException;
 import org.xml.sax.ext.DefaultHandler2;
@@ -43,11 +46,11 @@ public class TagstreamHtmlParseTest {
     private HtmlParser htmlParser;
 
     /*
-     * Japanese (google) translation of 'Don't forget me this weekend!' 
-     * standard text of xml sample note.xml
+     * Japanese (Google) translation of 'Don't forget me this weekend!', the
+     * standard text of the XML sample note.xml
      */
-    private static final String MESSAGE ="この週末私を忘れないで!";
-    
+    private static final String MESSAGE = "この週末私を忘れないで!";
+
     @Before
     public void setUp() throws ParseException, Exception {
         InputStream is = this.getClass().getResourceAsStream("/demo.html");
@@ -79,7 +82,7 @@ public class TagstreamHtmlParseTest {
             @Override
             public void startElement(String uri, String localName, String qName, Attributes attributes)
                     throws SAXException {
-                //System.out.println(localName);
+                // System.out.println(localName);
             }
 
         }, new DefaultHandler2());
@@ -90,7 +93,7 @@ public class TagstreamHtmlParseTest {
     public void docParseTagTest3() throws Exception {
         long count = stream.flatMap(TagMapper.map((element, process) -> {
             if (element.containsAttribute("href")) {
-                //System.out.println(element.getAttributeValue("href"));
+                // System.out.println(element.getAttributeValue("href"));
                 process.next(element);
             }
         })).count();
@@ -121,10 +124,9 @@ public class TagstreamHtmlParseTest {
 
     @Test
     public void convertLinkAndPrintTest() throws Exception {
-        //stream.flatMap(CONVERT_LINKS).map(HtmlStreams.TO_HTML).forEach(System.out::print);
+        // stream.flatMap(CONVERT_LINKS).map(HtmlStreams.TO_HTML).forEach(System.out::print);
     }
 
-    
     @Before
     public void setup() {
 
@@ -142,7 +144,13 @@ public class TagstreamHtmlParseTest {
             }
         });
     }
-    
+
+    @Test
+    public void testDomSupport() throws SAXException, IOException {
+        Document dom = htmlParser.parse("123456", inputStream, "UTF-8");
+        assertNotEquals(dom, null);
+    }
+
     @Test
     public void testEncodingSupportFailure() throws SAXException {
         htmlParser.parse(inputStream, "ISO8859-1", new DefaultHandler() {