You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2013/02/09 17:11:13 UTC

svn commit: r1444388 - /manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java

Author: kwright
Date: Sat Feb  9 16:11:13 2013
New Revision: 1444388

URL: http://svn.apache.org/r1444388
Log:
Hook up stateful handling of body escapes.

Modified:
    manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java

Modified: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java?rev=1444388&r1=1444387&r2=1444388&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java (original)
+++ manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java Sat Feb  9 16:11:13 2013
@@ -89,6 +89,13 @@ public class TagParseState extends Singl
   protected String currentAttrName = null;
   protected List<AttrNameValue> currentAttrList = null;
 
+  // Body decoding state
+
+  /** Whether we've seen an ampersand */
+  protected boolean inAmpersand = false;
+  /** Buffer of characters seen after ampersand. */
+  protected StringBuilder ampBuffer = new StringBuilder();
+
   protected static final Map<String,String> mapLookup = new HashMap<String,String>();
   static
   {
@@ -116,7 +123,14 @@ public class TagParseState extends Singl
     {
     case TAGPARSESTATE_NORMAL:
       if (thisChar == '<')
+      {
+        if (inAmpersand)
+        {
+          outputAmpBuffer();
+          inAmpersand = false;
+        }
         currentState = TAGPARSESTATE_SAWLEFTANGLE;
+      }
       else if (bTagDepth > 0 && thisChar == '>')
       {
         // Output current token, if any
@@ -134,8 +148,38 @@ public class TagParseState extends Singl
       }
       else if (bTagDepth == 0)
       {
-        if (noteNormalCharacter(thisChar))
-          return true;
+        if (inAmpersand)
+        {
+          if (thisChar == ';')
+          {
+            // We append the semi so that the output function can make good decisions
+            ampBuffer.append(thisChar);
+            if (outputAmpBuffer())
+              return true;
+            inAmpersand = false;
+          }
+          else if (isWhitespace(thisChar))
+          {
+            // Interpret ampersand buffer.
+            if (outputAmpBuffer())
+              return true;
+            inAmpersand = false;
+            if (noteNormalCharacter(thisChar))
+              return true;
+          }
+          else
+            ampBuffer.append(thisChar);
+        }
+        else if (thisChar == '&')
+        {
+          inAmpersand = true;
+          ampBuffer.setLength(0);
+        }
+        else
+        {
+          if (noteNormalCharacter(thisChar))
+            return true;
+        }
       }
       else
       {
@@ -773,6 +817,45 @@ public class TagParseState extends Singl
     return false;
   }
 
+  /** Interpret the ampersand buffer and emit the characters it stands for.
+  *@return true if noteNormalCharacter returned true for an emitted character, false otherwise. */
+  protected boolean outputAmpBuffer()
+    throws ManifoldCFException
+  {
+    if (ampBuffer.length() == 0 || (ampBuffer.length() == 1 && ampBuffer.charAt(0) == ';'))
+    {
+      // No entity name; probably a mistake, so output the whole thing, '&' included (it is never buffered)
+      if (noteNormalCharacter('&'))
+        return true;
+      return dumpValues(ampBuffer.toString());
+    }
+    else
+    {
+      // Is it a known entity?  Drop the optional trailing ';' first; if mapChunk yields null, nothing is emitted.
+      String entity = ampBuffer.toString();
+      if (entity.endsWith(";"))
+        entity = entity.substring(0,entity.length()-1);
+      String replacement = mapChunk(entity);
+      if (replacement != null)
+      {
+        if (dumpValues(replacement))
+          return true;
+      }
+      return false;
+    }
+  }
+  
+  /** Pass each character of a string, in order, to noteNormalCharacter, stopping at the first true result.
+  *@return true if noteNormalCharacter returned true for some character, false if all were consumed. */
+  protected boolean dumpValues(String value)
+    throws ManifoldCFException
+  {
+    int index = 0;
+    while (index < value.length() && !noteNormalCharacter(value.charAt(index)))
+      index++;
+    return index < value.length();
+  }
+  
   /** This method gets called for every tag.  Override this method to intercept tag begins.
   *@return true to halt further processing.
   */
@@ -882,12 +965,6 @@ public class TagParseState extends Singl
     return false;
   }
   
-  /** Decode body text */
-  protected static String bodyDecode(String input)
-  {
-    return attributeDecode(input);
-  }
-  
   /** Decode an html attribute */
   protected static String attributeDecode(String input)
   {