You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2013/02/09 17:11:13 UTC
svn commit: r1444388 -
/manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java
Author: kwright
Date: Sat Feb 9 16:11:13 2013
New Revision: 1444388
URL: http://svn.apache.org/r1444388
Log:
Hook up stateful handling of body escapes.
Modified:
manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java
Modified: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java?rev=1444388&r1=1444387&r2=1444388&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java (original)
+++ manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java Sat Feb 9 16:11:13 2013
@@ -89,6 +89,13 @@ public class TagParseState extends Singl
protected String currentAttrName = null;
protected List<AttrNameValue> currentAttrList = null;
+ // Body decoding state
+
+ /** True while body text is inside an unfinished '&' sequence. */
+ protected boolean inAmpersand = false;
+ /** Characters seen after the '&' (the '&' itself is not stored). */
+ protected StringBuilder ampBuffer = new StringBuilder();
+
protected static final Map<String,String> mapLookup = new HashMap<String,String>();
static
{
@@ -116,7 +123,14 @@ public class TagParseState extends Singl
{
case TAGPARSESTATE_NORMAL:
if (thisChar == '<')
+ {
+ if (inAmpersand)
+ {
+ outputAmpBuffer();
+ inAmpersand = false;
+ }
currentState = TAGPARSESTATE_SAWLEFTANGLE;
+ }
else if (bTagDepth > 0 && thisChar == '>')
{
// Output current token, if any
@@ -134,8 +148,38 @@ public class TagParseState extends Singl
}
else if (bTagDepth == 0)
{
- if (noteNormalCharacter(thisChar))
- return true;
+ if (inAmpersand)
+ {
+ if (thisChar == ';')
+ {
+ // We append the semi so that the output function can make good decisions
+ ampBuffer.append(thisChar);
+ if (outputAmpBuffer())
+ return true;
+ inAmpersand = false;
+ }
+ else if (isWhitespace(thisChar))
+ {
+ // Interpret ampersand buffer.
+ if (outputAmpBuffer())
+ return true;
+ inAmpersand = false;
+ if (noteNormalCharacter(thisChar))
+ return true;
+ }
+ else
+ ampBuffer.append(thisChar);
+ }
+ else if (thisChar == '&')
+ {
+ inAmpersand = true;
+ ampBuffer.setLength(0);
+ }
+ else
+ {
+ if (noteNormalCharacter(thisChar))
+ return true;
+ }
}
else
{
@@ -773,6 +817,45 @@ public class TagParseState extends Singl
return false;
}
+ /** Interpret the pending ampersand buffer and emit its decoded text.
+ * The buffer holds the characters seen after an '&' in body text, with
+ * a trailing ';' when the caller saw one.  This method does not reset
+ * inAmpersand; callers do that after it returns false.
+ *@return true as soon as noteNormalCharacter returns true, false otherwise.
+ */
+ protected boolean outputAmpBuffer()
+   throws ManifoldCFException
+ {
+   if (ampBuffer.length() == 0 || (ampBuffer.length() == 1 && ampBuffer.charAt(0) == ';'))
+   {
+     // No entity name; probably a stray '&', so emit it literally.  The '&'
+     // itself is never stored in the buffer, so it must be written here.
+     return noteNormalCharacter('&') || dumpValues(ampBuffer.toString());
+   }
+   else
+   {
+     // Strip the optional trailing ';' before looking the entity up.
+     String entity = ampBuffer.toString();
+     if (entity.endsWith(";"))
+       entity = entity.substring(0,entity.length()-1);
+     // NOTE(review): when mapChunk returns null nothing is emitted, so the
+     // sequence is dropped; confirm that is intended for unknown entities.
+     String replacement = mapChunk(entity);
+     return replacement != null && dumpValues(replacement);
+   }
+ }
+
+ /** Feed each character of value, in order, to noteNormalCharacter.
+ *@return true as soon as a call returns true; false if none does. */
+ protected boolean dumpValues(String value)
+   throws ManifoldCFException
+ {
+   int index = 0;
+   while (index < value.length() && !noteNormalCharacter(value.charAt(index)))
+     index++;
+   return index < value.length();
+ }
+
/** This method gets called for every tag. Override this method to intercept tag begins.
*@return true to halt further processing.
*/
@@ -882,12 +965,6 @@ public class TagParseState extends Singl
return false;
}
- /** Decode body text */
- protected static String bodyDecode(String input)
- {
- return attributeDecode(input);
- }
-
/** Decode an html attribute */
protected static String attributeDecode(String input)
{