You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2013/02/07 23:47:00 UTC
svn commit: r1443764 -
/manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java
Author: kwright
Date: Thu Feb 7 22:46:59 2013
New Revision: 1443764
URL: http://svn.apache.org/r1443764
Log:
Add btag parsing
Modified:
manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java
Modified: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java?rev=1443764&r1=1443763&r2=1443764&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java (original)
+++ manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java Thu Feb 7 22:46:59 2013
@@ -39,7 +39,7 @@ import java.util.*;
public class TagParseState extends SingleCharacterReceiver
{
protected static final int TAGPARSESTATE_NORMAL = 0;
- protected static final int TAGPARSESTATE_SAWLEFTBRACKET = 1;
+ protected static final int TAGPARSESTATE_SAWLEFTANGLE = 1;
protected static final int TAGPARSESTATE_SAWEXCLAMATION = 2;
protected static final int TAGPARSESTATE_SAWDASH = 3;
protected static final int TAGPARSESTATE_IN_COMMENT = 4;
@@ -62,8 +62,12 @@ public class TagParseState extends Singl
protected static final int TAGPARSESTATE_IN_QTAG_SINGLE_QUOTES_ATTR_VALUE = 21;
protected static final int TAGPARSESTATE_IN_QTAG_DOUBLE_QUOTES_ATTR_VALUE = 22;
protected static final int TAGPARSESTATE_IN_QTAG_UNQUOTED_ATTR_VALUE = 23;
-
- // These still need to be added to the case statement
+ protected static final int TAGPARSESTATE_IN_BRACKET_TOKEN = 24;
+ protected static final int TAGPARSESTATE_NEED_FINAL_BRACKET = 25;
+ protected static final int TAGPARSESTATE_IN_BANG_TOKEN = 26;
+ protected static final int TAGPARSESTATE_IN_CDATA_BODY = 27;
+ protected static final int TAGPARSESTATE_SAWRIGHTBRACKET = 28;
+ protected static final int TAGPARSESTATE_SAWSECONDRIGHTBRACKET = 29;
protected int currentState = TAGPARSESTATE_NORMAL;
@@ -102,13 +106,56 @@ public class TagParseState extends Singl
{
case TAGPARSESTATE_NORMAL:
if (thisChar == '<')
- currentState = TAGPARSESTATE_SAWLEFTBRACKET;
+ currentState = TAGPARSESTATE_SAWLEFTANGLE;
+ else if (thisChar == '>')
+ {
+ if (noteEndBTag())
+ return true;
+ }
else
+ {
if (noteNormalCharacter(thisChar))
return true;
+ }
break;
- case TAGPARSESTATE_SAWLEFTBRACKET:
+ case TAGPARSESTATE_IN_CDATA_BODY:
+ if (thisChar == ']')
+ currentState = TAGPARSESTATE_SAWRIGHTBRACKET;
+ else
+ {
+ if (noteEscapedCharacter(thisChar))
+ return true;
+ }
+ break;
+
+ case TAGPARSESTATE_SAWRIGHTBRACKET:
+ if (thisChar == ']')
+ currentState = TAGPARSESTATE_SAWSECONDRIGHTBRACKET;
+ else
+ {
+ if (noteEscapedCharacter(']'))
+ return true;
+ if (noteEscapedCharacter(thisChar))
+ return true;
+ }
+ break;
+
+ case TAGPARSESTATE_SAWSECONDRIGHTBRACKET:
+ if (thisChar == '>')
+ currentState = TAGPARSESTATE_NORMAL;
+ else
+ {
+ if (noteEscapedCharacter(']'))
+ return true;
+ if (noteEscapedCharacter(']'))
+ return true;
+ if (noteEscapedCharacter(thisChar))
+ return true;
+ }
+ break;
+
+ case TAGPARSESTATE_SAWLEFTANGLE:
if (thisChar == '!')
currentState = TAGPARSESTATE_SAWEXCLAMATION;
else if (thisChar == '?')
@@ -133,9 +180,20 @@ public class TagParseState extends Singl
case TAGPARSESTATE_SAWEXCLAMATION:
if (thisChar == '-')
currentState = TAGPARSESTATE_SAWDASH;
+ else if (thisChar == '[')
+ {
+ currentState = TAGPARSESTATE_IN_BRACKET_TOKEN;
+ currentTagNameBuffer = new StringBuilder();
+ }
else
- currentState = TAGPARSESTATE_NORMAL;
+ {
+ currentState = TAGPARSESTATE_IN_BANG_TOKEN;
+ currentTagNameBuffer = new StringBuilder();
+ if (!isWhitespace(thisChar))
+ currentTagNameBuffer.append(thisChar);
+ }
break;
+
case TAGPARSESTATE_SAWDASH:
if (thisChar == '-')
currentState = TAGPARSESTATE_IN_COMMENT;
@@ -214,6 +272,70 @@ public class TagParseState extends Singl
currentTagNameBuffer.append(thisChar);
break;
+ case TAGPARSESTATE_IN_BRACKET_TOKEN:
+ if (isWhitespace(thisChar))
+ {
+ if (currentTagNameBuffer.length() > 0)
+ {
+ // Done with the bracket token!
+ currentTagName = currentTagNameBuffer.toString();
+ currentTagNameBuffer = null;
+ currentState = TAGPARSESTATE_NEED_FINAL_BRACKET;
+ }
+ }
+ else if (thisChar == '[')
+ {
+ currentTagName = currentTagNameBuffer.toString();
+ currentTagNameBuffer = null;
+ currentState = TAGPARSESTATE_IN_CDATA_BODY;
+ if (noteEscaped(currentTagName))
+ return true;
+ currentTagName = null;
+ }
+ else
+ currentTagNameBuffer.append(thisChar);
+ break;
+
+ case TAGPARSESTATE_NEED_FINAL_BRACKET:
+ if (thisChar == '[')
+ {
+ if (noteEscaped(currentTagName))
+ return true;
+ currentTagName = null;
+ currentState = TAGPARSESTATE_IN_CDATA_BODY;
+ }
+ break;
+
+ case TAGPARSESTATE_IN_BANG_TOKEN:
+ if (isWhitespace(thisChar))
+ {
+ if (currentTagNameBuffer.length() > 0)
+ {
+ // Done with bang token
+ currentTagName = currentTagNameBuffer.toString();
+ currentTagNameBuffer = null;
+ if (noteBTag(currentTagName))
+ return true;
+ currentTagName = null;
+ currentState = TAGPARSESTATE_NORMAL;
+ }
+ }
+ else if (thisChar == '>')
+ {
+ // Also done, but signal end too.
+ currentTagName = currentTagNameBuffer.toString();
+ currentTagNameBuffer = null;
+ if (noteBTag(currentTagName))
+ return true;
+ currentTagName = null;
+ currentState = TAGPARSESTATE_NORMAL;
+ if (noteEndBTag())
+ return true;
+ }
+ else
+ currentTagNameBuffer.append(thisChar);
+ break;
+
case TAGPARSESTATE_IN_TAG_NAME:
if (isWhitespace(thisChar))
{
@@ -640,14 +762,14 @@ public class TagParseState extends Singl
}
/** Called for the start of every cdata-like tag, e.g. <![ <token> [ ... ]]>
- *@param token may be null!!!
+ *@param token may be empty!!!
*@return true to halt further processing.
*/
protected boolean noteEscaped(String token)
throws ManifoldCFException
{
if (Logging.misc.isDebugEnabled())
- Logging.misc.debug(" Saw escaped '"+((token==null)?null:token)+"'");
+ Logging.misc.debug(" Saw escaped block '"+token+"'");
return false;
}
@@ -657,7 +779,7 @@ public class TagParseState extends Singl
protected boolean noteEndEscaped()
throws ManifoldCFException
{
- Logging.misc.debug(" Saw end escaped");
+ Logging.misc.debug(" Saw end of escaped block");
return false;
}
@@ -670,6 +792,17 @@ public class TagParseState extends Singl
{
return false;
}
+
+ /** This method gets called for every character that is found within an
+ * escape block, e.g. CDATA.
+ * Override this method to intercept such characters.
+ *@return true to halt further processing.
+ */
+ protected boolean noteEscapedCharacter(char thisChar)
+ throws ManifoldCFException
+ {
+ return false;
+ }
/** Decode body text */
protected static String bodyDecode(String input)