You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2013/02/07 04:16:42 UTC
svn commit: r1443307 - in
/manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml:
PretagParseState.java TagParseState.java
Author: kwright
Date: Thu Feb 7 03:16:42 2013
New Revision: 1443307
URL: http://svn.apache.org/viewvc?rev=1443307&view=rev
Log:
Fold the pretag parse state into the tag parse state. It makes no sense to try to put XML preamble and entity processing in another module. The code is not complete yet, but the basic idea is fleshed out now.
Removed:
manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/PretagParseState.java
Modified:
manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java
Modified: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java?rev=1443307&r1=1443306&r2=1443307&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java (original)
+++ manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java Thu Feb 7 03:16:42 2013
@@ -22,7 +22,20 @@ import org.apache.manifoldcf.core.interf
import org.apache.manifoldcf.core.system.Logging;
import java.util.*;
-/** This class represents the basic, outermost tag parsing state. */
+/** This class represents a basic xml/html tag parser.
+* It is capable of recognizing the following xml and html constructs:
+*
+* '<' <token> <attrs> '>' ... '</' <token> '>'
+* '<' <token> <attrs> '/>'
+* '<?' <token> <attrs> '?>'
+* '<![' [<token>] '[' ... ']]>'
+* '<!' <token> ... '>'
+* '<!--' ... '-->'
+*
+* Each of these, save the comment, has supporting protected methods that will be
+* called by the parsing engine. Overriding these methods will allow an extending
+* class to perform higher-level data extraction and parsing.
+*/
public class TagParseState extends SingleCharacterReceiver
{
protected static final int TAGPARSESTATE_NORMAL = 0;
@@ -82,7 +95,8 @@ public class TagParseState extends Singl
if (thisChar == '<')
currentState = TAGPARSESTATE_SAWLEFTBRACKET;
else
- noteNormalCharacter(thisChar);
+ if (noteNormalCharacter(thisChar))
+ return true;
break;
case TAGPARSESTATE_SAWLEFTBRACKET:
if (thisChar == '!')
@@ -150,7 +164,8 @@ public class TagParseState extends Singl
currentTagNameBuffer = null;
currentAttrMap = new HashMap<String,String>();
currentState = TAGPARSESTATE_IN_TAG_SAW_SLASH;
- noteTag(currentTagName,currentAttrMap);
+ if (noteTag(currentTagName,currentAttrMap))
+ return true;
}
else
{
@@ -168,7 +183,8 @@ public class TagParseState extends Singl
}
if (currentTagName != null)
{
- noteTag(currentTagName,currentAttrMap);
+ if (noteTag(currentTagName,currentAttrMap))
+ return true;
}
currentState = TAGPARSESTATE_NORMAL;
currentTagName = null;
@@ -210,7 +226,8 @@ public class TagParseState extends Singl
currentAttrMap.put(currentAttrName,"");
currentAttrName = null;
}
- noteTag(currentTagName,currentAttrMap);
+ if (noteTag(currentTagName,currentAttrMap))
+ return true;
currentState = TAGPARSESTATE_IN_TAG_SAW_SLASH;
}
else if (thisChar == '>')
@@ -226,7 +243,8 @@ public class TagParseState extends Singl
currentAttrName = null;
}
currentState = TAGPARSESTATE_NORMAL;
- noteTag(currentTagName,currentAttrMap);
+ if (noteTag(currentTagName,currentAttrMap))
+ return true;
currentTagName = null;
currentAttrMap = null;
}
@@ -242,7 +260,8 @@ public class TagParseState extends Singl
else if (thisChar == '>')
{
currentState = TAGPARSESTATE_NORMAL;
- noteTag(currentTagName,currentAttrMap);
+ if (noteTag(currentTagName,currentAttrMap))
+ return true;
currentTagName = null;
currentAttrMap = null;
}
@@ -251,7 +270,8 @@ public class TagParseState extends Singl
currentState = TAGPARSESTATE_IN_TAG_SAW_SLASH;
currentAttrMap.put(currentAttrName,"");
currentAttrName = null;
- noteTag(currentTagName,currentAttrMap);
+ if (noteTag(currentTagName,currentAttrMap))
+ return true;
}
else if (!isWhitespace(thisChar))
{
@@ -276,7 +296,8 @@ public class TagParseState extends Singl
case TAGPARSESTATE_IN_TAG_SAW_SLASH:
if (thisChar == '>')
{
- noteEndTag(currentTagName);
+ if (noteEndTag(currentTagName))
+ return true;
currentState = TAGPARSESTATE_NORMAL;
currentTagName = null;
currentAttrMap = null;
@@ -301,7 +322,8 @@ public class TagParseState extends Singl
}
if (currentTagName != null)
{
- noteEndTag(currentTagName);
+ if (noteEndTag(currentTagName))
+ return true;
}
currentTagName = null;
currentState = TAGPARSESTATE_NORMAL;
@@ -345,7 +367,8 @@ public class TagParseState extends Singl
else if (thisChar == '/')
{
currentAttrMap.put(currentAttrName,attributeDecode(currentValueBuffer.toString()));
- noteTag(currentTagName,currentAttrMap);
+ if (noteTag(currentTagName,currentAttrMap))
+ return true;
currentState = TAGPARSESTATE_IN_TAG_SAW_SLASH;
}
else if (thisChar == '>')
@@ -354,7 +377,8 @@ public class TagParseState extends Singl
currentAttrName = null;
currentValueBuffer = null;
currentState = TAGPARSESTATE_NORMAL;
- noteTag(currentTagName,currentAttrMap);
+ if (noteTag(currentTagName,currentAttrMap))
+ return true;
currentTagName = null;
currentAttrMap = null;
}
@@ -367,21 +391,93 @@ public class TagParseState extends Singl
return false;
}
- protected void noteTag(String tagName, Map<String,String> attributes)
+ /** This method gets called for every tag. Override this method to intercept tag begins.
+ *@return true to halt further processing.
+ */
+ protected boolean noteTag(String tagName, Map<String,String> attributes)
throws ManifoldCFException
{
- Logging.misc.debug(" Saw tag '"+tagName+"'");
+ if (Logging.misc.isDebugEnabled())
+ Logging.misc.debug(" Saw tag '"+tagName+"'");
+ return false;
}
- protected void noteEndTag(String tagName)
+ /** This method gets called for every end tag. Override this method to intercept tag ends.
+ *@return true to halt further processing.
+ */
+ protected boolean noteEndTag(String tagName)
throws ManifoldCFException
{
- Logging.misc.debug(" Saw end tag '"+tagName+"'");
+ if (Logging.misc.isDebugEnabled())
+ Logging.misc.debug(" Saw end tag '"+tagName+"'");
+ return false;
}
- protected void noteNormalCharacter(char thisChar)
+ /** This method is called for every <? ... ?> construct, or 'qtag'.
+ * Override it to intercept such constructs.
+ *@return true to halt further processing.
+ */
+ protected boolean noteQTag(String tagName, Map<String,String> attributes)
+ throws ManifoldCFException
+ {
+ if (Logging.misc.isDebugEnabled())
+ Logging.misc.debug(" Saw QTag '"+tagName+"'");
+ return false;
+ }
+
+ /** This method is called for every <! <token> ... > construct, or 'btag'.
+ * Override it to intercept these.
+ *@return true to halt further processing.
+ */
+ protected boolean noteBTag(String tagName)
+ throws ManifoldCFException
+ {
+ if (Logging.misc.isDebugEnabled())
+ Logging.misc.debug(" Saw BTag '"+tagName+"'");
+ return false;
+ }
+
+ /** This method is called for the end of every btag, or any time
+ * there's a naked '>' in the document. Override it if you want to intercept these.
+ *@return true to halt further processing.
+ */
+ protected boolean noteEndBTag()
+ throws ManifoldCFException
+ {
+ Logging.misc.debug(" Saw end BTag");
+ return false;
+ }
+
+ /** Called for the start of every cdata-like tag, e.g. <![ <token> [ ... ]]>
+ *@param token is the optional token between '<![' and the following '['; it may be null if no token was present.
+ *@return true to halt further processing.
+ */
+ protected boolean noteEscaped(String token)
+ throws ManifoldCFException
+ {
+ if (Logging.misc.isDebugEnabled())
+ Logging.misc.debug(" Saw escaped '"+((token==null)?null:token)+"'");
+ return false;
+ }
+
+ /** Called for the end of every cdata-like tag.
+ *@return true to halt further processing.
+ */
+ protected boolean noteEndEscaped()
throws ManifoldCFException
{
+ Logging.misc.debug(" Saw end escaped");
+ return false;
+ }
+
+ /** This method gets called for every character that is not part of a tag etc.
+ * Override this method to intercept such characters.
+ *@return true to halt further processing.
+ */
+ protected boolean noteNormalCharacter(char thisChar)
+ throws ManifoldCFException
+ {
+ return false;
}
/** Decode body text */