You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2013/02/07 04:16:42 UTC

svn commit: r1443307 - in /manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml: PretagParseState.java TagParseState.java

Author: kwright
Date: Thu Feb  7 03:16:42 2013
New Revision: 1443307

URL: http://svn.apache.org/viewvc?rev=1443307&view=rev
Log:
Fold pretag parse state into tag parse state.  It makes no sense to try to put xml preamble and entity processing in another module.  The code is not complete yet but the basic idea is fleshed out now.

Removed:
    manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/PretagParseState.java
Modified:
    manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java

Modified: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java?rev=1443307&r1=1443306&r2=1443307&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java (original)
+++ manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java Thu Feb  7 03:16:42 2013
@@ -22,7 +22,20 @@ import org.apache.manifoldcf.core.interf
 import org.apache.manifoldcf.core.system.Logging;
 import java.util.*;
 
-/** This class represents the basic, outermost tag parsing state. */
+/** This class represents a basic xml/html tag parser.
+* It is capable of recognizing the following xml and html constructs:
+*
+* '<' <token> <attrs> '>' ... '</' <token> '>'
+* '<' <token> <attrs> '/>'
+* '<?' <token> <attrs>  '?>'
+* '<![' [<token>] '[' ... ']]>'
+* '<!' <token> ... '>'
+* '<!--' ... '-->'
+*
+* Each of these, save the comment, has supporting protected methods that will be
+* called by the parsing engine.  Overriding these methods will allow an extending
+* class to perform higher-level data extraction and parsing.
+*/
 public class TagParseState extends SingleCharacterReceiver
 {
   protected static final int TAGPARSESTATE_NORMAL = 0;
@@ -82,7 +95,8 @@ public class TagParseState extends Singl
       if (thisChar == '<')
         currentState = TAGPARSESTATE_SAWLEFTBRACKET;
       else
-        noteNormalCharacter(thisChar);
+        if (noteNormalCharacter(thisChar))
+          return true;
       break;
     case TAGPARSESTATE_SAWLEFTBRACKET:
       if (thisChar == '!')
@@ -150,7 +164,8 @@ public class TagParseState extends Singl
           currentTagNameBuffer = null;
           currentAttrMap = new HashMap<String,String>();
           currentState = TAGPARSESTATE_IN_TAG_SAW_SLASH;
-          noteTag(currentTagName,currentAttrMap);
+          if (noteTag(currentTagName,currentAttrMap))
+            return true;
         }
         else
         {
@@ -168,7 +183,8 @@ public class TagParseState extends Singl
         }
         if (currentTagName != null)
         {
-          noteTag(currentTagName,currentAttrMap);
+          if (noteTag(currentTagName,currentAttrMap))
+            return true;
         }
         currentState = TAGPARSESTATE_NORMAL;
         currentTagName = null;
@@ -210,7 +226,8 @@ public class TagParseState extends Singl
           currentAttrMap.put(currentAttrName,"");
           currentAttrName = null;
         }
-        noteTag(currentTagName,currentAttrMap);
+        if (noteTag(currentTagName,currentAttrMap))
+          return true;
         currentState = TAGPARSESTATE_IN_TAG_SAW_SLASH;
       }
       else if (thisChar == '>')
@@ -226,7 +243,8 @@ public class TagParseState extends Singl
           currentAttrName = null;
         }
         currentState = TAGPARSESTATE_NORMAL;
-        noteTag(currentTagName,currentAttrMap);
+        if (noteTag(currentTagName,currentAttrMap))
+          return true;
         currentTagName = null;
         currentAttrMap = null;
       }
@@ -242,7 +260,8 @@ public class TagParseState extends Singl
       else if (thisChar == '>')
       {
         currentState = TAGPARSESTATE_NORMAL;
-        noteTag(currentTagName,currentAttrMap);
+        if (noteTag(currentTagName,currentAttrMap))
+          return true;
         currentTagName = null;
         currentAttrMap = null;
       }
@@ -251,7 +270,8 @@ public class TagParseState extends Singl
         currentState = TAGPARSESTATE_IN_TAG_SAW_SLASH;
         currentAttrMap.put(currentAttrName,"");
         currentAttrName = null;
-        noteTag(currentTagName,currentAttrMap);
+        if (noteTag(currentTagName,currentAttrMap))
+          return true;
       }
       else if (!isWhitespace(thisChar))
       {
@@ -276,7 +296,8 @@ public class TagParseState extends Singl
     case TAGPARSESTATE_IN_TAG_SAW_SLASH:
       if (thisChar == '>')
       {
-        noteEndTag(currentTagName);
+        if (noteEndTag(currentTagName))
+          return true;
         currentState = TAGPARSESTATE_NORMAL;
         currentTagName = null;
         currentAttrMap = null;
@@ -301,7 +322,8 @@ public class TagParseState extends Singl
         }
         if (currentTagName != null)
         {
-          noteEndTag(currentTagName);
+          if (noteEndTag(currentTagName))
+            return true;
         }
         currentTagName = null;
         currentState = TAGPARSESTATE_NORMAL;
@@ -345,7 +367,8 @@ public class TagParseState extends Singl
       else if (thisChar == '/')
       {
         currentAttrMap.put(currentAttrName,attributeDecode(currentValueBuffer.toString()));
-        noteTag(currentTagName,currentAttrMap);
+        if (noteTag(currentTagName,currentAttrMap))
+          return true;
         currentState = TAGPARSESTATE_IN_TAG_SAW_SLASH;
       }
       else if (thisChar == '>')
@@ -354,7 +377,8 @@ public class TagParseState extends Singl
         currentAttrName = null;
         currentValueBuffer = null;
         currentState = TAGPARSESTATE_NORMAL;
-        noteTag(currentTagName,currentAttrMap);
+        if (noteTag(currentTagName,currentAttrMap))
+          return true;
         currentTagName = null;
         currentAttrMap = null;
       }
@@ -367,21 +391,93 @@ public class TagParseState extends Singl
     return false;
   }
 
-  protected void noteTag(String tagName, Map<String,String> attributes)
+  /** This method gets called for every tag.  Override this method to intercept the start of a tag.
+  *@return true to halt further processing.
+  */
+  protected boolean noteTag(String tagName, Map<String,String> attributes)
     throws ManifoldCFException
   {
-    Logging.misc.debug(" Saw tag '"+tagName+"'");
+    if (Logging.misc.isDebugEnabled())
+      Logging.misc.debug(" Saw tag '"+tagName+"'");
+    return false;
   }
 
-  protected void noteEndTag(String tagName)
+  /** This method gets called for every end tag.  Override this method to intercept the end of a tag.
+  *@return true to halt further processing.
+  */
+  protected boolean noteEndTag(String tagName)
     throws ManifoldCFException
   {
-    Logging.misc.debug(" Saw end tag '"+tagName+"'");
+    if (Logging.misc.isDebugEnabled())
+      Logging.misc.debug(" Saw end tag '"+tagName+"'");
+    return false;
   }
 
-  protected void noteNormalCharacter(char thisChar)
+  /** This method is called for every <? ... ?> construct, or 'qtag'.
+  * Override it to intercept such constructs.
+  *@return true to halt further processing.
+  */
+  protected boolean noteQTag(String tagName, Map<String,String> attributes)
+    throws ManifoldCFException
+  {
+    if (Logging.misc.isDebugEnabled())
+      Logging.misc.debug(" Saw QTag '"+tagName+"'");
+    return false;
+  }
+  
+  /** This method is called for every <! <token> ... > construct, or 'btag'.
+  * Override it to intercept these.
+  *@return true to halt further processing.
+  */
+  protected boolean noteBTag(String tagName)
+    throws ManifoldCFException
+  {
+    if (Logging.misc.isDebugEnabled())
+      Logging.misc.debug(" Saw BTag '"+tagName+"'");
+    return false;
+  }
+  
+  /** This method is called for the end of every btag, or any time
+  * there's a naked '>' in the document.  Override it if you want to intercept these.
+  *@return true to halt further processing.
+  */
+  protected boolean noteEndBTag()
+    throws ManifoldCFException
+  {
+    Logging.misc.debug(" Saw end BTag");
+    return false;
+  }
+  
+  /** Called for the start of every cdata-like tag, e.g. <![ <token> [ ... ]]>
+  *@param token is the optional token between '<![' and the second '['; it is null when no token is present.
+  *@return true to halt further processing.
+  */
+  protected boolean noteEscaped(String token)
+    throws ManifoldCFException
+  {
+    if (Logging.misc.isDebugEnabled())
+      Logging.misc.debug(" Saw escaped '"+((token==null)?null:token)+"'");
+    return false;
+  }
+  
+  /** Called for the end of every cdata-like tag.
+  *@return true to halt further processing.
+  */
+  protected boolean noteEndEscaped()
     throws ManifoldCFException
   {
+    Logging.misc.debug(" Saw end escaped");
+    return false;
+  }
+  
+  /** This method gets called for every character that is not part of a tag etc.
+  * Override this method to intercept such characters.
+  *@return true to halt further processing.
+  */
+  protected boolean noteNormalCharacter(char thisChar)
+    throws ManifoldCFException
+  {
+    return false;
   }
   
   /** Decode body text */