You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2013/02/06 01:46:55 UTC

svn commit: r1442817 - in /manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml: ./ BasicParseState.java

Author: kwright
Date: Wed Feb  6 00:46:55 2013
New Revision: 1442817

URL: http://svn.apache.org/viewvc?rev=1442817&view=rev
Log:
Add a partially reworked basic parse state.  Next step: add the logic that autodetects the encoding, but falls back to a passed-in default value if there is no encoding preamble.

Added:
    manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/
    manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/BasicParseState.java
      - copied, changed from r1442812, manifoldcf/branches/CONNECTORS-633/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/BasicParseState.java

Copied: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/BasicParseState.java (from r1442812, manifoldcf/branches/CONNECTORS-633/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/BasicParseState.java)
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/BasicParseState.java?p2=manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/BasicParseState.java&p1=manifoldcf/branches/CONNECTORS-633/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/BasicParseState.java&r1=1442812&r2=1442817&rev=1442817&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-633/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/BasicParseState.java (original)
+++ manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/BasicParseState.java Wed Feb  6 00:46:55 2013
@@ -16,10 +16,10 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
-package org.apache.manifoldcf.crawler.connectors.webcrawler;
+package org.apache.manifoldcf.core.fuzzyml;
 
 import org.apache.manifoldcf.core.interfaces.*;
-import org.apache.manifoldcf.crawler.system.Logging;
+import org.apache.manifoldcf.core.system.Logging;
 import java.util.*;
 
 /** This class represents the basic, outermost parse state. */
@@ -51,9 +51,9 @@ public class BasicParseState
 
   protected String currentTagName = null;
   protected String currentAttrName = null;
-  protected Map currentAttrMap = null;
+  protected Map<String,String> currentAttrMap = null;
 
-  protected static final Map mapLookup = new HashMap();
+  protected static final Map<String,String> mapLookup = new HashMap<String,String>();
   static
   {
     mapLookup.put("amp","&");
@@ -133,7 +133,7 @@ public class BasicParseState
           // Done with the tag name!
           currentTagName = currentTagNameBuffer.toString();
           currentTagNameBuffer = null;
-          currentAttrMap = new HashMap();
+          currentAttrMap = new HashMap<String,String>();
           currentState = BASICPARSESTATE_IN_ATTR_NAME;
           currentAttrNameBuffer = new StringBuilder();
         }
@@ -144,7 +144,7 @@ public class BasicParseState
         {
           currentTagName = currentTagNameBuffer.toString();
           currentTagNameBuffer = null;
-          currentAttrMap = new HashMap();
+          currentAttrMap = new HashMap<String,String>();
           currentState = BASICPARSESTATE_IN_TAG_SAW_SLASH;
           noteTag(currentTagName,currentAttrMap);
         }
@@ -160,7 +160,7 @@ public class BasicParseState
         {
           currentTagName = currentTagNameBuffer.toString();
           currentTagNameBuffer = null;
-          currentAttrMap = new HashMap();
+          currentAttrMap = new HashMap<String,String>();
         }
         if (currentTagName != null)
         {
@@ -308,7 +308,7 @@ public class BasicParseState
     case BASICPARSESTATE_IN_SINGLE_QUOTES_ATTR_VALUE:
       if (thisChar == '\'' || thisChar == '\n' || thisChar == '\r')
       {
-        currentAttrMap.put(currentAttrName,htmlAttributeDecode(currentValueBuffer.toString()));
+        currentAttrMap.put(currentAttrName,attributeDecode(currentValueBuffer.toString()));
         currentAttrName = null;
         currentValueBuffer = null;
         currentState = BASICPARSESTATE_IN_ATTR_NAME;
@@ -320,7 +320,7 @@ public class BasicParseState
     case BASICPARSESTATE_IN_DOUBLE_QUOTES_ATTR_VALUE:
       if (thisChar == '"' || thisChar == '\n' || thisChar == '\r')
       {
-        currentAttrMap.put(currentAttrName,htmlAttributeDecode(currentValueBuffer.toString()));
+        currentAttrMap.put(currentAttrName,attributeDecode(currentValueBuffer.toString()));
         currentAttrName = null;
         currentValueBuffer = null;
         currentState = BASICPARSESTATE_IN_ATTR_NAME;
@@ -332,7 +332,7 @@ public class BasicParseState
     case BASICPARSESTATE_IN_UNQUOTED_ATTR_VALUE:
       if (isHTMLWhitespace(thisChar))
       {
-        currentAttrMap.put(currentAttrName,htmlAttributeDecode(currentValueBuffer.toString()));
+        currentAttrMap.put(currentAttrName,attributeDecode(currentValueBuffer.toString()));
         currentAttrName = null;
         currentValueBuffer = null;
         currentState = BASICPARSESTATE_IN_ATTR_NAME;
@@ -340,13 +340,13 @@ public class BasicParseState
       }
       else if (thisChar == '/')
       {
-        currentAttrMap.put(currentAttrName,htmlAttributeDecode(currentValueBuffer.toString()));
+        currentAttrMap.put(currentAttrName,attributeDecode(currentValueBuffer.toString()));
         noteTag(currentTagName,currentAttrMap);
         currentState = BASICPARSESTATE_IN_TAG_SAW_SLASH;
       }
       else if (thisChar == '>')
       {
-        currentAttrMap.put(currentAttrName,htmlAttributeDecode(currentValueBuffer.toString()));
+        currentAttrMap.put(currentAttrName,attributeDecode(currentValueBuffer.toString()));
         currentAttrName = null;
         currentValueBuffer = null;
         currentState = BASICPARSESTATE_NORMAL;
@@ -362,16 +362,16 @@ public class BasicParseState
     }
   }
 
-  protected void noteTag(String tagName, Map attributes)
+  protected void noteTag(String tagName, Map<String,String> attributes)
     throws ManifoldCFException
   {
-    Logging.connectors.debug(" Saw tag '"+tagName+"'");
+    Logging.misc.debug(" Saw tag '"+tagName+"'");
   }
 
   protected void noteEndTag(String tagName)
     throws ManifoldCFException
   {
-    Logging.connectors.debug(" Saw end tag '"+tagName+"'");
+    Logging.misc.debug(" Saw end tag '"+tagName+"'");
   }
 
   protected void noteNormalCharacter(char thisChar)
@@ -385,14 +385,14 @@ public class BasicParseState
     // Does nothing
   }
 
-  /** Decode html body text */
-  protected static String htmlBodyDecode(String input)
+  /** Decode body text */
+  protected static String bodyDecode(String input)
   {
-    return htmlAttributeDecode(input);
+    return attributeDecode(input);
   }
   
   /** Decode an html attribute */
-  protected static String htmlAttributeDecode(String input)
+  protected static String attributeDecode(String input)
   {
     StringBuilder output = new StringBuilder();
     int i = 0;
@@ -438,7 +438,7 @@ public class BasicParseState
       }
     }
     else
-      return (String)mapLookup.get(input);
+      return mapLookup.get(input);
   }
 
   /** Is a character HTML whitespace? */