You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2013/02/07 10:58:50 UTC

svn commit: r1443377 - in /manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml: AttrNameValue.java TagParseState.java

Author: kwright
Date: Thu Feb  7 09:58:50 2013
New Revision: 1443377

URL: http://svn.apache.org/viewvc?rev=1443377&view=rev
Log:
Turn on case sensitivity, and start coding qtag recognition.

Added:
    manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/AttrNameValue.java   (with props)
Modified:
    manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java

Added: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/AttrNameValue.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/AttrNameValue.java?rev=1443377&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/AttrNameValue.java (added)
+++ manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/AttrNameValue.java Thu Feb  7 09:58:50 2013
@@ -0,0 +1,48 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.core.fuzzyml;
+
+import org.apache.manifoldcf.core.interfaces.*;
+import java.util.*;
+
+/** This class represents a single name/value pair from an XML/HTML
+* attribute. Both fields are final and are assigned only in the constructor.
+*/
+public class AttrNameValue
+{
+  protected final String name; // attribute name, stored exactly as passed in
+  protected final String value; // attribute value, stored exactly as passed in
+  /** Constructor. Stores both arguments as given; neither is validated or copied. */
+  public AttrNameValue(String name, String value)
+  {
+    this.name = name;
+    this.value = value;
+  }
+  /** @return the attribute name supplied to the constructor. */
+  public String getName()
+  {
+    return name;
+  }
+  /** @return the attribute value supplied to the constructor. */
+  public String getValue()
+  {
+    return value;
+  }
+
+}

Propchange: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/AttrNameValue.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/AttrNameValue.java
------------------------------------------------------------------------------
    svn:keywords = Id

Modified: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java?rev=1443377&r1=1443376&r2=1443377&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java (original)
+++ manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java Thu Feb  7 09:58:50 2013
@@ -54,7 +54,15 @@ public class TagParseState extends Singl
   protected static final int TAGPARSESTATE_IN_SINGLE_QUOTES_ATTR_VALUE = 13;
   protected static final int TAGPARSESTATE_IN_DOUBLE_QUOTES_ATTR_VALUE = 14;
   protected static final int TAGPARSESTATE_IN_UNQUOTED_ATTR_VALUE = 15;
-
+  protected static final int TAGPARSESTATE_IN_QTAG_NAME = 16;
+  protected static final int TAGPARSESTATE_IN_QTAG_ATTR_NAME = 17;
+  protected static final int TAGPARSESTATE_IN_QTAG_SAW_QUESTION = 18;
+  
+  // These states are declared but still need to be handled in the switch statement
+  protected static final int TAGPARSESTATE_IN_QTAG_ATTR_VALUE = 19;
+  protected static final int TAGPARSESTATE_IN_QTAG_ATTR_LOOKING_FOR_VALUE = 20;
+  protected static final int TAGPARSESTATE_IN_QTAG_SINGLE_QUOTES_ATTR_VALUE = 21;
+  protected static final int TAGPARSESTATE_IN_QTAG_DOUBLE_QUOTES_ATTR_VALUE = 22;
 
   protected int currentState = TAGPARSESTATE_NORMAL;
 
@@ -64,7 +72,7 @@ public class TagParseState extends Singl
 
   protected String currentTagName = null;
   protected String currentAttrName = null;
-  protected Map<String,String> currentAttrMap = null;
+  protected List<AttrNameValue> currentAttrList = null;
 
   protected static final Map<String,String> mapLookup = new HashMap<String,String>();
   static
@@ -88,7 +96,7 @@ public class TagParseState extends Singl
     throws ManifoldCFException
   {
     // At this level we want basic lexical analysis - that is, we deal with identifying tags and comments, that's it.
-    char thisCharLower = Character.toLowerCase(thisChar);
+    // We don't even attempt to map to lower case (names stay case-sensitive); that's how naive this is.
     switch (currentState)
     {
     case TAGPARSESTATE_NORMAL:
@@ -101,6 +109,11 @@ public class TagParseState extends Singl
     case TAGPARSESTATE_SAWLEFTBRACKET:
       if (thisChar == '!')
         currentState = TAGPARSESTATE_SAWEXCLAMATION;
+      else if (thisChar == '?')
+      {
+        currentState = TAGPARSESTATE_IN_QTAG_NAME;
+        currentTagNameBuffer = new StringBuilder();
+      }
       else if (thisChar == '/')
       {
         currentState = TAGPARSESTATE_IN_END_TAG_NAME;
@@ -111,7 +124,7 @@ public class TagParseState extends Singl
         currentState = TAGPARSESTATE_IN_TAG_NAME;
         currentTagNameBuffer = new StringBuilder();
         if (!isWhitespace(thisChar))
-          currentTagNameBuffer.append(thisCharLower);
+          currentTagNameBuffer.append(thisChar);
       }
       break;
     case TAGPARSESTATE_SAWEXCLAMATION:
@@ -143,6 +156,55 @@ public class TagParseState extends Singl
       else if (thisChar != '-')
         currentState = TAGPARSESTATE_IN_COMMENT;
       break;
+    case TAGPARSESTATE_IN_QTAG_NAME:
+      if (isWhitespace(thisChar))
+      {
+        if (currentTagNameBuffer.length() > 0)
+        {
+          // Done with the tag name!
+          currentTagName = currentTagNameBuffer.toString();
+          currentTagNameBuffer = null;
+          currentAttrList = new ArrayList<AttrNameValue>();
+          currentState = TAGPARSESTATE_IN_QTAG_ATTR_NAME;
+          currentAttrNameBuffer = new StringBuilder();
+        }
+      }
+      else if (thisChar == '?')
+      {
+        if (currentTagNameBuffer.length() > 0)
+        {
+          currentTagName = currentTagNameBuffer.toString();
+          currentTagNameBuffer = null;
+          currentAttrList = new ArrayList<AttrNameValue>();
+          currentState = TAGPARSESTATE_IN_QTAG_SAW_QUESTION;
+          // Wait until we see end > to signal tag end though
+        }
+        else
+        {
+          currentState = TAGPARSESTATE_NORMAL;
+          currentTagNameBuffer = null;
+        }
+      }
+      else if (thisChar == '>')
+      {
+        if (currentTagNameBuffer.length() > 0)
+        {
+          currentTagName = currentTagNameBuffer.toString();
+          currentTagNameBuffer = null;
+          currentAttrList = new ArrayList<AttrNameValue>();
+        }
+        if (currentTagName != null)
+        {
+          if (noteQTag(currentTagName,currentAttrList))
+            return true;
+        }
+        currentState = TAGPARSESTATE_NORMAL;
+        currentTagName = null;
+        currentAttrList = null;
+      }
+      else
+        currentTagNameBuffer.append(thisChar);
+      break;
     case TAGPARSESTATE_IN_TAG_NAME:
       if (isWhitespace(thisChar))
       {
@@ -151,7 +213,7 @@ public class TagParseState extends Singl
           // Done with the tag name!
           currentTagName = currentTagNameBuffer.toString();
           currentTagNameBuffer = null;
-          currentAttrMap = new HashMap<String,String>();
+          currentAttrList = new ArrayList<AttrNameValue>();
           currentState = TAGPARSESTATE_IN_ATTR_NAME;
           currentAttrNameBuffer = new StringBuilder();
         }
@@ -162,9 +224,9 @@ public class TagParseState extends Singl
         {
           currentTagName = currentTagNameBuffer.toString();
           currentTagNameBuffer = null;
-          currentAttrMap = new HashMap<String,String>();
+          currentAttrList = new ArrayList<AttrNameValue>();
           currentState = TAGPARSESTATE_IN_TAG_SAW_SLASH;
-          if (noteTag(currentTagName,currentAttrMap))
+          if (noteTag(currentTagName,currentAttrList))
             return true;
         }
         else
@@ -179,19 +241,19 @@ public class TagParseState extends Singl
         {
           currentTagName = currentTagNameBuffer.toString();
           currentTagNameBuffer = null;
-          currentAttrMap = new HashMap<String,String>();
+          currentAttrList = new ArrayList<AttrNameValue>();
         }
         if (currentTagName != null)
         {
-          if (noteTag(currentTagName,currentAttrMap))
+          if (noteTag(currentTagName,currentAttrList))
             return true;
         }
         currentState = TAGPARSESTATE_NORMAL;
         currentTagName = null;
-        currentAttrMap = null;
+        currentAttrList = null;
       }
       else
-        currentTagNameBuffer.append(thisCharLower);
+        currentTagNameBuffer.append(thisChar);
       break;
     case TAGPARSESTATE_IN_ATTR_NAME:
       if (isWhitespace(thisChar))
@@ -223,10 +285,10 @@ public class TagParseState extends Singl
         }
         if (currentAttrName != null)
         {
-          currentAttrMap.put(currentAttrName,"");
+          currentAttrList.add(new AttrNameValue(currentAttrName,""));
           currentAttrName = null;
         }
-        if (noteTag(currentTagName,currentAttrMap))
+        if (noteTag(currentTagName,currentAttrList))
           return true;
         currentState = TAGPARSESTATE_IN_TAG_SAW_SLASH;
       }
@@ -239,17 +301,17 @@ public class TagParseState extends Singl
         }
         if (currentAttrName != null)
         {
-          currentAttrMap.put(currentAttrName,"");
+          currentAttrList.add(new AttrNameValue(currentAttrName,""));
           currentAttrName = null;
         }
         currentState = TAGPARSESTATE_NORMAL;
-        if (noteTag(currentTagName,currentAttrMap))
+        if (noteTag(currentTagName,currentAttrList))
           return true;
         currentTagName = null;
-        currentAttrMap = null;
+        currentAttrList = null;
       }
       else
-        currentAttrNameBuffer.append(thisCharLower);
+        currentAttrNameBuffer.append(thisChar);
       break;
     case TAGPARSESTATE_IN_ATTR_LOOKING_FOR_VALUE:
       if (thisChar == '=')
@@ -260,25 +322,25 @@ public class TagParseState extends Singl
       else if (thisChar == '>')
       {
         currentState = TAGPARSESTATE_NORMAL;
-        if (noteTag(currentTagName,currentAttrMap))
+        if (noteTag(currentTagName,currentAttrList))
           return true;
         currentTagName = null;
-        currentAttrMap = null;
+        currentAttrList = null;
       }
       else if (thisChar == '/')
       {
         currentState = TAGPARSESTATE_IN_TAG_SAW_SLASH;
-        currentAttrMap.put(currentAttrName,"");
+        currentAttrList.add(new AttrNameValue(currentAttrName,""));
         currentAttrName = null;
-        if (noteTag(currentTagName,currentAttrMap))
+        if (noteTag(currentTagName,currentAttrList))
           return true;
       }
       else if (!isWhitespace(thisChar))
       {
-        currentAttrMap.put(currentAttrName,"");
+        currentAttrList.add(new AttrNameValue(currentAttrName,""));
         currentState = TAGPARSESTATE_IN_ATTR_NAME;
         currentAttrNameBuffer = new StringBuilder();
-        currentAttrNameBuffer.append(thisCharLower);
+        currentAttrNameBuffer.append(thisChar);
         currentAttrName = null;
       }
       break;
@@ -293,6 +355,16 @@ public class TagParseState extends Singl
         currentValueBuffer.append(thisChar);
       }
       break;
+    case TAGPARSESTATE_IN_QTAG_SAW_QUESTION:
+      if (thisChar == '>')
+      {
+        if (noteQTag(currentTagName,currentAttrList))
+          return true;
+        currentState = TAGPARSESTATE_NORMAL;
+        currentTagName = null;
+        currentAttrList = null;
+      }
+      break;
     case TAGPARSESTATE_IN_TAG_SAW_SLASH:
       if (thisChar == '>')
       {
@@ -300,7 +372,7 @@ public class TagParseState extends Singl
           return true;
         currentState = TAGPARSESTATE_NORMAL;
         currentTagName = null;
-        currentAttrMap = null;
+        currentAttrList = null;
       }
       break;
     case TAGPARSESTATE_IN_END_TAG_NAME:
@@ -329,12 +401,12 @@ public class TagParseState extends Singl
         currentState = TAGPARSESTATE_NORMAL;
       }
       else if (currentTagNameBuffer != null)
-        currentTagNameBuffer.append(thisCharLower);
+        currentTagNameBuffer.append(thisChar);
       break;
     case TAGPARSESTATE_IN_SINGLE_QUOTES_ATTR_VALUE:
       if (thisChar == '\'' || thisChar == '\n' || thisChar == '\r')
       {
-        currentAttrMap.put(currentAttrName,attributeDecode(currentValueBuffer.toString()));
+        currentAttrList.add(new AttrNameValue(currentAttrName,attributeDecode(currentValueBuffer.toString())));
         currentAttrName = null;
         currentValueBuffer = null;
         currentState = TAGPARSESTATE_IN_ATTR_NAME;
@@ -346,7 +418,7 @@ public class TagParseState extends Singl
     case TAGPARSESTATE_IN_DOUBLE_QUOTES_ATTR_VALUE:
       if (thisChar == '"' || thisChar == '\n' || thisChar == '\r')
       {
-        currentAttrMap.put(currentAttrName,attributeDecode(currentValueBuffer.toString()));
+        currentAttrList.add(new AttrNameValue(currentAttrName,attributeDecode(currentValueBuffer.toString())));
         currentAttrName = null;
         currentValueBuffer = null;
         currentState = TAGPARSESTATE_IN_ATTR_NAME;
@@ -358,7 +430,7 @@ public class TagParseState extends Singl
     case TAGPARSESTATE_IN_UNQUOTED_ATTR_VALUE:
       if (isWhitespace(thisChar))
       {
-        currentAttrMap.put(currentAttrName,attributeDecode(currentValueBuffer.toString()));
+        currentAttrList.add(new AttrNameValue(currentAttrName,attributeDecode(currentValueBuffer.toString())));
         currentAttrName = null;
         currentValueBuffer = null;
         currentState = TAGPARSESTATE_IN_ATTR_NAME;
@@ -366,21 +438,21 @@ public class TagParseState extends Singl
       }
       else if (thisChar == '/')
       {
-        currentAttrMap.put(currentAttrName,attributeDecode(currentValueBuffer.toString()));
-        if (noteTag(currentTagName,currentAttrMap))
+        currentAttrList.add(new AttrNameValue(currentAttrName,attributeDecode(currentValueBuffer.toString())));
+        if (noteTag(currentTagName,currentAttrList))
           return true;
         currentState = TAGPARSESTATE_IN_TAG_SAW_SLASH;
       }
       else if (thisChar == '>')
       {
-        currentAttrMap.put(currentAttrName,attributeDecode(currentValueBuffer.toString()));
+        currentAttrList.add(new AttrNameValue(currentAttrName,attributeDecode(currentValueBuffer.toString())));
         currentAttrName = null;
         currentValueBuffer = null;
         currentState = TAGPARSESTATE_NORMAL;
-        if (noteTag(currentTagName,currentAttrMap))
+        if (noteTag(currentTagName,currentAttrList))
           return true;
         currentTagName = null;
-        currentAttrMap = null;
+        currentAttrList = null;
       }
       else
         currentValueBuffer.append(thisChar);
@@ -394,7 +466,7 @@ public class TagParseState extends Singl
   /** This method gets called for every tag.  Override this method to intercept tag begins.
   *@return true to halt further processing.
   */
-  protected boolean noteTag(String tagName, Map<String,String> attributes)
+  protected boolean noteTag(String tagName, List<AttrNameValue> attributes)
     throws ManifoldCFException
   {
     if (Logging.misc.isDebugEnabled())
@@ -417,7 +489,7 @@ public class TagParseState extends Singl
   * Override it to intercept such constructs.
   *@return true to halt further processing.
   */
-  protected boolean noteQTag(String tagName, Map<String,String> attributes)
+  protected boolean noteQTag(String tagName, List<AttrNameValue> attributes)
     throws ManifoldCFException
   {
     if (Logging.misc.isDebugEnabled())