You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2013/02/07 10:58:50 UTC
svn commit: r1443377 - in
/manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml:
AttrNameValue.java TagParseState.java
Author: kwright
Date: Thu Feb 7 09:58:50 2013
New Revision: 1443377
URL: http://svn.apache.org/viewvc?rev=1443377&view=rev
Log:
Turn on case sensitivity, and start coding qtag recognition.
Added:
manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/AttrNameValue.java (with props)
Modified:
manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java
Added: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/AttrNameValue.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/AttrNameValue.java?rev=1443377&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/AttrNameValue.java (added)
+++ manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/AttrNameValue.java Thu Feb 7 09:58:50 2013
@@ -0,0 +1,48 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.core.fuzzyml;
+
+import org.apache.manifoldcf.core.interfaces.*;
+import java.util.*;
+
+/** This class represents a name/value pair from an XML/HTML attribute.
+* Both fields are final and set once by the constructor, which stores them as given.
+*/
+public class AttrNameValue
+{
+ protected final String name; // attribute name, exactly as passed to the constructor
+ protected final String value; // attribute value, exactly as passed to the constructor
+
+ public AttrNameValue(String name, String value) // no validation, trimming, or case mapping is done here
+ {
+ this.name = name;
+ this.value = value;
+ }
+
+ public String getName() // returns the name supplied at construction
+ {
+ return name;
+ }
+
+ public String getValue() // returns the value supplied at construction
+ {
+ return value;
+ }
+
+}
Propchange: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/AttrNameValue.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/AttrNameValue.java
------------------------------------------------------------------------------
svn:keywords = Id
Modified: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java?rev=1443377&r1=1443376&r2=1443377&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java (original)
+++ manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java Thu Feb 7 09:58:50 2013
@@ -54,7 +54,15 @@ public class TagParseState extends Singl
protected static final int TAGPARSESTATE_IN_SINGLE_QUOTES_ATTR_VALUE = 13;
protected static final int TAGPARSESTATE_IN_DOUBLE_QUOTES_ATTR_VALUE = 14;
protected static final int TAGPARSESTATE_IN_UNQUOTED_ATTR_VALUE = 15;
-
+ protected static final int TAGPARSESTATE_IN_QTAG_NAME = 16;
+ protected static final int TAGPARSESTATE_IN_QTAG_ATTR_NAME = 17;
+ protected static final int TAGPARSESTATE_IN_QTAG_SAW_QUESTION = 18;
+
+ // These still need to be added to the case statement
+ protected static final int TAGPARSESTATE_IN_QTAG_ATTR_VALUE = 19;
+ protected static final int TAGPARSESTATE_IN_QTAG_ATTR_LOOKING_FOR_VALUE = 20;
+ protected static final int TAGPARSESTATE_IN_QTAG_SINGLE_QUOTES_ATTR_VALUE = 21;
+ protected static final int TAGPARSESTATE_IN_QTAG_DOUBLE_QUOTES_ATTR_VALUE = 22;
protected int currentState = TAGPARSESTATE_NORMAL;
@@ -64,7 +72,7 @@ public class TagParseState extends Singl
protected String currentTagName = null;
protected String currentAttrName = null;
- protected Map<String,String> currentAttrMap = null;
+ protected List<AttrNameValue> currentAttrList = null;
protected static final Map<String,String> mapLookup = new HashMap<String,String>();
static
@@ -88,7 +96,7 @@ public class TagParseState extends Singl
throws ManifoldCFException
{
// At this level we want basic lexical analysis - that is, we deal with identifying tags and comments, that's it.
- char thisCharLower = Character.toLowerCase(thisChar);
+ // We don't even attempt to map to lower case; that's how naive this is.
switch (currentState)
{
case TAGPARSESTATE_NORMAL:
@@ -101,6 +109,11 @@ public class TagParseState extends Singl
case TAGPARSESTATE_SAWLEFTBRACKET:
if (thisChar == '!')
currentState = TAGPARSESTATE_SAWEXCLAMATION;
+ else if (thisChar == '?')
+ {
+ currentState = TAGPARSESTATE_IN_QTAG_NAME;
+ currentTagNameBuffer = new StringBuilder();
+ }
else if (thisChar == '/')
{
currentState = TAGPARSESTATE_IN_END_TAG_NAME;
@@ -111,7 +124,7 @@ public class TagParseState extends Singl
currentState = TAGPARSESTATE_IN_TAG_NAME;
currentTagNameBuffer = new StringBuilder();
if (!isWhitespace(thisChar))
- currentTagNameBuffer.append(thisCharLower);
+ currentTagNameBuffer.append(thisChar);
}
break;
case TAGPARSESTATE_SAWEXCLAMATION:
@@ -143,6 +156,55 @@ public class TagParseState extends Singl
else if (thisChar != '-')
currentState = TAGPARSESTATE_IN_COMMENT;
break;
+ case TAGPARSESTATE_IN_QTAG_NAME:
+ if (isWhitespace(thisChar))
+ {
+ if (currentTagNameBuffer.length() > 0)
+ {
+ // Done with the tag name!
+ currentTagName = currentTagNameBuffer.toString();
+ currentTagNameBuffer = null;
+ currentAttrList = new ArrayList<AttrNameValue>();
+ currentState = TAGPARSESTATE_IN_QTAG_ATTR_NAME;
+ currentAttrNameBuffer = new StringBuilder();
+ }
+ }
+ else if (thisChar == '?')
+ {
+ if (currentTagNameBuffer.length() > 0)
+ {
+ currentTagName = currentTagNameBuffer.toString();
+ currentTagNameBuffer = null;
+ currentAttrList = new ArrayList<AttrNameValue>();
+ currentState = TAGPARSESTATE_IN_QTAG_SAW_QUESTION;
+ // Wait until we see end > to signal tag end though
+ }
+ else
+ {
+ currentState = TAGPARSESTATE_NORMAL;
+ currentTagNameBuffer = null;
+ }
+ }
+ else if (thisChar == '>')
+ {
+ if (currentTagNameBuffer.length() > 0)
+ {
+ currentTagName = currentTagNameBuffer.toString();
+ currentTagNameBuffer = null;
+ currentAttrList = new ArrayList<AttrNameValue>();
+ }
+ if (currentTagName != null)
+ {
+ if (noteQTag(currentTagName,currentAttrList))
+ return true;
+ }
+ currentState = TAGPARSESTATE_NORMAL;
+ currentTagName = null;
+ currentAttrList = null;
+ }
+ else
+ currentTagNameBuffer.append(thisChar);
+ break;
case TAGPARSESTATE_IN_TAG_NAME:
if (isWhitespace(thisChar))
{
@@ -151,7 +213,7 @@ public class TagParseState extends Singl
// Done with the tag name!
currentTagName = currentTagNameBuffer.toString();
currentTagNameBuffer = null;
- currentAttrMap = new HashMap<String,String>();
+ currentAttrList = new ArrayList<AttrNameValue>();
currentState = TAGPARSESTATE_IN_ATTR_NAME;
currentAttrNameBuffer = new StringBuilder();
}
@@ -162,9 +224,9 @@ public class TagParseState extends Singl
{
currentTagName = currentTagNameBuffer.toString();
currentTagNameBuffer = null;
- currentAttrMap = new HashMap<String,String>();
+ currentAttrList = new ArrayList<AttrNameValue>();
currentState = TAGPARSESTATE_IN_TAG_SAW_SLASH;
- if (noteTag(currentTagName,currentAttrMap))
+ if (noteTag(currentTagName,currentAttrList))
return true;
}
else
@@ -179,19 +241,19 @@ public class TagParseState extends Singl
{
currentTagName = currentTagNameBuffer.toString();
currentTagNameBuffer = null;
- currentAttrMap = new HashMap<String,String>();
+ currentAttrList = new ArrayList<AttrNameValue>();
}
if (currentTagName != null)
{
- if (noteTag(currentTagName,currentAttrMap))
+ if (noteTag(currentTagName,currentAttrList))
return true;
}
currentState = TAGPARSESTATE_NORMAL;
currentTagName = null;
- currentAttrMap = null;
+ currentAttrList = null;
}
else
- currentTagNameBuffer.append(thisCharLower);
+ currentTagNameBuffer.append(thisChar);
break;
case TAGPARSESTATE_IN_ATTR_NAME:
if (isWhitespace(thisChar))
@@ -223,10 +285,10 @@ public class TagParseState extends Singl
}
if (currentAttrName != null)
{
- currentAttrMap.put(currentAttrName,"");
+ currentAttrList.add(new AttrNameValue(currentAttrName,""));
currentAttrName = null;
}
- if (noteTag(currentTagName,currentAttrMap))
+ if (noteTag(currentTagName,currentAttrList))
return true;
currentState = TAGPARSESTATE_IN_TAG_SAW_SLASH;
}
@@ -239,17 +301,17 @@ public class TagParseState extends Singl
}
if (currentAttrName != null)
{
- currentAttrMap.put(currentAttrName,"");
+ currentAttrList.add(new AttrNameValue(currentAttrName,""));
currentAttrName = null;
}
currentState = TAGPARSESTATE_NORMAL;
- if (noteTag(currentTagName,currentAttrMap))
+ if (noteTag(currentTagName,currentAttrList))
return true;
currentTagName = null;
- currentAttrMap = null;
+ currentAttrList = null;
}
else
- currentAttrNameBuffer.append(thisCharLower);
+ currentAttrNameBuffer.append(thisChar);
break;
case TAGPARSESTATE_IN_ATTR_LOOKING_FOR_VALUE:
if (thisChar == '=')
@@ -260,25 +322,25 @@ public class TagParseState extends Singl
else if (thisChar == '>')
{
currentState = TAGPARSESTATE_NORMAL;
- if (noteTag(currentTagName,currentAttrMap))
+ if (noteTag(currentTagName,currentAttrList))
return true;
currentTagName = null;
- currentAttrMap = null;
+ currentAttrList = null;
}
else if (thisChar == '/')
{
currentState = TAGPARSESTATE_IN_TAG_SAW_SLASH;
- currentAttrMap.put(currentAttrName,"");
+ currentAttrList.add(new AttrNameValue(currentAttrName,""));
currentAttrName = null;
- if (noteTag(currentTagName,currentAttrMap))
+ if (noteTag(currentTagName,currentAttrList))
return true;
}
else if (!isWhitespace(thisChar))
{
- currentAttrMap.put(currentAttrName,"");
+ currentAttrList.add(new AttrNameValue(currentAttrName,""));
currentState = TAGPARSESTATE_IN_ATTR_NAME;
currentAttrNameBuffer = new StringBuilder();
- currentAttrNameBuffer.append(thisCharLower);
+ currentAttrNameBuffer.append(thisChar);
currentAttrName = null;
}
break;
@@ -293,6 +355,16 @@ public class TagParseState extends Singl
currentValueBuffer.append(thisChar);
}
break;
+ case TAGPARSESTATE_IN_QTAG_SAW_QUESTION:
+ if (thisChar == '>')
+ {
+ if (noteQTag(currentTagName,currentAttrList))
+ return true;
+ currentState = TAGPARSESTATE_NORMAL;
+ currentTagName = null;
+ currentAttrList = null;
+ }
+ break;
case TAGPARSESTATE_IN_TAG_SAW_SLASH:
if (thisChar == '>')
{
@@ -300,7 +372,7 @@ public class TagParseState extends Singl
return true;
currentState = TAGPARSESTATE_NORMAL;
currentTagName = null;
- currentAttrMap = null;
+ currentAttrList = null;
}
break;
case TAGPARSESTATE_IN_END_TAG_NAME:
@@ -329,12 +401,12 @@ public class TagParseState extends Singl
currentState = TAGPARSESTATE_NORMAL;
}
else if (currentTagNameBuffer != null)
- currentTagNameBuffer.append(thisCharLower);
+ currentTagNameBuffer.append(thisChar);
break;
case TAGPARSESTATE_IN_SINGLE_QUOTES_ATTR_VALUE:
if (thisChar == '\'' || thisChar == '\n' || thisChar == '\r')
{
- currentAttrMap.put(currentAttrName,attributeDecode(currentValueBuffer.toString()));
+ currentAttrList.add(new AttrNameValue(currentAttrName,attributeDecode(currentValueBuffer.toString())));
currentAttrName = null;
currentValueBuffer = null;
currentState = TAGPARSESTATE_IN_ATTR_NAME;
@@ -346,7 +418,7 @@ public class TagParseState extends Singl
case TAGPARSESTATE_IN_DOUBLE_QUOTES_ATTR_VALUE:
if (thisChar == '"' || thisChar == '\n' || thisChar == '\r')
{
- currentAttrMap.put(currentAttrName,attributeDecode(currentValueBuffer.toString()));
+ currentAttrList.add(new AttrNameValue(currentAttrName,attributeDecode(currentValueBuffer.toString())));
currentAttrName = null;
currentValueBuffer = null;
currentState = TAGPARSESTATE_IN_ATTR_NAME;
@@ -358,7 +430,7 @@ public class TagParseState extends Singl
case TAGPARSESTATE_IN_UNQUOTED_ATTR_VALUE:
if (isWhitespace(thisChar))
{
- currentAttrMap.put(currentAttrName,attributeDecode(currentValueBuffer.toString()));
+ currentAttrList.add(new AttrNameValue(currentAttrName,attributeDecode(currentValueBuffer.toString())));
currentAttrName = null;
currentValueBuffer = null;
currentState = TAGPARSESTATE_IN_ATTR_NAME;
@@ -366,21 +438,21 @@ public class TagParseState extends Singl
}
else if (thisChar == '/')
{
- currentAttrMap.put(currentAttrName,attributeDecode(currentValueBuffer.toString()));
- if (noteTag(currentTagName,currentAttrMap))
+ currentAttrList.add(new AttrNameValue(currentAttrName,attributeDecode(currentValueBuffer.toString())));
+ if (noteTag(currentTagName,currentAttrList))
return true;
currentState = TAGPARSESTATE_IN_TAG_SAW_SLASH;
}
else if (thisChar == '>')
{
- currentAttrMap.put(currentAttrName,attributeDecode(currentValueBuffer.toString()));
+ currentAttrList.add(new AttrNameValue(currentAttrName,attributeDecode(currentValueBuffer.toString())));
currentAttrName = null;
currentValueBuffer = null;
currentState = TAGPARSESTATE_NORMAL;
- if (noteTag(currentTagName,currentAttrMap))
+ if (noteTag(currentTagName,currentAttrList))
return true;
currentTagName = null;
- currentAttrMap = null;
+ currentAttrList = null;
}
else
currentValueBuffer.append(thisChar);
@@ -394,7 +466,7 @@ public class TagParseState extends Singl
/** This method gets called for every tag. Override this method to intercept tag begins.
*@return true to halt further processing.
*/
- protected boolean noteTag(String tagName, Map<String,String> attributes)
+ protected boolean noteTag(String tagName, List<AttrNameValue> attributes)
throws ManifoldCFException
{
if (Logging.misc.isDebugEnabled())
@@ -417,7 +489,7 @@ public class TagParseState extends Singl
* Override it to intercept such constructs.
*@return true to halt further processing.
*/
- protected boolean noteQTag(String tagName, Map<String,String> attributes)
+ protected boolean noteQTag(String tagName, List<AttrNameValue> attributes)
throws ManifoldCFException
{
if (Logging.misc.isDebugEnabled())