You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2013/02/06 01:46:55 UTC
svn commit: r1442817 - in
/manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml:
./ BasicParseState.java
Author: kwright
Date: Wed Feb 6 00:46:55 2013
New Revision: 1442817
URL: http://svn.apache.org/viewvc?rev=1442817&view=rev
Log:
Add partially reworked basic parse state. Next step: put in the part that autodetects encodings, but defaults to a passed-in value if there is no encoding preamble.
Added:
manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/
manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/BasicParseState.java
- copied, changed from r1442812, manifoldcf/branches/CONNECTORS-633/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/BasicParseState.java
Copied: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/BasicParseState.java (from r1442812, manifoldcf/branches/CONNECTORS-633/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/BasicParseState.java)
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/BasicParseState.java?p2=manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/BasicParseState.java&p1=manifoldcf/branches/CONNECTORS-633/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/BasicParseState.java&r1=1442812&r2=1442817&rev=1442817&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-633/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/BasicParseState.java (original)
+++ manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/BasicParseState.java Wed Feb 6 00:46:55 2013
@@ -16,10 +16,10 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.manifoldcf.crawler.connectors.webcrawler;
+package org.apache.manifoldcf.core.fuzzyml;
import org.apache.manifoldcf.core.interfaces.*;
-import org.apache.manifoldcf.crawler.system.Logging;
+import org.apache.manifoldcf.core.system.Logging;
import java.util.*;
/** This class represents the basic, outermost parse state. */
@@ -51,9 +51,9 @@ public class BasicParseState
protected String currentTagName = null;
protected String currentAttrName = null;
- protected Map currentAttrMap = null;
+ protected Map<String,String> currentAttrMap = null;
- protected static final Map mapLookup = new HashMap();
+ protected static final Map<String,String> mapLookup = new HashMap<String,String>();
static
{
mapLookup.put("amp","&");
@@ -133,7 +133,7 @@ public class BasicParseState
// Done with the tag name!
currentTagName = currentTagNameBuffer.toString();
currentTagNameBuffer = null;
- currentAttrMap = new HashMap();
+ currentAttrMap = new HashMap<String,String>();
currentState = BASICPARSESTATE_IN_ATTR_NAME;
currentAttrNameBuffer = new StringBuilder();
}
@@ -144,7 +144,7 @@ public class BasicParseState
{
currentTagName = currentTagNameBuffer.toString();
currentTagNameBuffer = null;
- currentAttrMap = new HashMap();
+ currentAttrMap = new HashMap<String,String>();
currentState = BASICPARSESTATE_IN_TAG_SAW_SLASH;
noteTag(currentTagName,currentAttrMap);
}
@@ -160,7 +160,7 @@ public class BasicParseState
{
currentTagName = currentTagNameBuffer.toString();
currentTagNameBuffer = null;
- currentAttrMap = new HashMap();
+ currentAttrMap = new HashMap<String,String>();
}
if (currentTagName != null)
{
@@ -308,7 +308,7 @@ public class BasicParseState
case BASICPARSESTATE_IN_SINGLE_QUOTES_ATTR_VALUE:
if (thisChar == '\'' || thisChar == '\n' || thisChar == '\r')
{
- currentAttrMap.put(currentAttrName,htmlAttributeDecode(currentValueBuffer.toString()));
+ currentAttrMap.put(currentAttrName,attributeDecode(currentValueBuffer.toString()));
currentAttrName = null;
currentValueBuffer = null;
currentState = BASICPARSESTATE_IN_ATTR_NAME;
@@ -320,7 +320,7 @@ public class BasicParseState
case BASICPARSESTATE_IN_DOUBLE_QUOTES_ATTR_VALUE:
if (thisChar == '"' || thisChar == '\n' || thisChar == '\r')
{
- currentAttrMap.put(currentAttrName,htmlAttributeDecode(currentValueBuffer.toString()));
+ currentAttrMap.put(currentAttrName,attributeDecode(currentValueBuffer.toString()));
currentAttrName = null;
currentValueBuffer = null;
currentState = BASICPARSESTATE_IN_ATTR_NAME;
@@ -332,7 +332,7 @@ public class BasicParseState
case BASICPARSESTATE_IN_UNQUOTED_ATTR_VALUE:
if (isHTMLWhitespace(thisChar))
{
- currentAttrMap.put(currentAttrName,htmlAttributeDecode(currentValueBuffer.toString()));
+ currentAttrMap.put(currentAttrName,attributeDecode(currentValueBuffer.toString()));
currentAttrName = null;
currentValueBuffer = null;
currentState = BASICPARSESTATE_IN_ATTR_NAME;
@@ -340,13 +340,13 @@ public class BasicParseState
}
else if (thisChar == '/')
{
- currentAttrMap.put(currentAttrName,htmlAttributeDecode(currentValueBuffer.toString()));
+ currentAttrMap.put(currentAttrName,attributeDecode(currentValueBuffer.toString()));
noteTag(currentTagName,currentAttrMap);
currentState = BASICPARSESTATE_IN_TAG_SAW_SLASH;
}
else if (thisChar == '>')
{
- currentAttrMap.put(currentAttrName,htmlAttributeDecode(currentValueBuffer.toString()));
+ currentAttrMap.put(currentAttrName,attributeDecode(currentValueBuffer.toString()));
currentAttrName = null;
currentValueBuffer = null;
currentState = BASICPARSESTATE_NORMAL;
@@ -362,16 +362,16 @@ public class BasicParseState
}
}
- protected void noteTag(String tagName, Map attributes)
+ protected void noteTag(String tagName, Map<String,String> attributes)
throws ManifoldCFException
{
- Logging.connectors.debug(" Saw tag '"+tagName+"'");
+ Logging.misc.debug(" Saw tag '"+tagName+"'");
}
protected void noteEndTag(String tagName)
throws ManifoldCFException
{
- Logging.connectors.debug(" Saw end tag '"+tagName+"'");
+ Logging.misc.debug(" Saw end tag '"+tagName+"'");
}
protected void noteNormalCharacter(char thisChar)
@@ -385,14 +385,14 @@ public class BasicParseState
// Does nothing
}
- /** Decode html body text */
- protected static String htmlBodyDecode(String input)
+ /** Decode body text */
+ protected static String bodyDecode(String input)
{
- return htmlAttributeDecode(input);
+ return attributeDecode(input);
}
/** Decode an html attribute */
- protected static String htmlAttributeDecode(String input)
+ protected static String attributeDecode(String input)
{
StringBuilder output = new StringBuilder();
int i = 0;
@@ -438,7 +438,7 @@ public class BasicParseState
}
}
else
- return (String)mapLookup.get(input);
+ return mapLookup.get(input);
}
/** Is a character HTML whitespace? */