You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2013/02/08 03:40:39 UTC

svn commit: r1443826 - /manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/HTMLParseState.java

Author: kwright
Date: Fri Feb  8 02:40:39 2013
New Revision: 1443826

URL: http://svn.apache.org/r1443826
Log:
Add HTMLParseState, which disables handling of all XML-ish things, and maps html tag names etc to lower case

Added:
    manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/HTMLParseState.java   (with props)

Added: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/HTMLParseState.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/HTMLParseState.java?rev=1443826&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/HTMLParseState.java (added)
+++ manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/HTMLParseState.java Fri Feb  8 02:40:39 2013
@@ -0,0 +1,163 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.core.fuzzyml;
+
+import org.apache.manifoldcf.core.interfaces.*;
+import org.apache.manifoldcf.core.system.Logging;
+
+import java.util.*;
+import java.io.*;
+
+/** Adapts the output of the basic tag parser (TagParseState) for typical HTML
+* usage.  Tag names and attribute names are lowercased using Locale.ROOT, and
+* attribute lists are passed on as name-to-value maps.  The qtag, btag, and
+* escape-block callbacks are declared final here and just call the superclass.
+*/
+public class HTMLParseState extends TagParseState
+{
+  
+  /** Constructor.  The body is empty.
+  */
+  public HTMLParseState()
+  {
+  }
+  
+  /** Called for every tag.  Final: lowercases the names and calls the Map version, which is the one to override.
+  *@return true to halt further processing.
+  */
+  @Override
+  protected final boolean noteTag(String tagName, List<AttrNameValue> attributes)
+    throws ManifoldCFException
+  {
+    Map<String,String> attrMap = new HashMap<String,String>(attributes.size());
+    for (AttrNameValue nv : attributes)
+    {
+      attrMap.put(nv.getName().toLowerCase(Locale.ROOT), nv.getValue());
+    }
+    return noteTag(tagName.toLowerCase(Locale.ROOT), attrMap);
+  }
+
+  /** Map version of noteTag; override to intercept tag begins.  Names arrive lowercased; a repeated attribute keeps its last value.
+  *@return true to halt further processing; this default only logs at debug level and returns false.
+  */
+  protected boolean noteTag(String tagName, Map<String,String> attributes)
+    throws ManifoldCFException
+  {
+    if (Logging.misc.isDebugEnabled())
+      Logging.misc.debug(" Saw tag '"+tagName+"'");
+    return false;
+  }
+
+  /** Called for every end tag.  Final: lowercases the tag name and calls noteTagEnd, which is the one to override.
+  *@return true to halt further processing.
+  */
+  @Override
+  protected final boolean noteEndTag(String tagName)
+    throws ManifoldCFException
+  {
+    return noteTagEnd(tagName.toLowerCase(Locale.ROOT));
+  }
+
+  /** Called with the lowercased name of every end tag; override to intercept tag ends.  This default logs at debug level and returns false.
+  */
+  protected boolean noteTagEnd(String tagName)
+    throws ManifoldCFException
+  {
+    if (Logging.misc.isDebugEnabled())
+      Logging.misc.debug(" Saw end tag '"+tagName+"'");
+    return false;
+  }
+  
+  /** This method is called for every <? ... ?> construct, or 'qtag'.
+  * This is not useful for HTML, so it is final here and just calls the superclass implementation.
+  *@return true to halt further processing.
+  */
+  @Override
+  protected final boolean noteQTag(String tagName, List<AttrNameValue> attributes)
+    throws ManifoldCFException
+  {
+    return super.noteQTag(tagName, attributes);
+  }
+
+  /** This method is called for every <! <token> ... > construct, or 'btag'.
+  * It is final here and just calls the superclass implementation, so subclasses cannot intercept these.
+  *@return true to halt further processing.
+  */
+  @Override
+  protected final boolean noteBTag(String tagName)
+    throws ManifoldCFException
+  {
+    return super.noteBTag(tagName);
+  }
+
+  /** This method is called for the end of every btag, or any time
+  * there's a naked '>' in the document.  It is final here and just calls the superclass implementation.
+  *@return true to halt further processing.
+  */
+  @Override
+  protected final boolean noteEndBTag()
+    throws ManifoldCFException
+  {
+    return super.noteEndBTag();
+  }
+
+  /** Called for the start of every cdata-like tag, e.g. <![ <token> [ ... ]]>.  Final here; just calls the superclass.
+  *@param token may be empty!!!
+  *@return true to halt further processing.
+  */
+  @Override
+  protected final boolean noteEscaped(String token)
+    throws ManifoldCFException
+  {
+    return super.noteEscaped(token);
+  }
+
+  /** Called for the end of every cdata-like tag.  Final here; just calls the superclass implementation.
+  *@return true to halt further processing.
+  */
+  @Override
+  protected final boolean noteEndEscaped()
+    throws ManifoldCFException
+  {
+    return super.noteEndEscaped();
+  }
+
+  /** This method gets called for every token inside a btag.  Final here; just calls the superclass implementation.
+  *@return true to halt further processing.
+  */
+  @Override
+  protected final boolean noteBTagToken(String token)
+    throws ManifoldCFException
+  {
+    return super.noteBTagToken(token);
+  }
+
+  /** This method gets called for every character that is found within an
+  * escape block, e.g. CDATA.
+  * It is final here and just calls the superclass implementation, so subclasses cannot intercept such characters.
+  *@return true to halt further processing.
+  */
+  @Override
+  protected final boolean noteEscapedCharacter(char thisChar)
+    throws ManifoldCFException
+  {
+    return super.noteEscapedCharacter(thisChar);
+  }
+
+}

Propchange: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/HTMLParseState.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/HTMLParseState.java
------------------------------------------------------------------------------
    svn:keywords = Id