You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2013/02/06 12:37:54 UTC

svn commit: r1442918 - in /manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml: BOMParseState.java ByteReceiver.java CharacterReceiver.java EncodingDetector.java TagParseState.java

Author: kwright
Date: Wed Feb  6 11:37:54 2013
New Revision: 1442918

URL: http://svn.apache.org/viewvc?rev=1442918&view=rev
Log:
More revisions of structure, designed to make a generally useful fuzzy ml parser.

Added:
    manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/BOMParseState.java   (with props)
    manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/ByteReceiver.java   (with props)
    manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/CharacterReceiver.java   (with props)
    manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/EncodingDetector.java   (with props)
Modified:
    manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java

Added: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/BOMParseState.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/BOMParseState.java?rev=1442918&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/BOMParseState.java (added)
+++ manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/BOMParseState.java Wed Feb  6 11:37:54 2013
@@ -0,0 +1,80 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.core.fuzzyml;
+
+import org.apache.manifoldcf.core.interfaces.*;
+import java.util.*;
+
+/** This class represents the parse state of the BOM (byte order mark) parser.
+* The byte order mark parser is meant to look for a byte order mark at the start of a byte sequence,
+* and based on whether it finds it or not, and what it finds, select a preliminary character encoding.
+* The encoding is read back through getEncoding(), and further bytes are intended for the provided
+* ByteReceiver.  As written, dealWithByte() and finishUp() are placeholders that do no detection.
+*/
+public class BOMParseState extends EncodingDetector
+{
+  protected String encoding = null; // NOTE(review): used instead of the inherited currentEncoding - confirm intent
+  protected final ByteReceiver byteReceiver;
+  
+  /** Constructor.  Uses an 8-byte read buffer.  Pass in the receiver of all overflow bytes.
+  * If no receiver is passed in, the detector is meant to stop as soon as the
+  * BOM is either seen, or not seen.
+  */
+  public BOMParseState(ByteReceiver byteReceiver)
+  {
+    super(8);
+    this.byteReceiver = byteReceiver;
+  }
+  
+  /** Set initial encoding.  The value is stored in this class's own encoding field.
+  */
+  @Override
+  public void setEncoding(String encoding)
+  {
+    this.encoding = encoding;
+  }
+
+  /** Retrieve final encoding determination; null until setEncoding() has supplied a value.
+  */
+  @Override
+  public String getEncoding()
+  {
+    return encoding;
+  }
+  
+  /** Receive a byte.  Placeholder: the byte is ignored and true (stop processing) is always returned.
+  */
+  @Override
+  public boolean dealWithByte(byte b)
+    throws ManifoldCFException
+  {
+    // MHL
+    return true;
+  }
+  
+  /** Finish up all processing.  Placeholder: currently does nothing.
+  */
+  @Override
+  public void finishUp()
+    throws ManifoldCFException
+  {
+    // MHL
+  }
+
+}

Propchange: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/BOMParseState.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/BOMParseState.java
------------------------------------------------------------------------------
    svn:keywords = Id

Added: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/ByteReceiver.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/ByteReceiver.java?rev=1442918&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/ByteReceiver.java (added)
+++ manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/ByteReceiver.java Wed Feb  6 11:37:54 2013
@@ -0,0 +1,65 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.core.fuzzyml;
+
+import org.apache.manifoldcf.core.interfaces.*;
+import java.io.*;
+
+/** This abstract class represents a receiver for bytes.
+* Subclasses accept documents a byte at a time, fed from an InputStream one buffer-full per call.
+*/
+public abstract class ByteReceiver
+{
+  protected final byte[] byteBuffer;
+  
+  /** Constructor.  The chunkSize argument is the length, in bytes, of the internal read buffer. */
+  public ByteReceiver(int chunkSize)
+  {
+    byteBuffer = new byte[chunkSize];
+  }
+  
+  /** Receive a byte stream and process up to chunksize bytes, passing each to dealWithByte() in order.
+  *@return true if end reached, or if dealWithByte() asked to stop (the rest of the chunk is then dropped).
+  */
+  public boolean dealWithBytes(InputStream is)
+    throws IOException, ManifoldCFException
+  {
+    int amt = is.read(byteBuffer);
+    if (amt == -1)
+      return true;
+    for (int i = 0; i < amt; i++)
+    {
+      if (dealWithByte(byteBuffer[i]))
+        return true;
+    }
+    return false;
+  }
+  
+  /** Receive a single byte.
+  *@return true to stop further processing.
+  */
+  public abstract boolean dealWithByte(byte b)
+    throws ManifoldCFException;
+  
+  /** Finish up all processing.
+  */
+  public abstract void finishUp()
+    throws ManifoldCFException;
+
+}

Propchange: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/ByteReceiver.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/ByteReceiver.java
------------------------------------------------------------------------------
    svn:keywords = Id

Added: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/CharacterReceiver.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/CharacterReceiver.java?rev=1442918&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/CharacterReceiver.java (added)
+++ manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/CharacterReceiver.java Wed Feb  6 11:37:54 2013
@@ -0,0 +1,72 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.core.fuzzyml;
+
+import org.apache.manifoldcf.core.interfaces.*;
+import java.io.*;
+
+/** This abstract class represents a receiver for characters.
+* Subclasses accept documents a character at a time, fed from a Reader one buffer-full per call.
+*/
+public abstract class CharacterReceiver
+{
+  protected final char[] charBuffer;
+  
+  public CharacterReceiver()
+  {
+    this(4096); // default buffer length, in characters
+  }
+  
+  /** Constructor.  The chunkSize argument is the length, in characters, of the internal read buffer.
+  */
+  public CharacterReceiver(int chunkSize)
+  {
+    charBuffer = new char[chunkSize];
+  }
+  
+  /** Receive a set of characters; process one
+  * chunksize worth, passing each character to dealWithCharacter() in order.
+  *@return true if done: the reader is exhausted, or dealWithCharacter() returned true.
+  */
+  public boolean dealWithCharacters(Reader r)
+    throws IOException, ManifoldCFException
+  {
+    int amt = r.read(charBuffer);
+    if (amt == -1)
+      return true;
+    for (int i = 0; i < amt; i++)
+    {
+      if (dealWithCharacter(charBuffer[i]))
+        return true;
+    }
+    return false;
+  }
+  
+  /** Receive a character.
+  * @return true if done.
+  */
+  public abstract boolean dealWithCharacter(char c)
+    throws ManifoldCFException;
+  
+  /** Finish up all processing.
+  */
+  public abstract void finishUp()
+    throws ManifoldCFException;
+
+}

Propchange: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/CharacterReceiver.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/CharacterReceiver.java
------------------------------------------------------------------------------
    svn:keywords = Id

Added: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/EncodingDetector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/EncodingDetector.java?rev=1442918&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/EncodingDetector.java (added)
+++ manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/EncodingDetector.java Wed Feb  6 11:37:54 2013
@@ -0,0 +1,51 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.core.fuzzyml;
+
+import org.apache.manifoldcf.core.interfaces.*;
+
+/** This abstract class represents a receiver for bytes that also carries a character encoding.
+* Subclasses accept documents a byte at a time,
+* AFTER a starting encoding has been set; the current value is read back with getEncoding().
+*/
+public abstract class EncodingDetector extends ByteReceiver
+{
+  protected String currentEncoding = null;
+  
+  /** Constructor.  The chunkSize argument is passed through to ByteReceiver as the read buffer length. */
+  public EncodingDetector(int chunkSize)
+  {
+    super(chunkSize);
+  }
+
+  /** Accept a starting encoding value, replacing whatever value was held before.
+  */
+  public void setEncoding(String encoding)
+  {
+    currentEncoding = encoding;
+  }
+  
+  /** Read out the detected encoding, when finished; null if no encoding has been set.
+  */
+  public String getEncoding()
+  {
+    return currentEncoding;
+  }
+  
+}

Propchange: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/EncodingDetector.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/EncodingDetector.java
------------------------------------------------------------------------------
    svn:keywords = Id

Modified: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java?rev=1442918&r1=1442917&r2=1442918&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java (original)
+++ manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java Wed Feb  6 11:37:54 2013
@@ -22,8 +22,8 @@ import org.apache.manifoldcf.core.interf
 import org.apache.manifoldcf.core.system.Logging;
 import java.util.*;
 
-/** This class represents the basic, outermost parse state. */
-public class TagParseState
+/** This class represents the basic, outermost tag parsing state. */
+public class TagParseState extends CharacterReceiver
 {
   protected static final int TAGPARSESTATE_NORMAL = 0;
   protected static final int TAGPARSESTATE_SAWLEFTBRACKET = 1;
@@ -67,8 +67,10 @@ public class TagParseState
   {
   }
 
-  /** Deal with a character.  No exceptions are allowed, since those would represent syntax errors, and we don't want those to cause difficulty. */
-  public void dealWithCharacter(char thisChar)
+  /** Deal with a character.  No exceptions are allowed, since those would represent
+  * syntax errors, and we don't want those to cause difficulty. */
+  @Override
+  public boolean dealWithCharacter(char thisChar)
     throws ManifoldCFException
   {
     // At this level we want basic lexical analysis - that is, we deal with identifying tags and comments, that's it.
@@ -361,6 +363,7 @@ public class TagParseState
     default:
       throw new ManifoldCFException("Invalid state: "+Integer.toString(currentState));
     }
+    return false;
   }
 
   protected void noteTag(String tagName, Map<String,String> attributes)
@@ -380,6 +383,7 @@ public class TagParseState
   {
   }
   
+  @Override
   public void finishUp()
     throws ManifoldCFException
   {