You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2013/02/06 12:37:54 UTC
svn commit: r1442918 - in
/manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml:
BOMParseState.java ByteReceiver.java CharacterReceiver.java
EncodingDetector.java TagParseState.java
Author: kwright
Date: Wed Feb 6 11:37:54 2013
New Revision: 1442918
URL: http://svn.apache.org/viewvc?rev=1442918&view=rev
Log:
More revisions of structure, designed to make a generally useful fuzzy ml parser.
Added:
manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/BOMParseState.java (with props)
manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/ByteReceiver.java (with props)
manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/CharacterReceiver.java (with props)
manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/EncodingDetector.java (with props)
Modified:
manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java
Added: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/BOMParseState.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/BOMParseState.java?rev=1442918&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/BOMParseState.java (added)
+++ manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/BOMParseState.java Wed Feb 6 11:37:54 2013
@@ -0,0 +1,80 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.core.fuzzyml;
+
+import org.apache.manifoldcf.core.interfaces.*;
+import java.util.*;
+
+/** This class represents the parse state of the BOM (byte order mark) parser.
+* The byte order mark parser looks for a byte order mark at the start of a byte sequence,
+* and based on whether it finds it or not, and what it finds, selects a preliminary character encoding.
+* The encoding value is exposed through getEncoding(); further bytes are meant to go to the provided
+* ByteReceiver. NOTE(review): in this revision the detection logic itself is still a placeholder.
+*/
+public class BOMParseState extends EncodingDetector
+{
+ protected String encoding = null; // NOTE(review): duplicates the role of EncodingDetector.currentEncoding - confirm intent
+ protected final ByteReceiver byteReceiver;
+
+ /** Constructor. Passes a chunk size of 8 to the superclass.
+ *@param byteReceiver is the receiver of all overflow bytes; it is stored but not yet used by this class.
+ * Intended behavior: if no receiver is given, detection stops once the BOM is either seen, or not seen.
+ */
+ public BOMParseState(ByteReceiver byteReceiver)
+ {
+ super(8);
+ this.byteReceiver = byteReceiver;
+ }
+
+ /** Set initial encoding.
+ *@param encoding is the starting encoding value; it is kept in this class's own encoding field. */
+ @Override
+ public void setEncoding(String encoding)
+ {
+ this.encoding = encoding;
+ }
+
+ /** Retrieve final encoding determination.
+ *@return the current value of this class's encoding field (null until one is set). */
+ @Override
+ public String getEncoding()
+ {
+ return encoding;
+ }
+
+ /** Receive a byte. The body is a placeholder: the byte is ignored and no BOM detection happens yet.
+ *@return true always in this revision, which tells the caller to stop further processing. */
+ @Override
+ public boolean dealWithByte(byte b)
+ throws ManifoldCFException
+ {
+ // MHL -- NOTE(review): presumably "more here later"; no detection logic yet
+ return true;
+ }
+
+ /** Finish up all processing. Currently does nothing (placeholder body).
+ */
+ @Override
+ public void finishUp()
+ throws ManifoldCFException
+ {
+ // MHL -- NOTE(review): placeholder; nothing is flushed or forwarded yet
+ }
+
+}
Propchange: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/BOMParseState.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/BOMParseState.java
------------------------------------------------------------------------------
svn:keywords = Id
Added: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/ByteReceiver.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/ByteReceiver.java?rev=1442918&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/ByteReceiver.java (added)
+++ manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/ByteReceiver.java Wed Feb 6 11:37:54 2013
@@ -0,0 +1,65 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.core.fuzzyml;
+
+import org.apache.manifoldcf.core.interfaces.*;
+import java.io.*;
+
+/** This abstract class represents a receiver for bytes.
+* Subclasses accept documents a byte at a time through dealWithByte().
+*/
+public abstract class ByteReceiver
+{
+ protected final byte[] byteBuffer;
+
+ /** Constructor; chunkSize is the length of the internal read buffer, in bytes. */
+ public ByteReceiver(int chunkSize)
+ {
+ byteBuffer = new byte[chunkSize];
+ }
+
+ /** Read once from the stream (at most one buffer's worth) and pass each byte read to dealWithByte().
+ *@return true if end of stream was reached or dealWithByte() returned true; false otherwise.
+ */
+ public boolean dealWithBytes(InputStream is)
+ throws IOException, ManifoldCFException
+ {
+ int amt = is.read(byteBuffer);
+ if (amt == -1)
+ return true;
+ for (int i = 0; i < amt; i++)
+ {
+ if (dealWithByte(byteBuffer[i]))
+ return true; // stop requested; bytes after index i in this chunk are not processed
+ }
+ return false;
+ }
+
+ /** Receive a byte.
+ *@return true to stop further processing.
+ */
+ public abstract boolean dealWithByte(byte b)
+ throws ManifoldCFException;
+
+ /** Finish up all processing.
+ */
+ public abstract void finishUp()
+ throws ManifoldCFException;
+
+}
Propchange: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/ByteReceiver.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/ByteReceiver.java
------------------------------------------------------------------------------
svn:keywords = Id
Added: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/CharacterReceiver.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/CharacterReceiver.java?rev=1442918&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/CharacterReceiver.java (added)
+++ manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/CharacterReceiver.java Wed Feb 6 11:37:54 2013
@@ -0,0 +1,72 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.core.fuzzyml;
+
+import org.apache.manifoldcf.core.interfaces.*;
+import java.io.*;
+
+/** This abstract class represents a receiver for characters.
+* Subclasses accept documents a character at a time through dealWithCharacter().
+*/
+public abstract class CharacterReceiver
+{
+ protected final char[] charBuffer;
+
+ public CharacterReceiver()
+ {
+ this(4096); // default buffer length: 4096 characters
+ }
+
+ /** Constructor; chunkSize is the length of the internal read buffer, in characters.
+ */
+ public CharacterReceiver(int chunkSize)
+ {
+ charBuffer = new char[chunkSize];
+ }
+
+ /** Read once from the reader (at most one buffer's worth) and
+ * pass each character read to dealWithCharacter().
+ *@return true if end of input was reached or dealWithCharacter() returned true; false otherwise.
+ */
+ public boolean dealWithCharacters(Reader r)
+ throws IOException, ManifoldCFException
+ {
+ int amt = r.read(charBuffer);
+ if (amt == -1)
+ return true;
+ for (int i = 0; i < amt; i++)
+ {
+ if (dealWithCharacter(charBuffer[i]))
+ return true; // stop requested; characters after index i in this chunk are not processed
+ }
+ return false;
+ }
+
+ /** Receive a character.
+ * @return true if done.
+ */
+ public abstract boolean dealWithCharacter(char c)
+ throws ManifoldCFException;
+
+ /** Finish up all processing.
+ */
+ public abstract void finishUp()
+ throws ManifoldCFException;
+
+}
Propchange: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/CharacterReceiver.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/CharacterReceiver.java
------------------------------------------------------------------------------
svn:keywords = Id
Added: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/EncodingDetector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/EncodingDetector.java?rev=1442918&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/EncodingDetector.java (added)
+++ manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/EncodingDetector.java Wed Feb 6 11:37:54 2013
@@ -0,0 +1,51 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.core.fuzzyml;
+
+import org.apache.manifoldcf.core.interfaces.*;
+
+/** This abstract class represents a receiver for bytes that also carries an encoding value.
+* A starting encoding can be supplied through setEncoding() and the result read back through
+* getEncoding(). NOTE(review): nothing here enforces that an encoding is set before bytes arrive.
+*/
+public abstract class EncodingDetector extends ByteReceiver
+{
+ protected String currentEncoding = null;
+
+ /** Constructor; chunkSize is passed through to the ByteReceiver constructor. */
+ public EncodingDetector(int chunkSize)
+ {
+ super(chunkSize);
+ }
+
+ /** Accept a starting encoding value; it replaces whatever currentEncoding held before.
+ */
+ public void setEncoding(String encoding)
+ {
+ currentEncoding = encoding;
+ }
+
+ /** Read out the detected encoding, when finished; returns currentEncoding (null until set).
+ */
+ public String getEncoding()
+ {
+ return currentEncoding;
+ }
+
+}
Propchange: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/EncodingDetector.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/EncodingDetector.java
------------------------------------------------------------------------------
svn:keywords = Id
Modified: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java?rev=1442918&r1=1442917&r2=1442918&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java (original)
+++ manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/TagParseState.java Wed Feb 6 11:37:54 2013
@@ -22,8 +22,8 @@ import org.apache.manifoldcf.core.interf
import org.apache.manifoldcf.core.system.Logging;
import java.util.*;
-/** This class represents the basic, outermost parse state. */
-public class TagParseState
+/** This class represents the basic, outermost tag parsing state. */
+public class TagParseState extends CharacterReceiver
{
protected static final int TAGPARSESTATE_NORMAL = 0;
protected static final int TAGPARSESTATE_SAWLEFTBRACKET = 1;
@@ -67,8 +67,10 @@ public class TagParseState
{
}
- /** Deal with a character. No exceptions are allowed, since those would represent syntax errors, and we don't want those to cause difficulty. */
- public void dealWithCharacter(char thisChar)
+ /** Deal with a character. No exceptions are allowed, since those would represent
+ * syntax errors, and we don't want those to cause difficulty. */
+ @Override
+ public boolean dealWithCharacter(char thisChar)
throws ManifoldCFException
{
// At this level we want basic lexical analysis - that is, we deal with identifying tags and comments, that's it.
@@ -361,6 +363,7 @@ public class TagParseState
default:
throw new ManifoldCFException("Invalid state: "+Integer.toString(currentState));
}
+ return false;
}
protected void noteTag(String tagName, Map<String,String> attributes)
@@ -380,6 +383,7 @@ public class TagParseState
{
}
+ @Override
public void finishUp()
throws ManifoldCFException
{