You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2011/09/18 12:39:12 UTC

svn commit: r1172242 [2/9] - in /tika/trunk: tika-app/src/test/java/org/apache/tika/cli/ tika-core/src/main/java/org/apache/tika/detect/ tika-core/src/main/java/org/apache/tika/extractor/ tika-core/src/main/java/org/apache/tika/fork/ tika-core/src/main...

Propchange: tika/trunk/tika-core/src/main/resources/org/apache/tika/language/tika.language.properties
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/test/java/org/apache/tika/fork/ForkParserTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/test/java/org/apache/tika/fork/ForkTestParser.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/test/java/org/apache/tika/io/LookaheadInputStreamTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/language/LanguageProfilerBuilderTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/language/LanguageProfilerBuilderTest.java?rev=1172242&r1=1172241&r2=1172242&view=diff
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/language/LanguageProfilerBuilderTest.java (original)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/language/LanguageProfilerBuilderTest.java Sun Sep 18 10:39:08 2011
@@ -1,102 +1,102 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.language;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.net.URISyntaxException;
-
-import junit.framework.Assert;
-import junit.framework.TestCase;
-import org.apache.tika.exception.TikaException;
-
-public class LanguageProfilerBuilderTest extends TestCase {
-    /* Test members */
-    private LanguageProfilerBuilder ngramProfile = null;
-    private LanguageProfile langProfile = null;
-    private final String profileName = "../tika-core/src/test/resources/org/apache/tika/language/langbuilder/"
-            + LanguageProfilerBuilderTest.class.getName();
-    private final String corpusName = "langbuilder/welsh_corpus.txt";
-    private final String encoding = "UTF-8";
-    private final String FILE_EXTENSION = "ngp";
-    private final String LANGUAGE = "welsh";
-    private final int maxlen = 1000;
-
-    public void testCreateProfile() throws TikaException, IOException, URISyntaxException {
-        InputStream is =
-                LanguageProfilerBuilderTest.class.getResourceAsStream(corpusName);
-        try {
-            ngramProfile = LanguageProfilerBuilder.create(profileName, is , encoding);
-        } finally {
-            is.close();
-        }
-
-        File f = new File(profileName + "." + FILE_EXTENSION);
-        FileOutputStream fos = new FileOutputStream(f);
-        ngramProfile.save(fos);
-        fos.close();
-        Assert.assertEquals(maxlen, ngramProfile.getSorted().size());
-    }
-
-    public void testNGramProfile() throws IOException, TikaException, URISyntaxException {
-        createLanguageProfile();
-        LanguageIdentifier.addProfile(LANGUAGE, langProfile);
-        LanguageIdentifier identifier = new LanguageIdentifier(langProfile);
-        Assert.assertEquals(LANGUAGE, identifier.getLanguage());
-        Assert.assertTrue(identifier.isReasonablyCertain());
-    }
-
-    private void createLanguageProfile() throws IOException, TikaException, URISyntaxException {
-        // Sort of dependency injection
-        if (ngramProfile == null)
-            testCreateProfile();
-
-        langProfile = new LanguageProfile();
-
-        InputStream stream = new FileInputStream(new File(profileName + "."
-                + FILE_EXTENSION));
-        try {
-            BufferedReader reader = new BufferedReader(new InputStreamReader(
-                    stream, encoding));
-            String line = reader.readLine();
-            while (line != null) {
-                if (line.length() > 0 && !line.startsWith("#")) {// skips the
-                                                                 // ngp
-                                                                 // header/comment
-                    int space = line.indexOf(' ');
-                    langProfile.add(line.substring(0, space),
-                            Long.parseLong(line.substring(space + 1)));
-                }
-                line = reader.readLine();
-            }
-        } finally {
-            stream.close();
-        }
-    }
-
-    public void tearDown() throws Exception {
-        File profile = new File(profileName + "." + FILE_EXTENSION);
-        if (profile.exists())
-            profile.delete();
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.language;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.net.URISyntaxException;
+
+import junit.framework.Assert;
+import junit.framework.TestCase;
+import org.apache.tika.exception.TikaException;
+
+public class LanguageProfilerBuilderTest extends TestCase {
+    /* Test members */
+    private LanguageProfilerBuilder ngramProfile = null;
+    private LanguageProfile langProfile = null;
+    private final String profileName = "../tika-core/src/test/resources/org/apache/tika/language/langbuilder/"
+            + LanguageProfilerBuilderTest.class.getName();
+    private final String corpusName = "langbuilder/welsh_corpus.txt";
+    private final String encoding = "UTF-8";
+    private final String FILE_EXTENSION = "ngp";
+    private final String LANGUAGE = "welsh";
+    private final int maxlen = 1000;
+
+    public void testCreateProfile() throws TikaException, IOException, URISyntaxException {
+        InputStream is =
+                LanguageProfilerBuilderTest.class.getResourceAsStream(corpusName);
+        try {
+            ngramProfile = LanguageProfilerBuilder.create(profileName, is , encoding);
+        } finally {
+            is.close();
+        }
+
+        File f = new File(profileName + "." + FILE_EXTENSION);
+        FileOutputStream fos = new FileOutputStream(f);
+        ngramProfile.save(fos);
+        fos.close();
+        Assert.assertEquals(maxlen, ngramProfile.getSorted().size());
+    }
+
+    public void testNGramProfile() throws IOException, TikaException, URISyntaxException {
+        createLanguageProfile();
+        LanguageIdentifier.addProfile(LANGUAGE, langProfile);
+        LanguageIdentifier identifier = new LanguageIdentifier(langProfile);
+        Assert.assertEquals(LANGUAGE, identifier.getLanguage());
+        Assert.assertTrue(identifier.isReasonablyCertain());
+    }
+
+    private void createLanguageProfile() throws IOException, TikaException, URISyntaxException {
+        // Sort of dependency injection
+        if (ngramProfile == null)
+            testCreateProfile();
+
+        langProfile = new LanguageProfile();
+
+        InputStream stream = new FileInputStream(new File(profileName + "."
+                + FILE_EXTENSION));
+        try {
+            BufferedReader reader = new BufferedReader(new InputStreamReader(
+                    stream, encoding));
+            String line = reader.readLine();
+            while (line != null) {
+                if (line.length() > 0 && !line.startsWith("#")) {// skips the
+                                                                 // ngp
+                                                                 // header/comment
+                    int space = line.indexOf(' ');
+                    langProfile.add(line.substring(0, space),
+                            Long.parseLong(line.substring(space + 1)));
+                }
+                line = reader.readLine();
+            }
+        } finally {
+            stream.close();
+        }
+    }
+
+    public void tearDown() throws Exception {
+        File profile = new File(profileName + "." + FILE_EXTENSION);
+        if (profile.exists())
+            profile.delete();
+    }
+}

Propchange: tika/trunk/tika-core/src/test/java/org/apache/tika/language/LanguageProfilerBuilderTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/test/java/org/apache/tika/metadata/TestMetadataHelper.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/test/java/org/apache/tika/mime/PatternsTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/test/java/org/apache/tika/parser/CompositeParserTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/test/java/org/apache/tika/parser/DummyParser.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/CHM2XHTML.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/CHM2XHTML.java?rev=1172242&r1=1172241&r2=1172242&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/CHM2XHTML.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/CHM2XHTML.java Sun Sep 18 10:39:08 2011
@@ -1,57 +1,57 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.chm;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.sax.TextContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Utility class
- * 
- * 
- */
-public class CHM2XHTML {
-
-    public static void process(CHMDocumentInformation chmDoc,
-            ContentHandler handler) throws TikaException {
-        String text = chmDoc.getText();
-        try {
-            if (text.length() > 0) {
-                handler.characters(text.toCharArray(), 0, text.length());
-                new CHM2XHTML(chmDoc, handler);
-            } else
-                throw new TikaException("Could not extract content");
-
-        } catch (SAXException e) {
-            throw new RuntimeException(e);
-        }
-    }
-
-    protected String getText(CHMDocumentInformation chmDoc)
-            throws TikaException {
-        return chmDoc.getText();
-    }
-
-    protected TextContentHandler handler;
-
-    public CHM2XHTML(CHMDocumentInformation chmDoc, ContentHandler handler) {
-        this.handler = new TextContentHandler(handler);
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.chm;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.sax.TextContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Utility class
+ * 
+ * 
+ */
+public class CHM2XHTML {
+
+    public static void process(CHMDocumentInformation chmDoc,
+            ContentHandler handler) throws TikaException {
+        String text = chmDoc.getText();
+        try {
+            if (text.length() > 0) {
+                handler.characters(text.toCharArray(), 0, text.length());
+                new CHM2XHTML(chmDoc, handler);
+            } else
+                throw new TikaException("Could not extract content");
+
+        } catch (SAXException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    protected String getText(CHMDocumentInformation chmDoc)
+            throws TikaException {
+        return chmDoc.getText();
+    }
+
+    protected TextContentHandler handler;
+
+    public CHM2XHTML(CHMDocumentInformation chmDoc, ContentHandler handler) {
+        this.handler = new TextContentHandler(handler);
+    }
+}

Propchange: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/CHM2XHTML.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/CHMDocumentInformation.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/CHMDocumentInformation.java?rev=1172242&r1=1172241&r2=1172242&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/CHMDocumentInformation.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/CHMDocumentInformation.java Sun Sep 18 10:39:08 2011
@@ -1,186 +1,186 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.chm;
-
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Iterator;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
-import org.apache.tika.parser.chm.core.ChmExtractor;
-import org.apache.tika.parser.html.HtmlParser;
-import org.apache.tika.sax.BodyContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Extracts text and metadata from chm file
- * 
- */
-public class CHMDocumentInformation {
-    /* Class members */
-    private ChmExtractor chmExtractor = null;
-
-    /**
-     * Loads chm file as input stream and returns a new instance of chm doc info
-     * 
-     * @param is
-     *            InputStream
-     * 
-     * @return chm document information
-     * @throws TikaException 
-     * @throws IOException 
-     */
-    public static CHMDocumentInformation load(InputStream is) throws TikaException, IOException {
-        CHMDocumentInformation document = new CHMDocumentInformation();
-        document.setChmExtractor(new ChmExtractor(is));
-        return document;
-    }
-
-    /**
-     * Appends extracted data from chm listing entries
-     * 
-     * @return extracted content of chm
-     */
-    private String getContent() {
-        StringBuilder sb = new StringBuilder();
-        DirectoryListingEntry entry;
-        
-        for (Iterator<DirectoryListingEntry> it = getChmExtractor()
-                .getChmDirList().getDirectoryListingEntryList().iterator(); it.hasNext();) 
-        {
-            try {
-                entry = it.next();
-                if (isRightEntry(entry)) {
-                    byte[][] tmp = getChmExtractor().extractChmEntry(entry);
-                    if (tmp != null) {
-                        sb.append(extract(tmp));
-                    }
-                }
-            } catch (TikaException e) {
-                //ignore
-            } // catch (IOException e) {//Pushback exception from tagsoup
-            // System.err.println(e.getMessage());
-        }
-        return sb.toString();
-    }
-
-    /**
-     * Checks if an entry is a html or not.
-     * 
-     * @param entry
-     *            chm directory listing entry
-     * 
-     * @return boolean
-     */
-    private boolean isRightEntry(DirectoryListingEntry entry) {
-        return (entry.getName().endsWith(".html") || entry.getName().endsWith(".htm"));
-    }
-
-    /**
-     * Returns chm extractor
-     * 
-     * @return chmExtractor
-     */
-    private ChmExtractor getChmExtractor() {
-        return chmExtractor;
-    }
-
-    /**
-     * Sets a chm extractor
-     * 
-     * @param chmExtractor
-     */
-    private void setChmExtractor(ChmExtractor chmExtractor) {
-        this.chmExtractor = chmExtractor;
-    }
-
-    /**
-     * Returns chm metadata
-     * 
-     * @param metadata
-     * 
-     * @throws TikaException
-     * @throws IOException
-     */
-    public void getCHMDocInformation(Metadata metadata) throws TikaException,
-            IOException {
-        if (getChmExtractor() != null) {
-            /* Checking if file is a chm, done during creating chmItsf header */
-            metadata.add(Metadata.CONTENT_TYPE, "application/x-chm");
-        } else {
-            metadata.add(Metadata.CONTENT_TYPE, "unknown");
-        }
-    }
-
-    /**
-     * Returns extracted text from chm file
-     * 
-     * @return text
-     * 
-     * @throws TikaException
-     */
-    public String getText() throws TikaException {
-        return getContent();
-    }
-
-    /**
-     * Extracts data from byte[][]
-     * 
-     * @param byteObject
-     * @return
-     * @throws IOException
-     * @throws SAXException
-     */
-    private String extract(byte[][] byteObject) {// throws IOException
-        StringBuilder wBuf = new StringBuilder();
-        InputStream stream = null;
-        Metadata metadata = new Metadata();
-        HtmlParser htmlParser = new HtmlParser();
-        BodyContentHandler handler = new BodyContentHandler(-1);// -1
-        ParseContext parser = new ParseContext();
-        try {
-            for (int i = 0; i < byteObject.length; i++) {
-                stream = new ByteArrayInputStream(byteObject[i]);
-                try {
-                    htmlParser.parse(stream, handler, metadata, parser);
-                } catch (TikaException e) {
-                    wBuf.append(new String(byteObject[i]));
-//                    System.err.println("\n"
-//                            + CHMDocumentInformation.class.getName()
-//                            + " extract " + e.getMessage());
-                } finally {
-                    wBuf.append(handler.toString()
-                            + System.getProperty("line.separator"));
-                    stream.close();
-                }
-            }
-        } catch (SAXException e) {
-            throw new RuntimeException(e);
-        } catch (IOException e) {// 
-        // Pushback overflow from tagsoup
-        }
-        return wBuf.toString();
-    }
-
-    public static void main(String[] args) {
-
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.chm;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Iterator;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
+import org.apache.tika.parser.chm.core.ChmExtractor;
+import org.apache.tika.parser.html.HtmlParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Extracts text and metadata from chm file
+ * 
+ */
+public class CHMDocumentInformation {
+    /* Class members */
+    private ChmExtractor chmExtractor = null;
+
+    /**
+     * Loads chm file as input stream and returns a new instance of chm doc info
+     * 
+     * @param is
+     *            InputStream
+     * 
+     * @return chm document information
+     * @throws TikaException 
+     * @throws IOException 
+     */
+    public static CHMDocumentInformation load(InputStream is) throws TikaException, IOException {
+        CHMDocumentInformation document = new CHMDocumentInformation();
+        document.setChmExtractor(new ChmExtractor(is));
+        return document;
+    }
+
+    /**
+     * Appends extracted data from chm listing entries
+     * 
+     * @return extracted content of chm
+     */
+    private String getContent() {
+        StringBuilder sb = new StringBuilder();
+        DirectoryListingEntry entry;
+        
+        for (Iterator<DirectoryListingEntry> it = getChmExtractor()
+                .getChmDirList().getDirectoryListingEntryList().iterator(); it.hasNext();) 
+        {
+            try {
+                entry = it.next();
+                if (isRightEntry(entry)) {
+                    byte[][] tmp = getChmExtractor().extractChmEntry(entry);
+                    if (tmp != null) {
+                        sb.append(extract(tmp));
+                    }
+                }
+            } catch (TikaException e) {
+                //ignore
+            } // catch (IOException e) {//Pushback exception from tagsoup
+            // System.err.println(e.getMessage());
+        }
+        return sb.toString();
+    }
+
+    /**
+     * Checks if an entry is a html or not.
+     * 
+     * @param entry
+     *            chm directory listing entry
+     * 
+     * @return boolean
+     */
+    private boolean isRightEntry(DirectoryListingEntry entry) {
+        return (entry.getName().endsWith(".html") || entry.getName().endsWith(".htm"));
+    }
+
+    /**
+     * Returns chm extractor
+     * 
+     * @return chmExtractor
+     */
+    private ChmExtractor getChmExtractor() {
+        return chmExtractor;
+    }
+
+    /**
+     * Sets a chm extractor
+     * 
+     * @param chmExtractor
+     */
+    private void setChmExtractor(ChmExtractor chmExtractor) {
+        this.chmExtractor = chmExtractor;
+    }
+
+    /**
+     * Returns chm metadata
+     * 
+     * @param metadata
+     * 
+     * @throws TikaException
+     * @throws IOException
+     */
+    public void getCHMDocInformation(Metadata metadata) throws TikaException,
+            IOException {
+        if (getChmExtractor() != null) {
+            /* Checking if file is a chm, done during creating chmItsf header */
+            metadata.add(Metadata.CONTENT_TYPE, "application/x-chm");
+        } else {
+            metadata.add(Metadata.CONTENT_TYPE, "unknown");
+        }
+    }
+
+    /**
+     * Returns extracted text from chm file
+     * 
+     * @return text
+     * 
+     * @throws TikaException
+     */
+    public String getText() throws TikaException {
+        return getContent();
+    }
+
+    /**
+     * Extracts data from byte[][]
+     * 
+     * @param byteObject
+     * @return
+     * @throws IOException
+     * @throws SAXException
+     */
+    private String extract(byte[][] byteObject) {// throws IOException
+        StringBuilder wBuf = new StringBuilder();
+        InputStream stream = null;
+        Metadata metadata = new Metadata();
+        HtmlParser htmlParser = new HtmlParser();
+        BodyContentHandler handler = new BodyContentHandler(-1);// -1
+        ParseContext parser = new ParseContext();
+        try {
+            for (int i = 0; i < byteObject.length; i++) {
+                stream = new ByteArrayInputStream(byteObject[i]);
+                try {
+                    htmlParser.parse(stream, handler, metadata, parser);
+                } catch (TikaException e) {
+                    wBuf.append(new String(byteObject[i]));
+//                    System.err.println("\n"
+//                            + CHMDocumentInformation.class.getName()
+//                            + " extract " + e.getMessage());
+                } finally {
+                    wBuf.append(handler.toString()
+                            + System.getProperty("line.separator"));
+                    stream.close();
+                }
+            }
+        } catch (SAXException e) {
+            throw new RuntimeException(e);
+        } catch (IOException e) {// 
+        // Pushback overflow from tagsoup
+        }
+        return wBuf.toString();
+    }
+
+    public static void main(String[] args) {
+
+    }
+}

Propchange: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/CHMDocumentInformation.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/ChmParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/ChmParser.java?rev=1172242&r1=1172241&r2=1172242&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/ChmParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/ChmParser.java Sun Sep 18 10:39:08 2011
@@ -1,56 +1,56 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Collections;
-import java.util.Set;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-public class ChmParser extends AbstractParser {
-
-    private static final long serialVersionUID = 5938777307516469802L;
-    private static final Set<MediaType> SUPPORTED_TYPES = Collections
-            .singleton(MediaType.application("chm"));
-
-    public Set<MediaType> getSupportedTypes(ParseContext context) {
-        return SUPPORTED_TYPES;
-    }
-
-
-    public void parse(InputStream stream, ContentHandler handler,
-            Metadata metadata, ParseContext context) throws IOException,
-            SAXException, TikaException {
-        CHMDocumentInformation chmInfo = CHMDocumentInformation.load(stream);
-        metadata.set(Metadata.CONTENT_TYPE, "chm");
-        extractMetadata(chmInfo, metadata);
-        CHM2XHTML.process(chmInfo, handler);
-    }
-
-    private void extractMetadata(CHMDocumentInformation chmInfo,
-            Metadata metadata) throws TikaException, IOException {
-        chmInfo.getCHMDocInformation(metadata);
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class ChmParser extends AbstractParser {
+
+    private static final long serialVersionUID = 5938777307516469802L;
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections
+            .singleton(MediaType.application("chm"));
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context) throws IOException,
+            SAXException, TikaException {
+        CHMDocumentInformation chmInfo = CHMDocumentInformation.load(stream);
+        metadata.set(Metadata.CONTENT_TYPE, "chm");
+        extractMetadata(chmInfo, metadata);
+        CHM2XHTML.process(chmInfo, handler);
+    }
+
+    private void extractMetadata(CHMDocumentInformation chmInfo,
+            Metadata metadata) throws TikaException, IOException {
+        chmInfo.getCHMDocInformation(metadata);
+    }
+}

Propchange: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/ChmParser.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java?rev=1172242&r1=1172241&r2=1172242&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java Sun Sep 18 10:39:08 2011
@@ -1,39 +1,39 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm.accessor;
-
-import java.io.Serializable;
-
-import org.apache.tika.exception.TikaException;
-
-/**
- * 
- * Defines an accessor interface
- * 
- * @param <T>
- */
-public interface ChmAccessor<T> extends Serializable {
-    /**
-     * Parses chm accessor
-     * 
-     * @param data
-     *            chm file
-     * @param chmAccessor
-     * @throws TikaException 
-     */
-    void parse(byte[] data, T chmAccessor) throws TikaException;
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.accessor;
+
+import java.io.Serializable;
+
+import org.apache.tika.exception.TikaException;
+
+/**
+ * 
+ * Defines an accessor interface
+ * 
+ * @param <T>
+ */
+public interface ChmAccessor<T> extends Serializable {
+    /**
+     * Parses chm accessor
+     * 
+     * @param data
+     *            chm file
+     * @param chmAccessor
+     * @throws TikaException 
+     */
+    void parse(byte[] data, T chmAccessor) throws TikaException;
+}

Propchange: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java?rev=1172242&r1=1172241&r2=1172242&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java Sun Sep 18 10:39:08 2011
@@ -1,397 +1,397 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm.accessor;
-
-import java.math.BigInteger;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.chm.core.ChmCommons;
-import org.apache.tika.parser.chm.core.ChmConstants;
-
-/**
- * Holds chm listing entries
- */
-public class ChmDirectoryListingSet {
-    private List<DirectoryListingEntry> dlel;
-    private byte[] data;
-    private int placeHolder = -1;
-    private long dataOffset = -1;
-    private int controlDataIndex = -1;
-    private int resetTableIndex = -1;
-
-    private boolean isNotControlDataFound = true;
-    private boolean isNotResetTableFound = true;
-
-    /**
-     * Constructs chm directory listing set
-     * 
-     * @param data
-     *            byte[]
-     * @param chmItsHeader
-     * @param chmItspHeader
-     * @throws TikaException 
-     */
-    public ChmDirectoryListingSet(byte[] data, ChmItsfHeader chmItsHeader,
-            ChmItspHeader chmItspHeader) throws TikaException {
-        setDirectoryListingEntryList(new ArrayList<DirectoryListingEntry>());
-        ChmCommons.assertByteArrayNotNull(data);
-        setData(data);
-        enumerateChmDirectoryListingList(chmItsHeader, chmItspHeader);
-    }
-
-    public String toString() {
-        StringBuilder sb = new StringBuilder();
-        sb.append("list:=" + getDirectoryListingEntryList().toString()
-                + System.getProperty("line.separator"));
-        sb.append("number of list items:="
-                + getDirectoryListingEntryList().size());
-        return sb.toString();
-    }
-
-    /**
-     * Returns control data index that located in List
-     * 
-     * @return control data index
-     */
-    public int getControlDataIndex() {
-        return controlDataIndex;
-    }
-
-    /**
-     * Sets control data index
-     * 
-     * @param controlDataIndex
-     */
-    protected void setControlDataIndex(int controlDataIndex) {
-        this.controlDataIndex = controlDataIndex;
-    }
-
-    /**
-     * Return index of reset table
-     * 
-     * @return reset table index
-     */
-    public int getResetTableIndex() {
-        return resetTableIndex;
-    }
-
-    /**
-     * Sets reset table index
-     * 
-     * @param resetTableIndex
-     */
-    protected void setResetTableIndex(int resetTableIndex) {
-        this.resetTableIndex = resetTableIndex;
-    }
-
-    /**
-     * Gets place holder
-     * 
-     * @return place holder
-     */
-    private int getPlaceHolder() {
-        return placeHolder;
-    }
-
-    /**
-     * Sets place holder
-     * 
-     * @param placeHolder
-     */
-    private void setPlaceHolder(int placeHolder) {
-        this.placeHolder = placeHolder;
-    }
-
-    /**
-     * Enumerates chm directory listing entries
-     * 
-     * @param chmItsHeader
-     *            chm itsf header
-     * @param chmItspHeader
-     *            chm itsp header
-     */
-    private void enumerateChmDirectoryListingList(ChmItsfHeader chmItsHeader,
-            ChmItspHeader chmItspHeader) {
-        try {
-            int startPmgl = chmItspHeader.getIndex_head();
-            int stopPmgl = chmItspHeader.getUnknown_0024();
-            int dir_offset = (int) (chmItsHeader.getDirOffset() + chmItspHeader
-                    .getHeader_len());
-            setDataOffset(chmItsHeader.getDataOffset());
-
-            /* loops over all pmgls */
-            int previous_index = 0;
-            byte[] dir_chunk = null;
-            for (int i = startPmgl; i <= stopPmgl; i++) {
-                int data_copied = ((1 + i) * (int) chmItspHeader.getBlock_len())
-                        + dir_offset;
-                if (i == 0) {
-                    dir_chunk = new byte[(int) chmItspHeader.getBlock_len()];
-                    // dir_chunk = Arrays.copyOfRange(getData(), dir_offset,
-                    // (((1+i) * (int)chmItspHeader.getBlock_len()) +
-                    // dir_offset));
-                    dir_chunk = ChmCommons
-                            .copyOfRange(getData(), dir_offset,
-                                    (((1 + i) * (int) chmItspHeader
-                                            .getBlock_len()) + dir_offset));
-                    previous_index = data_copied;
-                } else {
-                    dir_chunk = new byte[(int) chmItspHeader.getBlock_len()];
-                    // dir_chunk = Arrays.copyOfRange(getData(), previous_index,
-                    // (((1+i) * (int)chmItspHeader.getBlock_len()) +
-                    // dir_offset));
-                    dir_chunk = ChmCommons
-                            .copyOfRange(getData(), previous_index,
-                                    (((1 + i) * (int) chmItspHeader
-                                            .getBlock_len()) + dir_offset));
-                    previous_index = data_copied;
-                }
-                enumerateOneSegment(dir_chunk);
-                dir_chunk = null;
-            }
-        } catch (Exception e) {
-            e.printStackTrace();
-        } finally {
-            setData(null);
-        }
-    }
-
-    /**
-     * Checks control data
-     * 
-     * @param dle
-     *            chm directory listing entry
-     */
-    private void checkControlData(DirectoryListingEntry dle) {
-        if (isNotControlDataFound) {
-            if (dle.getName().contains(ChmConstants.CONTROL_DATA)) {
-                setControlDataIndex(getDirectoryListingEntryList().size());
-                isNotControlDataFound = false;
-            }
-        }
-    }
-
-    /**
-     * Checks reset table
-     * 
-     * @param dle
-     *            chm directory listing entry
-     */
-    private void checkResetTable(DirectoryListingEntry dle) {
-        if (isNotResetTableFound) {
-            if (dle.getName().contains(ChmConstants.RESET_TABLE)) {
-                setResetTableIndex(getDirectoryListingEntryList().size());
-                isNotResetTableFound = false;
-            }
-        }
-    }
-
-    /**
-     * Enumerates chm directory listing entries in single chm segment
-     * 
-     * @param dir_chunk
-     */
-    private void enumerateOneSegment(byte[] dir_chunk) {
-        try {
-            if (dir_chunk != null) {
-
-                int indexWorkData = ChmCommons.indexOf(dir_chunk,
-                        "::".getBytes());
-                int indexUserData = ChmCommons.indexOf(dir_chunk,
-                        "/".getBytes());
-
-                if (indexUserData < indexWorkData)
-                    setPlaceHolder(indexUserData);
-                else
-                    setPlaceHolder(indexWorkData);
-
-                if (getPlaceHolder() > 0
-                        && dir_chunk[getPlaceHolder() - 1] != 115) {// #{
-                    do {
-                        if (dir_chunk[getPlaceHolder() - 1] > 0) {
-                            DirectoryListingEntry dle = new DirectoryListingEntry();
-
-                            // two cases: 1. when dir_chunk[getPlaceHolder() -
-                            // 1] == 0x73
-                            // 2. when dir_chunk[getPlaceHolder() + 1] == 0x2f
-                            doNameCheck(dir_chunk, dle);
-
-                            // dle.setName(new
-                            // String(Arrays.copyOfRange(dir_chunk,
-                            // getPlaceHolder(), (getPlaceHolder() +
-                            // dle.getNameLength()))));
-                            dle.setName(new String(ChmCommons.copyOfRange(
-                                    dir_chunk, getPlaceHolder(),
-                                    (getPlaceHolder() + dle.getNameLength()))));
-                            checkControlData(dle);
-                            checkResetTable(dle);
-                            setPlaceHolder(getPlaceHolder()
-                                    + dle.getNameLength());
-
-                            /* Sets entry type */
-                            if (getPlaceHolder() < dir_chunk.length
-                                    && dir_chunk[getPlaceHolder()] == 0)
-                                dle.setEntryType(ChmCommons.EntryType.UNCOMPRESSED);
-                            else
-                                dle.setEntryType(ChmCommons.EntryType.COMPRESSED);
-
-                            setPlaceHolder(getPlaceHolder() + 1);
-                            dle.setOffset(getEncint(dir_chunk));
-                            dle.setLength(getEncint(dir_chunk));
-                            getDirectoryListingEntryList().add(dle);
-                        } else
-                            setPlaceHolder(getPlaceHolder() + 1);
-
-                    } while (hasNext(dir_chunk));
-                }
-            }
-
-        } catch (Exception e) {
-            e.printStackTrace();
-        }
-    }
-
-    /**
-     * Checks if a name and name length are correct. If not then handles it as
-     * follows: 1. when dir_chunk[getPlaceHolder() - 1] == 0x73 ('/') 2. when
-     * dir_chunk[getPlaceHolder() + 1] == 0x2f ('s')
-     * 
-     * @param dir_chunk
-     * @param dle
-     */
-    private void doNameCheck(byte[] dir_chunk, DirectoryListingEntry dle) {
-        if (dir_chunk[getPlaceHolder() - 1] == 0x73) {
-            dle.setNameLength(dir_chunk[getPlaceHolder() - 1] & 0x21);
-        } else if (dir_chunk[getPlaceHolder() + 1] == 0x2f) {
-            dle.setNameLength(dir_chunk[getPlaceHolder()]);
-            setPlaceHolder(getPlaceHolder() + 1);
-        } else {
-            dle.setNameLength(dir_chunk[getPlaceHolder() - 1]);
-        }
-    }
-
-    /**
-     * Checks if it's possible move further on byte[]
-     * 
-     * @param dir_chunk
-     * 
-     * @return boolean
-     */
-    private boolean hasNext(byte[] dir_chunk) {
-        while (getPlaceHolder() < dir_chunk.length) {
-            if (dir_chunk[getPlaceHolder()] == 47
-                    && dir_chunk[getPlaceHolder() + 1] != ':') {
-                setPlaceHolder(getPlaceHolder());
-                return true;
-            } else if (dir_chunk[getPlaceHolder()] == ':'
-                    && dir_chunk[getPlaceHolder() + 1] == ':') {
-                setPlaceHolder(getPlaceHolder());
-                return true;
-            } else
-                setPlaceHolder(getPlaceHolder() + 1);
-        }
-        return false;
-    }
-
-    /**
-     * Returns encrypted integer
-     * 
-     * @param data_chunk
-     * 
-     * @return
-     */
-    private int getEncint(byte[] data_chunk) {
-        byte ob;
-        BigInteger bi = BigInteger.ZERO;
-        byte[] nb = new byte[1];
-
-        if (getPlaceHolder() < data_chunk.length) {
-            while ((ob = data_chunk[getPlaceHolder()]) < 0) {
-                nb[0] = (byte) ((ob & 0x7f));
-                bi = bi.shiftLeft(7).add(new BigInteger(nb));
-                setPlaceHolder(getPlaceHolder() + 1);
-            }
-            nb[0] = (byte) ((ob & 0x7f));
-            bi = bi.shiftLeft(7).add(new BigInteger(nb));
-            setPlaceHolder(getPlaceHolder() + 1);
-        }
-        return bi.intValue();
-    }
-
-    /**
-     * @param args
-     */
-    public static void main(String[] args) {
-    }
-
-    /**
-     * Sets chm directory listing entry list
-     * 
-     * @param dlel
-     *            chm directory listing entry list
-     */
-    public void setDirectoryListingEntryList(List<DirectoryListingEntry> dlel) {
-        this.dlel = dlel;
-    }
-
-    /**
-     * Returns chm directory listing entry list
-     * 
-     * @return List<DirectoryListingEntry>
-     */
-    public List<DirectoryListingEntry> getDirectoryListingEntryList() {
-        return dlel;
-    }
-
-    /**
-     * Sets data
-     * 
-     * @param data
-     */
-    private void setData(byte[] data) {
-        this.data = data;
-    }
-
-    /**
-     * Returns data
-     * 
-     * @return
-     */
-    private byte[] getData() {
-        return data;
-    }
-
-    /**
-     * Sets data offset
-     * 
-     * @param dataOffset
-     */
-    private void setDataOffset(long dataOffset) {
-        this.dataOffset = dataOffset;
-    }
-
-    /**
-     * Returns data offset
-     * 
-     * @return dataOffset
-     */
-    public long getDataOffset() {
-        return dataOffset;
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.accessor;
+
+import java.math.BigInteger;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmConstants;
+
+/**
+ * Holds chm listing entries
+ */
+public class ChmDirectoryListingSet {
+    private List<DirectoryListingEntry> dlel;
+    private byte[] data;
+    private int placeHolder = -1;
+    private long dataOffset = -1;
+    private int controlDataIndex = -1;
+    private int resetTableIndex = -1;
+
+    private boolean isNotControlDataFound = true;
+    private boolean isNotResetTableFound = true;
+
+    /**
+     * Constructs chm directory listing set
+     * 
+     * @param data
+     *            byte[]
+     * @param chmItsHeader
+     * @param chmItspHeader
+     * @throws TikaException 
+     */
+    public ChmDirectoryListingSet(byte[] data, ChmItsfHeader chmItsHeader,
+            ChmItspHeader chmItspHeader) throws TikaException {
+        setDirectoryListingEntryList(new ArrayList<DirectoryListingEntry>());
+        ChmCommons.assertByteArrayNotNull(data);
+        setData(data);
+        enumerateChmDirectoryListingList(chmItsHeader, chmItspHeader);
+    }
+
+    public String toString() {
+        StringBuilder sb = new StringBuilder();
+        sb.append("list:=" + getDirectoryListingEntryList().toString()
+                + System.getProperty("line.separator"));
+        sb.append("number of list items:="
+                + getDirectoryListingEntryList().size());
+        return sb.toString();
+    }
+
+    /**
+     * Returns the index of the control data entry in the directory listing list
+     * 
+     * @return control data index
+     */
+    public int getControlDataIndex() {
+        return controlDataIndex;
+    }
+
+    /**
+     * Sets control data index
+     * 
+     * @param controlDataIndex
+     */
+    protected void setControlDataIndex(int controlDataIndex) {
+        this.controlDataIndex = controlDataIndex;
+    }
+
+    /**
+     * Returns index of reset table
+     * 
+     * @return reset table index
+     */
+    public int getResetTableIndex() {
+        return resetTableIndex;
+    }
+
+    /**
+     * Sets reset table index
+     * 
+     * @param resetTableIndex
+     */
+    protected void setResetTableIndex(int resetTableIndex) {
+        this.resetTableIndex = resetTableIndex;
+    }
+
+    /**
+     * Gets place holder
+     * 
+     * @return place holder
+     */
+    private int getPlaceHolder() {
+        return placeHolder;
+    }
+
+    /**
+     * Sets place holder
+     * 
+     * @param placeHolder
+     */
+    private void setPlaceHolder(int placeHolder) {
+        this.placeHolder = placeHolder;
+    }
+
+    /**
+     * Enumerates chm directory listing entries
+     * 
+     * @param chmItsHeader
+     *            chm itsf header
+     * @param chmItspHeader
+     *            chm itsp header
+     */
+    private void enumerateChmDirectoryListingList(ChmItsfHeader chmItsHeader,
+            ChmItspHeader chmItspHeader) {
+        try {
+            int startPmgl = chmItspHeader.getIndex_head();
+            int stopPmgl = chmItspHeader.getUnknown_0024();
+            int dir_offset = (int) (chmItsHeader.getDirOffset() + chmItspHeader
+                    .getHeader_len());
+            setDataOffset(chmItsHeader.getDataOffset());
+
+            /* loops over all pmgls */
+            int previous_index = 0;
+            byte[] dir_chunk = null;
+            for (int i = startPmgl; i <= stopPmgl; i++) {
+                int data_copied = ((1 + i) * (int) chmItspHeader.getBlock_len())
+                        + dir_offset;
+                if (i == 0) {
+                    dir_chunk = new byte[(int) chmItspHeader.getBlock_len()];
+                    // dir_chunk = Arrays.copyOfRange(getData(), dir_offset,
+                    // (((1+i) * (int)chmItspHeader.getBlock_len()) +
+                    // dir_offset));
+                    dir_chunk = ChmCommons
+                            .copyOfRange(getData(), dir_offset,
+                                    (((1 + i) * (int) chmItspHeader
+                                            .getBlock_len()) + dir_offset));
+                    previous_index = data_copied;
+                } else {
+                    dir_chunk = new byte[(int) chmItspHeader.getBlock_len()];
+                    // dir_chunk = Arrays.copyOfRange(getData(), previous_index,
+                    // (((1+i) * (int)chmItspHeader.getBlock_len()) +
+                    // dir_offset));
+                    dir_chunk = ChmCommons
+                            .copyOfRange(getData(), previous_index,
+                                    (((1 + i) * (int) chmItspHeader
+                                            .getBlock_len()) + dir_offset));
+                    previous_index = data_copied;
+                }
+                enumerateOneSegment(dir_chunk);
+                dir_chunk = null;
+            }
+        } catch (Exception e) {
+            e.printStackTrace();
+        } finally {
+            setData(null);
+        }
+    }
+
+    /**
+     * Checks control data
+     * 
+     * @param dle
+     *            chm directory listing entry
+     */
+    private void checkControlData(DirectoryListingEntry dle) {
+        if (isNotControlDataFound) {
+            if (dle.getName().contains(ChmConstants.CONTROL_DATA)) {
+                setControlDataIndex(getDirectoryListingEntryList().size());
+                isNotControlDataFound = false;
+            }
+        }
+    }
+
+    /**
+     * Checks reset table
+     * 
+     * @param dle
+     *            chm directory listing entry
+     */
+    private void checkResetTable(DirectoryListingEntry dle) {
+        if (isNotResetTableFound) {
+            if (dle.getName().contains(ChmConstants.RESET_TABLE)) {
+                setResetTableIndex(getDirectoryListingEntryList().size());
+                isNotResetTableFound = false;
+            }
+        }
+    }
+
+    /**
+     * Enumerates chm directory listing entries in single chm segment
+     * 
+     * @param dir_chunk
+     */
+    private void enumerateOneSegment(byte[] dir_chunk) {
+        try {
+            if (dir_chunk != null) {
+
+                int indexWorkData = ChmCommons.indexOf(dir_chunk,
+                        "::".getBytes());
+                int indexUserData = ChmCommons.indexOf(dir_chunk,
+                        "/".getBytes());
+
+                if (indexUserData < indexWorkData)
+                    setPlaceHolder(indexUserData);
+                else
+                    setPlaceHolder(indexWorkData);
+
+                if (getPlaceHolder() > 0
+                        && dir_chunk[getPlaceHolder() - 1] != 115) {// #{
+                    do {
+                        if (dir_chunk[getPlaceHolder() - 1] > 0) {
+                            DirectoryListingEntry dle = new DirectoryListingEntry();
+
+                            // two cases: 1. when dir_chunk[getPlaceHolder() -
+                            // 1] == 0x73
+                            // 2. when dir_chunk[getPlaceHolder() + 1] == 0x2f
+                            doNameCheck(dir_chunk, dle);
+
+                            // dle.setName(new
+                            // String(Arrays.copyOfRange(dir_chunk,
+                            // getPlaceHolder(), (getPlaceHolder() +
+                            // dle.getNameLength()))));
+                            dle.setName(new String(ChmCommons.copyOfRange(
+                                    dir_chunk, getPlaceHolder(),
+                                    (getPlaceHolder() + dle.getNameLength()))));
+                            checkControlData(dle);
+                            checkResetTable(dle);
+                            setPlaceHolder(getPlaceHolder()
+                                    + dle.getNameLength());
+
+                            /* Sets entry type */
+                            if (getPlaceHolder() < dir_chunk.length
+                                    && dir_chunk[getPlaceHolder()] == 0)
+                                dle.setEntryType(ChmCommons.EntryType.UNCOMPRESSED);
+                            else
+                                dle.setEntryType(ChmCommons.EntryType.COMPRESSED);
+
+                            setPlaceHolder(getPlaceHolder() + 1);
+                            dle.setOffset(getEncint(dir_chunk));
+                            dle.setLength(getEncint(dir_chunk));
+                            getDirectoryListingEntryList().add(dle);
+                        } else
+                            setPlaceHolder(getPlaceHolder() + 1);
+
+                    } while (hasNext(dir_chunk));
+                }
+            }
+
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+    }
+
+    /**
+     * Checks if a name and name length are correct. If not then handles it as
+     * follows: 1. when dir_chunk[getPlaceHolder() - 1] == 0x73 ('s') 2. when
+     * dir_chunk[getPlaceHolder() + 1] == 0x2f ('/')
+     * 
+     * @param dir_chunk
+     * @param dle
+     */
+    private void doNameCheck(byte[] dir_chunk, DirectoryListingEntry dle) {
+        if (dir_chunk[getPlaceHolder() - 1] == 0x73) {
+            dle.setNameLength(dir_chunk[getPlaceHolder() - 1] & 0x21);
+        } else if (dir_chunk[getPlaceHolder() + 1] == 0x2f) {
+            dle.setNameLength(dir_chunk[getPlaceHolder()]);
+            setPlaceHolder(getPlaceHolder() + 1);
+        } else {
+            dle.setNameLength(dir_chunk[getPlaceHolder() - 1]);
+        }
+    }
+
+    /**
+     * Checks if it's possible move further on byte[]
+     * 
+     * @param dir_chunk
+     * 
+     * @return boolean
+     */
+    private boolean hasNext(byte[] dir_chunk) {
+        while (getPlaceHolder() < dir_chunk.length) {
+            if (dir_chunk[getPlaceHolder()] == 47
+                    && dir_chunk[getPlaceHolder() + 1] != ':') {
+                setPlaceHolder(getPlaceHolder());
+                return true;
+            } else if (dir_chunk[getPlaceHolder()] == ':'
+                    && dir_chunk[getPlaceHolder() + 1] == ':') {
+                setPlaceHolder(getPlaceHolder());
+                return true;
+            } else
+                setPlaceHolder(getPlaceHolder() + 1);
+        }
+        return false;
+    }
+
+    /**
+     * Returns encrypted integer
+     * 
+     * @param data_chunk
+     * 
+     * @return
+     */
+    private int getEncint(byte[] data_chunk) {
+        byte ob;
+        BigInteger bi = BigInteger.ZERO;
+        byte[] nb = new byte[1];
+
+        if (getPlaceHolder() < data_chunk.length) {
+            while ((ob = data_chunk[getPlaceHolder()]) < 0) {
+                nb[0] = (byte) ((ob & 0x7f));
+                bi = bi.shiftLeft(7).add(new BigInteger(nb));
+                setPlaceHolder(getPlaceHolder() + 1);
+            }
+            nb[0] = (byte) ((ob & 0x7f));
+            bi = bi.shiftLeft(7).add(new BigInteger(nb));
+            setPlaceHolder(getPlaceHolder() + 1);
+        }
+        return bi.intValue();
+    }
+
+    /**
+     * Empty entry point; performs no work.
+     *
+     * @param args
+     *            command-line arguments (ignored)
+     */
+    public static void main(String[] args) {
+    }
+
+    /**
+     * Replaces the chm directory listing entry list held by this object. The
+     * given reference is stored as-is; no copy is made.
+     *
+     * @param dlel
+     *            chm directory listing entry list
+     */
+    public void setDirectoryListingEntryList(List<DirectoryListingEntry> dlel) {
+        this.dlel = dlel;
+    }
+
+    /**
+     * Returns the chm directory listing entry list. The stored reference is
+     * returned directly, so changes made by the caller are visible here.
+     *
+     * @return the list of {@link DirectoryListingEntry} objects
+     */
+    public List<DirectoryListingEntry> getDirectoryListingEntryList() {
+        return dlel;
+    }
+
+    /**
+     * Stores the given byte array as this object's data. The reference is
+     * kept as-is; no copy is made.
+     *
+     * @param data
+     *            byte array to store
+     */
+    private void setData(byte[] data) {
+        this.data = data;
+    }
+
+    /**
+     * Returns the stored data array (the reference itself, not a copy).
+     *
+     * @return the data byte array
+     */
+    private byte[] getData() {
+        return data;
+    }
+
+    /**
+     * Sets the data offset.
+     *
+     * @param dataOffset
+     *            new data offset value
+     */
+    private void setDataOffset(long dataOffset) {
+        this.dataOffset = dataOffset;
+    }
+
+    /**
+     * Returns the data offset.
+     *
+     * @return the current data offset value
+     */
+    public long getDataOffset() {
+        return dataOffset;
+    }
+}

Propchange: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java
------------------------------------------------------------------------------
    svn:eol-style = native