You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2011/09/18 12:39:12 UTC
svn commit: r1172242 [2/9] - in /tika/trunk:
tika-app/src/test/java/org/apache/tika/cli/
tika-core/src/main/java/org/apache/tika/detect/
tika-core/src/main/java/org/apache/tika/extractor/
tika-core/src/main/java/org/apache/tika/fork/ tika-core/src/main...
Propchange: tika/trunk/tika-core/src/main/resources/org/apache/tika/language/tika.language.properties
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/test/java/org/apache/tika/fork/ForkParserTest.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/test/java/org/apache/tika/fork/ForkTestParser.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/test/java/org/apache/tika/io/LookaheadInputStreamTest.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/language/LanguageProfilerBuilderTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/language/LanguageProfilerBuilderTest.java?rev=1172242&r1=1172241&r2=1172242&view=diff
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/language/LanguageProfilerBuilderTest.java (original)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/language/LanguageProfilerBuilderTest.java Sun Sep 18 10:39:08 2011
@@ -1,102 +1,102 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.language;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.net.URISyntaxException;
-
-import junit.framework.Assert;
-import junit.framework.TestCase;
-import org.apache.tika.exception.TikaException;
-
-public class LanguageProfilerBuilderTest extends TestCase {
- /* Test members */
- private LanguageProfilerBuilder ngramProfile = null;
- private LanguageProfile langProfile = null;
- private final String profileName = "../tika-core/src/test/resources/org/apache/tika/language/langbuilder/"
- + LanguageProfilerBuilderTest.class.getName();
- private final String corpusName = "langbuilder/welsh_corpus.txt";
- private final String encoding = "UTF-8";
- private final String FILE_EXTENSION = "ngp";
- private final String LANGUAGE = "welsh";
- private final int maxlen = 1000;
-
- public void testCreateProfile() throws TikaException, IOException, URISyntaxException {
- InputStream is =
- LanguageProfilerBuilderTest.class.getResourceAsStream(corpusName);
- try {
- ngramProfile = LanguageProfilerBuilder.create(profileName, is , encoding);
- } finally {
- is.close();
- }
-
- File f = new File(profileName + "." + FILE_EXTENSION);
- FileOutputStream fos = new FileOutputStream(f);
- ngramProfile.save(fos);
- fos.close();
- Assert.assertEquals(maxlen, ngramProfile.getSorted().size());
- }
-
- public void testNGramProfile() throws IOException, TikaException, URISyntaxException {
- createLanguageProfile();
- LanguageIdentifier.addProfile(LANGUAGE, langProfile);
- LanguageIdentifier identifier = new LanguageIdentifier(langProfile);
- Assert.assertEquals(LANGUAGE, identifier.getLanguage());
- Assert.assertTrue(identifier.isReasonablyCertain());
- }
-
- private void createLanguageProfile() throws IOException, TikaException, URISyntaxException {
- // Sort of dependency injection
- if (ngramProfile == null)
- testCreateProfile();
-
- langProfile = new LanguageProfile();
-
- InputStream stream = new FileInputStream(new File(profileName + "."
- + FILE_EXTENSION));
- try {
- BufferedReader reader = new BufferedReader(new InputStreamReader(
- stream, encoding));
- String line = reader.readLine();
- while (line != null) {
- if (line.length() > 0 && !line.startsWith("#")) {// skips the
- // ngp
- // header/comment
- int space = line.indexOf(' ');
- langProfile.add(line.substring(0, space),
- Long.parseLong(line.substring(space + 1)));
- }
- line = reader.readLine();
- }
- } finally {
- stream.close();
- }
- }
-
- public void tearDown() throws Exception {
- File profile = new File(profileName + "." + FILE_EXTENSION);
- if (profile.exists())
- profile.delete();
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.language;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.net.URISyntaxException;
+
+import junit.framework.Assert;
+import junit.framework.TestCase;
+import org.apache.tika.exception.TikaException;
+
+public class LanguageProfilerBuilderTest extends TestCase {
+ /* Test members */
+ private LanguageProfilerBuilder ngramProfile = null;
+ private LanguageProfile langProfile = null;
+ private final String profileName = "../tika-core/src/test/resources/org/apache/tika/language/langbuilder/"
+ + LanguageProfilerBuilderTest.class.getName();
+ private final String corpusName = "langbuilder/welsh_corpus.txt";
+ private final String encoding = "UTF-8";
+ private final String FILE_EXTENSION = "ngp";
+ private final String LANGUAGE = "welsh";
+ private final int maxlen = 1000;
+
+ public void testCreateProfile() throws TikaException, IOException, URISyntaxException {
+ InputStream is =
+ LanguageProfilerBuilderTest.class.getResourceAsStream(corpusName);
+ try {
+ ngramProfile = LanguageProfilerBuilder.create(profileName, is , encoding);
+ } finally {
+ is.close();
+ }
+
+ File f = new File(profileName + "." + FILE_EXTENSION);
+ FileOutputStream fos = new FileOutputStream(f);
+ ngramProfile.save(fos);
+ fos.close();
+ Assert.assertEquals(maxlen, ngramProfile.getSorted().size());
+ }
+
+ public void testNGramProfile() throws IOException, TikaException, URISyntaxException {
+ createLanguageProfile();
+ LanguageIdentifier.addProfile(LANGUAGE, langProfile);
+ LanguageIdentifier identifier = new LanguageIdentifier(langProfile);
+ Assert.assertEquals(LANGUAGE, identifier.getLanguage());
+ Assert.assertTrue(identifier.isReasonablyCertain());
+ }
+
+ private void createLanguageProfile() throws IOException, TikaException, URISyntaxException {
+ // Sort of dependency injection
+ if (ngramProfile == null)
+ testCreateProfile();
+
+ langProfile = new LanguageProfile();
+
+ InputStream stream = new FileInputStream(new File(profileName + "."
+ + FILE_EXTENSION));
+ try {
+ BufferedReader reader = new BufferedReader(new InputStreamReader(
+ stream, encoding));
+ String line = reader.readLine();
+ while (line != null) {
+ if (line.length() > 0 && !line.startsWith("#")) {// skips the
+ // ngp
+ // header/comment
+ int space = line.indexOf(' ');
+ langProfile.add(line.substring(0, space),
+ Long.parseLong(line.substring(space + 1)));
+ }
+ line = reader.readLine();
+ }
+ } finally {
+ stream.close();
+ }
+ }
+
+ public void tearDown() throws Exception {
+ File profile = new File(profileName + "." + FILE_EXTENSION);
+ if (profile.exists())
+ profile.delete();
+ }
+}
Propchange: tika/trunk/tika-core/src/test/java/org/apache/tika/language/LanguageProfilerBuilderTest.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/test/java/org/apache/tika/metadata/TestMetadataHelper.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/test/java/org/apache/tika/mime/PatternsTest.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/test/java/org/apache/tika/parser/CompositeParserTest.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/test/java/org/apache/tika/parser/DummyParser.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/CHM2XHTML.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/CHM2XHTML.java?rev=1172242&r1=1172241&r2=1172242&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/CHM2XHTML.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/CHM2XHTML.java Sun Sep 18 10:39:08 2011
@@ -1,57 +1,57 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.chm;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.sax.TextContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Utility class
- *
- *
- */
-public class CHM2XHTML {
-
- public static void process(CHMDocumentInformation chmDoc,
- ContentHandler handler) throws TikaException {
- String text = chmDoc.getText();
- try {
- if (text.length() > 0) {
- handler.characters(text.toCharArray(), 0, text.length());
- new CHM2XHTML(chmDoc, handler);
- } else
- throw new TikaException("Could not extract content");
-
- } catch (SAXException e) {
- throw new RuntimeException(e);
- }
- }
-
- protected String getText(CHMDocumentInformation chmDoc)
- throws TikaException {
- return chmDoc.getText();
- }
-
- protected TextContentHandler handler;
-
- public CHM2XHTML(CHMDocumentInformation chmDoc, ContentHandler handler) {
- this.handler = new TextContentHandler(handler);
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.chm;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.sax.TextContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Utility class
+ *
+ *
+ */
+public class CHM2XHTML {
+
+ public static void process(CHMDocumentInformation chmDoc,
+ ContentHandler handler) throws TikaException {
+ String text = chmDoc.getText();
+ try {
+ if (text.length() > 0) {
+ handler.characters(text.toCharArray(), 0, text.length());
+ new CHM2XHTML(chmDoc, handler);
+ } else
+ throw new TikaException("Could not extract content");
+
+ } catch (SAXException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ protected String getText(CHMDocumentInformation chmDoc)
+ throws TikaException {
+ return chmDoc.getText();
+ }
+
+ protected TextContentHandler handler;
+
+ public CHM2XHTML(CHMDocumentInformation chmDoc, ContentHandler handler) {
+ this.handler = new TextContentHandler(handler);
+ }
+}
Propchange: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/CHM2XHTML.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/CHMDocumentInformation.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/CHMDocumentInformation.java?rev=1172242&r1=1172241&r2=1172242&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/CHMDocumentInformation.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/CHMDocumentInformation.java Sun Sep 18 10:39:08 2011
@@ -1,186 +1,186 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.chm;
-
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Iterator;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
-import org.apache.tika.parser.chm.core.ChmExtractor;
-import org.apache.tika.parser.html.HtmlParser;
-import org.apache.tika.sax.BodyContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Extracts text and metadata from chm file
- *
- */
-public class CHMDocumentInformation {
- /* Class members */
- private ChmExtractor chmExtractor = null;
-
- /**
- * Loads chm file as input stream and returns a new instance of chm doc info
- *
- * @param is
- * InputStream
- *
- * @return chm document information
- * @throws TikaException
- * @throws IOException
- */
- public static CHMDocumentInformation load(InputStream is) throws TikaException, IOException {
- CHMDocumentInformation document = new CHMDocumentInformation();
- document.setChmExtractor(new ChmExtractor(is));
- return document;
- }
-
- /**
- * Appends extracted data from chm listing entries
- *
- * @return extracted content of chm
- */
- private String getContent() {
- StringBuilder sb = new StringBuilder();
- DirectoryListingEntry entry;
-
- for (Iterator<DirectoryListingEntry> it = getChmExtractor()
- .getChmDirList().getDirectoryListingEntryList().iterator(); it.hasNext();)
- {
- try {
- entry = it.next();
- if (isRightEntry(entry)) {
- byte[][] tmp = getChmExtractor().extractChmEntry(entry);
- if (tmp != null) {
- sb.append(extract(tmp));
- }
- }
- } catch (TikaException e) {
- //ignore
- } // catch (IOException e) {//Pushback exception from tagsoup
- // System.err.println(e.getMessage());
- }
- return sb.toString();
- }
-
- /**
- * Checks if an entry is a html or not.
- *
- * @param entry
- * chm directory listing entry
- *
- * @return boolean
- */
- private boolean isRightEntry(DirectoryListingEntry entry) {
- return (entry.getName().endsWith(".html") || entry.getName().endsWith(".htm"));
- }
-
- /**
- * Returns chm extractor
- *
- * @return chmExtractor
- */
- private ChmExtractor getChmExtractor() {
- return chmExtractor;
- }
-
- /**
- * Sets a chm extractor
- *
- * @param chmExtractor
- */
- private void setChmExtractor(ChmExtractor chmExtractor) {
- this.chmExtractor = chmExtractor;
- }
-
- /**
- * Returns chm metadata
- *
- * @param metadata
- *
- * @throws TikaException
- * @throws IOException
- */
- public void getCHMDocInformation(Metadata metadata) throws TikaException,
- IOException {
- if (getChmExtractor() != null) {
- /* Checking if file is a chm, done during creating chmItsf header */
- metadata.add(Metadata.CONTENT_TYPE, "application/x-chm");
- } else {
- metadata.add(Metadata.CONTENT_TYPE, "unknown");
- }
- }
-
- /**
- * Returns extracted text from chm file
- *
- * @return text
- *
- * @throws TikaException
- */
- public String getText() throws TikaException {
- return getContent();
- }
-
- /**
- * Extracts data from byte[][]
- *
- * @param byteObject
- * @return
- * @throws IOException
- * @throws SAXException
- */
- private String extract(byte[][] byteObject) {// throws IOException
- StringBuilder wBuf = new StringBuilder();
- InputStream stream = null;
- Metadata metadata = new Metadata();
- HtmlParser htmlParser = new HtmlParser();
- BodyContentHandler handler = new BodyContentHandler(-1);// -1
- ParseContext parser = new ParseContext();
- try {
- for (int i = 0; i < byteObject.length; i++) {
- stream = new ByteArrayInputStream(byteObject[i]);
- try {
- htmlParser.parse(stream, handler, metadata, parser);
- } catch (TikaException e) {
- wBuf.append(new String(byteObject[i]));
-// System.err.println("\n"
-// + CHMDocumentInformation.class.getName()
-// + " extract " + e.getMessage());
- } finally {
- wBuf.append(handler.toString()
- + System.getProperty("line.separator"));
- stream.close();
- }
- }
- } catch (SAXException e) {
- throw new RuntimeException(e);
- } catch (IOException e) {//
- // Pushback overflow from tagsoup
- }
- return wBuf.toString();
- }
-
- public static void main(String[] args) {
-
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.chm;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Iterator;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
+import org.apache.tika.parser.chm.core.ChmExtractor;
+import org.apache.tika.parser.html.HtmlParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Extracts text and metadata from chm file
+ *
+ */
+public class CHMDocumentInformation {
+ /* Class members */
+ private ChmExtractor chmExtractor = null;
+
+ /**
+ * Loads chm file as input stream and returns a new instance of chm doc info
+ *
+ * @param is
+ * InputStream
+ *
+ * @return chm document information
+ * @throws TikaException
+ * @throws IOException
+ */
+ public static CHMDocumentInformation load(InputStream is) throws TikaException, IOException {
+ CHMDocumentInformation document = new CHMDocumentInformation();
+ document.setChmExtractor(new ChmExtractor(is));
+ return document;
+ }
+
+ /**
+ * Appends extracted data from chm listing entries
+ *
+ * @return extracted content of chm
+ */
+ private String getContent() {
+ StringBuilder sb = new StringBuilder();
+ DirectoryListingEntry entry;
+
+ for (Iterator<DirectoryListingEntry> it = getChmExtractor()
+ .getChmDirList().getDirectoryListingEntryList().iterator(); it.hasNext();)
+ {
+ try {
+ entry = it.next();
+ if (isRightEntry(entry)) {
+ byte[][] tmp = getChmExtractor().extractChmEntry(entry);
+ if (tmp != null) {
+ sb.append(extract(tmp));
+ }
+ }
+ } catch (TikaException e) {
+ //ignore
+ } // catch (IOException e) {//Pushback exception from tagsoup
+ // System.err.println(e.getMessage());
+ }
+ return sb.toString();
+ }
+
+ /**
+ * Checks if an entry is a html or not.
+ *
+ * @param entry
+ * chm directory listing entry
+ *
+ * @return boolean
+ */
+ private boolean isRightEntry(DirectoryListingEntry entry) {
+ return (entry.getName().endsWith(".html") || entry.getName().endsWith(".htm"));
+ }
+
+ /**
+ * Returns chm extractor
+ *
+ * @return chmExtractor
+ */
+ private ChmExtractor getChmExtractor() {
+ return chmExtractor;
+ }
+
+ /**
+ * Sets a chm extractor
+ *
+ * @param chmExtractor
+ */
+ private void setChmExtractor(ChmExtractor chmExtractor) {
+ this.chmExtractor = chmExtractor;
+ }
+
+ /**
+ * Returns chm metadata
+ *
+ * @param metadata
+ *
+ * @throws TikaException
+ * @throws IOException
+ */
+ public void getCHMDocInformation(Metadata metadata) throws TikaException,
+ IOException {
+ if (getChmExtractor() != null) {
+ /* Checking if file is a chm, done during creating chmItsf header */
+ metadata.add(Metadata.CONTENT_TYPE, "application/x-chm");
+ } else {
+ metadata.add(Metadata.CONTENT_TYPE, "unknown");
+ }
+ }
+
+ /**
+ * Returns extracted text from chm file
+ *
+ * @return text
+ *
+ * @throws TikaException
+ */
+ public String getText() throws TikaException {
+ return getContent();
+ }
+
+ /**
+ * Extracts data from byte[][]
+ *
+ * @param byteObject
+ * @return
+ * @throws IOException
+ * @throws SAXException
+ */
+ private String extract(byte[][] byteObject) {// throws IOException
+ StringBuilder wBuf = new StringBuilder();
+ InputStream stream = null;
+ Metadata metadata = new Metadata();
+ HtmlParser htmlParser = new HtmlParser();
+ BodyContentHandler handler = new BodyContentHandler(-1);// -1
+ ParseContext parser = new ParseContext();
+ try {
+ for (int i = 0; i < byteObject.length; i++) {
+ stream = new ByteArrayInputStream(byteObject[i]);
+ try {
+ htmlParser.parse(stream, handler, metadata, parser);
+ } catch (TikaException e) {
+ wBuf.append(new String(byteObject[i]));
+// System.err.println("\n"
+// + CHMDocumentInformation.class.getName()
+// + " extract " + e.getMessage());
+ } finally {
+ wBuf.append(handler.toString()
+ + System.getProperty("line.separator"));
+ stream.close();
+ }
+ }
+ } catch (SAXException e) {
+ throw new RuntimeException(e);
+ } catch (IOException e) {//
+ // Pushback overflow from tagsoup
+ }
+ return wBuf.toString();
+ }
+
+ public static void main(String[] args) {
+
+ }
+}
Propchange: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/CHMDocumentInformation.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/ChmParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/ChmParser.java?rev=1172242&r1=1172241&r2=1172242&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/ChmParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/ChmParser.java Sun Sep 18 10:39:08 2011
@@ -1,56 +1,56 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Collections;
-import java.util.Set;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-public class ChmParser extends AbstractParser {
-
- private static final long serialVersionUID = 5938777307516469802L;
- private static final Set<MediaType> SUPPORTED_TYPES = Collections
- .singleton(MediaType.application("chm"));
-
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return SUPPORTED_TYPES;
- }
-
-
- public void parse(InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context) throws IOException,
- SAXException, TikaException {
- CHMDocumentInformation chmInfo = CHMDocumentInformation.load(stream);
- metadata.set(Metadata.CONTENT_TYPE, "chm");
- extractMetadata(chmInfo, metadata);
- CHM2XHTML.process(chmInfo, handler);
- }
-
- private void extractMetadata(CHMDocumentInformation chmInfo,
- Metadata metadata) throws TikaException, IOException {
- chmInfo.getCHMDocInformation(metadata);
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class ChmParser extends AbstractParser {
+
+ private static final long serialVersionUID = 5938777307516469802L;
+ private static final Set<MediaType> SUPPORTED_TYPES = Collections
+ .singleton(MediaType.application("chm"));
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+
+ public void parse(InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context) throws IOException,
+ SAXException, TikaException {
+ CHMDocumentInformation chmInfo = CHMDocumentInformation.load(stream);
+ metadata.set(Metadata.CONTENT_TYPE, "chm");
+ extractMetadata(chmInfo, metadata);
+ CHM2XHTML.process(chmInfo, handler);
+ }
+
+ private void extractMetadata(CHMDocumentInformation chmInfo,
+ Metadata metadata) throws TikaException, IOException {
+ chmInfo.getCHMDocInformation(metadata);
+ }
+}
Propchange: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/ChmParser.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java?rev=1172242&r1=1172241&r2=1172242&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java Sun Sep 18 10:39:08 2011
@@ -1,39 +1,39 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm.accessor;
-
-import java.io.Serializable;
-
-import org.apache.tika.exception.TikaException;
-
-/**
- *
- * Defines an accessor interface
- *
- * @param <T>
- */
-public interface ChmAccessor<T> extends Serializable {
- /**
- * Parses chm accessor
- *
- * @param data
- * chm file
- * @param chmAccessor
- * @throws TikaException
- */
- void parse(byte[] data, T chmAccessor) throws TikaException;
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.accessor;
+
+import java.io.Serializable;
+
+import org.apache.tika.exception.TikaException;
+
/**
 * Accessor interface for chm data structures: an implementation has its
 * fields filled in by parsing a raw byte array taken from a chm file.
 *
 * @param <T> the concrete accessor type that receives the parsed values
 */
public interface ChmAccessor<T> extends Serializable {
    /**
     * Parses the given raw chm data and populates the supplied accessor.
     *
     * @param data
     *            raw bytes of the chm file (or of the relevant section)
     * @param chmAccessor the accessor instance to populate with parsed values
     * @throws TikaException if the data cannot be parsed
     */
    void parse(byte[] data, T chmAccessor) throws TikaException;
}
Propchange: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java?rev=1172242&r1=1172241&r2=1172242&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java Sun Sep 18 10:39:08 2011
@@ -1,397 +1,397 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm.accessor;
-
-import java.math.BigInteger;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.chm.core.ChmCommons;
-import org.apache.tika.parser.chm.core.ChmConstants;
-
-/**
- * Holds chm listing entries
- */
-public class ChmDirectoryListingSet {
- private List<DirectoryListingEntry> dlel;
- private byte[] data;
- private int placeHolder = -1;
- private long dataOffset = -1;
- private int controlDataIndex = -1;
- private int resetTableIndex = -1;
-
- private boolean isNotControlDataFound = true;
- private boolean isNotResetTableFound = true;
-
- /**
- * Constructs chm directory listing set
- *
- * @param data
- * byte[]
- * @param chmItsHeader
- * @param chmItspHeader
- * @throws TikaException
- */
- public ChmDirectoryListingSet(byte[] data, ChmItsfHeader chmItsHeader,
- ChmItspHeader chmItspHeader) throws TikaException {
- setDirectoryListingEntryList(new ArrayList<DirectoryListingEntry>());
- ChmCommons.assertByteArrayNotNull(data);
- setData(data);
- enumerateChmDirectoryListingList(chmItsHeader, chmItspHeader);
- }
-
- public String toString() {
- StringBuilder sb = new StringBuilder();
- sb.append("list:=" + getDirectoryListingEntryList().toString()
- + System.getProperty("line.separator"));
- sb.append("number of list items:="
- + getDirectoryListingEntryList().size());
- return sb.toString();
- }
-
- /**
- * Returns control data index that located in List
- *
- * @return control data index
- */
- public int getControlDataIndex() {
- return controlDataIndex;
- }
-
- /**
- * Sets control data index
- *
- * @param controlDataIndex
- */
- protected void setControlDataIndex(int controlDataIndex) {
- this.controlDataIndex = controlDataIndex;
- }
-
- /**
- * Return index of reset table
- *
- * @return reset table index
- */
- public int getResetTableIndex() {
- return resetTableIndex;
- }
-
- /**
- * Sets reset table index
- *
- * @param resetTableIndex
- */
- protected void setResetTableIndex(int resetTableIndex) {
- this.resetTableIndex = resetTableIndex;
- }
-
- /**
- * Gets place holder
- *
- * @return place holder
- */
- private int getPlaceHolder() {
- return placeHolder;
- }
-
- /**
- * Sets place holder
- *
- * @param placeHolder
- */
- private void setPlaceHolder(int placeHolder) {
- this.placeHolder = placeHolder;
- }
-
- /**
- * Enumerates chm directory listing entries
- *
- * @param chmItsHeader
- * chm itsf header
- * @param chmItspHeader
- * chm itsp header
- */
- private void enumerateChmDirectoryListingList(ChmItsfHeader chmItsHeader,
- ChmItspHeader chmItspHeader) {
- try {
- int startPmgl = chmItspHeader.getIndex_head();
- int stopPmgl = chmItspHeader.getUnknown_0024();
- int dir_offset = (int) (chmItsHeader.getDirOffset() + chmItspHeader
- .getHeader_len());
- setDataOffset(chmItsHeader.getDataOffset());
-
- /* loops over all pmgls */
- int previous_index = 0;
- byte[] dir_chunk = null;
- for (int i = startPmgl; i <= stopPmgl; i++) {
- int data_copied = ((1 + i) * (int) chmItspHeader.getBlock_len())
- + dir_offset;
- if (i == 0) {
- dir_chunk = new byte[(int) chmItspHeader.getBlock_len()];
- // dir_chunk = Arrays.copyOfRange(getData(), dir_offset,
- // (((1+i) * (int)chmItspHeader.getBlock_len()) +
- // dir_offset));
- dir_chunk = ChmCommons
- .copyOfRange(getData(), dir_offset,
- (((1 + i) * (int) chmItspHeader
- .getBlock_len()) + dir_offset));
- previous_index = data_copied;
- } else {
- dir_chunk = new byte[(int) chmItspHeader.getBlock_len()];
- // dir_chunk = Arrays.copyOfRange(getData(), previous_index,
- // (((1+i) * (int)chmItspHeader.getBlock_len()) +
- // dir_offset));
- dir_chunk = ChmCommons
- .copyOfRange(getData(), previous_index,
- (((1 + i) * (int) chmItspHeader
- .getBlock_len()) + dir_offset));
- previous_index = data_copied;
- }
- enumerateOneSegment(dir_chunk);
- dir_chunk = null;
- }
- } catch (Exception e) {
- e.printStackTrace();
- } finally {
- setData(null);
- }
- }
-
- /**
- * Checks control data
- *
- * @param dle
- * chm directory listing entry
- */
- private void checkControlData(DirectoryListingEntry dle) {
- if (isNotControlDataFound) {
- if (dle.getName().contains(ChmConstants.CONTROL_DATA)) {
- setControlDataIndex(getDirectoryListingEntryList().size());
- isNotControlDataFound = false;
- }
- }
- }
-
- /**
- * Checks reset table
- *
- * @param dle
- * chm directory listing entry
- */
- private void checkResetTable(DirectoryListingEntry dle) {
- if (isNotResetTableFound) {
- if (dle.getName().contains(ChmConstants.RESET_TABLE)) {
- setResetTableIndex(getDirectoryListingEntryList().size());
- isNotResetTableFound = false;
- }
- }
- }
-
- /**
- * Enumerates chm directory listing entries in single chm segment
- *
- * @param dir_chunk
- */
- private void enumerateOneSegment(byte[] dir_chunk) {
- try {
- if (dir_chunk != null) {
-
- int indexWorkData = ChmCommons.indexOf(dir_chunk,
- "::".getBytes());
- int indexUserData = ChmCommons.indexOf(dir_chunk,
- "/".getBytes());
-
- if (indexUserData < indexWorkData)
- setPlaceHolder(indexUserData);
- else
- setPlaceHolder(indexWorkData);
-
- if (getPlaceHolder() > 0
- && dir_chunk[getPlaceHolder() - 1] != 115) {// #{
- do {
- if (dir_chunk[getPlaceHolder() - 1] > 0) {
- DirectoryListingEntry dle = new DirectoryListingEntry();
-
- // two cases: 1. when dir_chunk[getPlaceHolder() -
- // 1] == 0x73
- // 2. when dir_chunk[getPlaceHolder() + 1] == 0x2f
- doNameCheck(dir_chunk, dle);
-
- // dle.setName(new
- // String(Arrays.copyOfRange(dir_chunk,
- // getPlaceHolder(), (getPlaceHolder() +
- // dle.getNameLength()))));
- dle.setName(new String(ChmCommons.copyOfRange(
- dir_chunk, getPlaceHolder(),
- (getPlaceHolder() + dle.getNameLength()))));
- checkControlData(dle);
- checkResetTable(dle);
- setPlaceHolder(getPlaceHolder()
- + dle.getNameLength());
-
- /* Sets entry type */
- if (getPlaceHolder() < dir_chunk.length
- && dir_chunk[getPlaceHolder()] == 0)
- dle.setEntryType(ChmCommons.EntryType.UNCOMPRESSED);
- else
- dle.setEntryType(ChmCommons.EntryType.COMPRESSED);
-
- setPlaceHolder(getPlaceHolder() + 1);
- dle.setOffset(getEncint(dir_chunk));
- dle.setLength(getEncint(dir_chunk));
- getDirectoryListingEntryList().add(dle);
- } else
- setPlaceHolder(getPlaceHolder() + 1);
-
- } while (hasNext(dir_chunk));
- }
- }
-
- } catch (Exception e) {
- e.printStackTrace();
- }
- }
-
- /**
- * Checks if a name and name length are correct. If not then handles it as
- * follows: 1. when dir_chunk[getPlaceHolder() - 1] == 0x73 ('/') 2. when
- * dir_chunk[getPlaceHolder() + 1] == 0x2f ('s')
- *
- * @param dir_chunk
- * @param dle
- */
- private void doNameCheck(byte[] dir_chunk, DirectoryListingEntry dle) {
- if (dir_chunk[getPlaceHolder() - 1] == 0x73) {
- dle.setNameLength(dir_chunk[getPlaceHolder() - 1] & 0x21);
- } else if (dir_chunk[getPlaceHolder() + 1] == 0x2f) {
- dle.setNameLength(dir_chunk[getPlaceHolder()]);
- setPlaceHolder(getPlaceHolder() + 1);
- } else {
- dle.setNameLength(dir_chunk[getPlaceHolder() - 1]);
- }
- }
-
- /**
- * Checks if it's possible move further on byte[]
- *
- * @param dir_chunk
- *
- * @return boolean
- */
- private boolean hasNext(byte[] dir_chunk) {
- while (getPlaceHolder() < dir_chunk.length) {
- if (dir_chunk[getPlaceHolder()] == 47
- && dir_chunk[getPlaceHolder() + 1] != ':') {
- setPlaceHolder(getPlaceHolder());
- return true;
- } else if (dir_chunk[getPlaceHolder()] == ':'
- && dir_chunk[getPlaceHolder() + 1] == ':') {
- setPlaceHolder(getPlaceHolder());
- return true;
- } else
- setPlaceHolder(getPlaceHolder() + 1);
- }
- return false;
- }
-
- /**
- * Returns encrypted integer
- *
- * @param data_chunk
- *
- * @return
- */
- private int getEncint(byte[] data_chunk) {
- byte ob;
- BigInteger bi = BigInteger.ZERO;
- byte[] nb = new byte[1];
-
- if (getPlaceHolder() < data_chunk.length) {
- while ((ob = data_chunk[getPlaceHolder()]) < 0) {
- nb[0] = (byte) ((ob & 0x7f));
- bi = bi.shiftLeft(7).add(new BigInteger(nb));
- setPlaceHolder(getPlaceHolder() + 1);
- }
- nb[0] = (byte) ((ob & 0x7f));
- bi = bi.shiftLeft(7).add(new BigInteger(nb));
- setPlaceHolder(getPlaceHolder() + 1);
- }
- return bi.intValue();
- }
-
- /**
- * @param args
- */
- public static void main(String[] args) {
- }
-
- /**
- * Sets chm directory listing entry list
- *
- * @param dlel
- * chm directory listing entry list
- */
- public void setDirectoryListingEntryList(List<DirectoryListingEntry> dlel) {
- this.dlel = dlel;
- }
-
- /**
- * Returns chm directory listing entry list
- *
- * @return List<DirectoryListingEntry>
- */
- public List<DirectoryListingEntry> getDirectoryListingEntryList() {
- return dlel;
- }
-
- /**
- * Sets data
- *
- * @param data
- */
- private void setData(byte[] data) {
- this.data = data;
- }
-
- /**
- * Returns data
- *
- * @return
- */
- private byte[] getData() {
- return data;
- }
-
- /**
- * Sets data offset
- *
- * @param dataOffset
- */
- private void setDataOffset(long dataOffset) {
- this.dataOffset = dataOffset;
- }
-
- /**
- * Returns data offset
- *
- * @return dataOffset
- */
- public long getDataOffset() {
- return dataOffset;
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.accessor;
+
+import java.math.BigInteger;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmConstants;
+
+/**
+ * Holds chm listing entries
+ */
+public class ChmDirectoryListingSet {
+ private List<DirectoryListingEntry> dlel;
+ private byte[] data;
+ private int placeHolder = -1;
+ private long dataOffset = -1;
+ private int controlDataIndex = -1;
+ private int resetTableIndex = -1;
+
+ private boolean isNotControlDataFound = true;
+ private boolean isNotResetTableFound = true;
+
+ /**
+ * Constructs chm directory listing set
+ *
+ * @param data
+ * byte[]
+ * @param chmItsHeader
+ * @param chmItspHeader
+ * @throws TikaException
+ */
+ public ChmDirectoryListingSet(byte[] data, ChmItsfHeader chmItsHeader,
+ ChmItspHeader chmItspHeader) throws TikaException {
+ setDirectoryListingEntryList(new ArrayList<DirectoryListingEntry>());
+ ChmCommons.assertByteArrayNotNull(data);
+ setData(data);
+ enumerateChmDirectoryListingList(chmItsHeader, chmItspHeader);
+ }
+
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ sb.append("list:=" + getDirectoryListingEntryList().toString()
+ + System.getProperty("line.separator"));
+ sb.append("number of list items:="
+ + getDirectoryListingEntryList().size());
+ return sb.toString();
+ }
+
+ /**
+ * Returns control data index that located in List
+ *
+ * @return control data index
+ */
+ public int getControlDataIndex() {
+ return controlDataIndex;
+ }
+
+ /**
+ * Sets control data index
+ *
+ * @param controlDataIndex
+ */
+ protected void setControlDataIndex(int controlDataIndex) {
+ this.controlDataIndex = controlDataIndex;
+ }
+
+ /**
+ * Return index of reset table
+ *
+ * @return reset table index
+ */
+ public int getResetTableIndex() {
+ return resetTableIndex;
+ }
+
+ /**
+ * Sets reset table index
+ *
+ * @param resetTableIndex
+ */
+ protected void setResetTableIndex(int resetTableIndex) {
+ this.resetTableIndex = resetTableIndex;
+ }
+
+ /**
+ * Gets place holder
+ *
+ * @return place holder
+ */
+ private int getPlaceHolder() {
+ return placeHolder;
+ }
+
+ /**
+ * Sets place holder
+ *
+ * @param placeHolder
+ */
+ private void setPlaceHolder(int placeHolder) {
+ this.placeHolder = placeHolder;
+ }
+
+ /**
+ * Enumerates chm directory listing entries
+ *
+ * @param chmItsHeader
+ * chm itsf header
+ * @param chmItspHeader
+ * chm itsp header
+ */
+ private void enumerateChmDirectoryListingList(ChmItsfHeader chmItsHeader,
+ ChmItspHeader chmItspHeader) {
+ try {
+ int startPmgl = chmItspHeader.getIndex_head();
+ int stopPmgl = chmItspHeader.getUnknown_0024();
+ int dir_offset = (int) (chmItsHeader.getDirOffset() + chmItspHeader
+ .getHeader_len());
+ setDataOffset(chmItsHeader.getDataOffset());
+
+ /* loops over all pmgls */
+ int previous_index = 0;
+ byte[] dir_chunk = null;
+ for (int i = startPmgl; i <= stopPmgl; i++) {
+ int data_copied = ((1 + i) * (int) chmItspHeader.getBlock_len())
+ + dir_offset;
+ if (i == 0) {
+ dir_chunk = new byte[(int) chmItspHeader.getBlock_len()];
+ // dir_chunk = Arrays.copyOfRange(getData(), dir_offset,
+ // (((1+i) * (int)chmItspHeader.getBlock_len()) +
+ // dir_offset));
+ dir_chunk = ChmCommons
+ .copyOfRange(getData(), dir_offset,
+ (((1 + i) * (int) chmItspHeader
+ .getBlock_len()) + dir_offset));
+ previous_index = data_copied;
+ } else {
+ dir_chunk = new byte[(int) chmItspHeader.getBlock_len()];
+ // dir_chunk = Arrays.copyOfRange(getData(), previous_index,
+ // (((1+i) * (int)chmItspHeader.getBlock_len()) +
+ // dir_offset));
+ dir_chunk = ChmCommons
+ .copyOfRange(getData(), previous_index,
+ (((1 + i) * (int) chmItspHeader
+ .getBlock_len()) + dir_offset));
+ previous_index = data_copied;
+ }
+ enumerateOneSegment(dir_chunk);
+ dir_chunk = null;
+ }
+ } catch (Exception e) {
+ e.printStackTrace();
+ } finally {
+ setData(null);
+ }
+ }
+
+ /**
+ * Checks control data
+ *
+ * @param dle
+ * chm directory listing entry
+ */
+ private void checkControlData(DirectoryListingEntry dle) {
+ if (isNotControlDataFound) {
+ if (dle.getName().contains(ChmConstants.CONTROL_DATA)) {
+ setControlDataIndex(getDirectoryListingEntryList().size());
+ isNotControlDataFound = false;
+ }
+ }
+ }
+
+ /**
+ * Checks reset table
+ *
+ * @param dle
+ * chm directory listing entry
+ */
+ private void checkResetTable(DirectoryListingEntry dle) {
+ if (isNotResetTableFound) {
+ if (dle.getName().contains(ChmConstants.RESET_TABLE)) {
+ setResetTableIndex(getDirectoryListingEntryList().size());
+ isNotResetTableFound = false;
+ }
+ }
+ }
+
+ /**
+ * Enumerates chm directory listing entries in single chm segment
+ *
+ * @param dir_chunk
+ */
+ private void enumerateOneSegment(byte[] dir_chunk) {
+ try {
+ if (dir_chunk != null) {
+
+ int indexWorkData = ChmCommons.indexOf(dir_chunk,
+ "::".getBytes());
+ int indexUserData = ChmCommons.indexOf(dir_chunk,
+ "/".getBytes());
+
+ if (indexUserData < indexWorkData)
+ setPlaceHolder(indexUserData);
+ else
+ setPlaceHolder(indexWorkData);
+
+ if (getPlaceHolder() > 0
+ && dir_chunk[getPlaceHolder() - 1] != 115) {// #{
+ do {
+ if (dir_chunk[getPlaceHolder() - 1] > 0) {
+ DirectoryListingEntry dle = new DirectoryListingEntry();
+
+ // two cases: 1. when dir_chunk[getPlaceHolder() -
+ // 1] == 0x73
+ // 2. when dir_chunk[getPlaceHolder() + 1] == 0x2f
+ doNameCheck(dir_chunk, dle);
+
+ // dle.setName(new
+ // String(Arrays.copyOfRange(dir_chunk,
+ // getPlaceHolder(), (getPlaceHolder() +
+ // dle.getNameLength()))));
+ dle.setName(new String(ChmCommons.copyOfRange(
+ dir_chunk, getPlaceHolder(),
+ (getPlaceHolder() + dle.getNameLength()))));
+ checkControlData(dle);
+ checkResetTable(dle);
+ setPlaceHolder(getPlaceHolder()
+ + dle.getNameLength());
+
+ /* Sets entry type */
+ if (getPlaceHolder() < dir_chunk.length
+ && dir_chunk[getPlaceHolder()] == 0)
+ dle.setEntryType(ChmCommons.EntryType.UNCOMPRESSED);
+ else
+ dle.setEntryType(ChmCommons.EntryType.COMPRESSED);
+
+ setPlaceHolder(getPlaceHolder() + 1);
+ dle.setOffset(getEncint(dir_chunk));
+ dle.setLength(getEncint(dir_chunk));
+ getDirectoryListingEntryList().add(dle);
+ } else
+ setPlaceHolder(getPlaceHolder() + 1);
+
+ } while (hasNext(dir_chunk));
+ }
+ }
+
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+ }
+
+ /**
+ * Checks if a name and name length are correct. If not then handles it as
+ * follows: 1. when dir_chunk[getPlaceHolder() - 1] == 0x73 ('/') 2. when
+ * dir_chunk[getPlaceHolder() + 1] == 0x2f ('s')
+ *
+ * @param dir_chunk
+ * @param dle
+ */
+ private void doNameCheck(byte[] dir_chunk, DirectoryListingEntry dle) {
+ if (dir_chunk[getPlaceHolder() - 1] == 0x73) {
+ dle.setNameLength(dir_chunk[getPlaceHolder() - 1] & 0x21);
+ } else if (dir_chunk[getPlaceHolder() + 1] == 0x2f) {
+ dle.setNameLength(dir_chunk[getPlaceHolder()]);
+ setPlaceHolder(getPlaceHolder() + 1);
+ } else {
+ dle.setNameLength(dir_chunk[getPlaceHolder() - 1]);
+ }
+ }
+
+ /**
+ * Checks if it's possible move further on byte[]
+ *
+ * @param dir_chunk
+ *
+ * @return boolean
+ */
+ private boolean hasNext(byte[] dir_chunk) {
+ while (getPlaceHolder() < dir_chunk.length) {
+ if (dir_chunk[getPlaceHolder()] == 47
+ && dir_chunk[getPlaceHolder() + 1] != ':') {
+ setPlaceHolder(getPlaceHolder());
+ return true;
+ } else if (dir_chunk[getPlaceHolder()] == ':'
+ && dir_chunk[getPlaceHolder() + 1] == ':') {
+ setPlaceHolder(getPlaceHolder());
+ return true;
+ } else
+ setPlaceHolder(getPlaceHolder() + 1);
+ }
+ return false;
+ }
+
+ /**
+ * Returns encrypted integer
+ *
+ * @param data_chunk
+ *
+ * @return
+ */
+ private int getEncint(byte[] data_chunk) {
+ byte ob;
+ BigInteger bi = BigInteger.ZERO;
+ byte[] nb = new byte[1];
+
+ if (getPlaceHolder() < data_chunk.length) {
+ while ((ob = data_chunk[getPlaceHolder()]) < 0) {
+ nb[0] = (byte) ((ob & 0x7f));
+ bi = bi.shiftLeft(7).add(new BigInteger(nb));
+ setPlaceHolder(getPlaceHolder() + 1);
+ }
+ nb[0] = (byte) ((ob & 0x7f));
+ bi = bi.shiftLeft(7).add(new BigInteger(nb));
+ setPlaceHolder(getPlaceHolder() + 1);
+ }
+ return bi.intValue();
+ }
+
+ /**
+ * @param args
+ */
+ public static void main(String[] args) {
+ }
+
+ /**
+ * Sets chm directory listing entry list
+ *
+ * @param dlel
+ * chm directory listing entry list
+ */
+ public void setDirectoryListingEntryList(List<DirectoryListingEntry> dlel) {
+ this.dlel = dlel;
+ }
+
+ /**
+ * Returns chm directory listing entry list
+ *
+ * @return List<DirectoryListingEntry>
+ */
+ public List<DirectoryListingEntry> getDirectoryListingEntryList() {
+ return dlel;
+ }
+
+ /**
+ * Sets data
+ *
+ * @param data
+ */
+ private void setData(byte[] data) {
+ this.data = data;
+ }
+
+ /**
+ * Returns data
+ *
+ * @return
+ */
+ private byte[] getData() {
+ return data;
+ }
+
+ /**
+ * Sets data offset
+ *
+ * @param dataOffset
+ */
+ private void setDataOffset(long dataOffset) {
+ this.dataOffset = dataOffset;
+ }
+
+ /**
+ * Returns data offset
+ *
+ * @return dataOffset
+ */
+ public long getDataOffset() {
+ return dataOffset;
+ }
+}
Propchange: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java
------------------------------------------------------------------------------
svn:eol-style = native