You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2012/11/07 16:05:14 UTC
svn commit: r1406663 -
/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
Author: jukka
Date: Wed Nov 7 15:05:14 2012
New Revision: 1406663
URL: http://svn.apache.org/viewvc?rev=1406663&view=rev
Log:
TIKA-1009: Expose TextDocument in BoilerpipeContentHandler
Patch by Markus Jelsma
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java?rev=1406663&r1=1406662&r2=1406663&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java Wed Nov 7 15:05:14 2012
@@ -40,7 +40,7 @@ import de.l3s.boilerpipe.sax.BoilerpipeH
/**
* Uses the <a href="http://code.google.com/p/boilerpipe/">boilerpipe</a>
* library to automatically extract the main content from a web page.
- *
+ *
* Use this as a {@link ContentHandler} object passed to
* {@link HtmlParser#parse(java.io.InputStream, ContentHandler, Metadata, org.apache.tika.parser.ParseContext)}
*/
@@ -52,7 +52,7 @@ public class BoilerpipeContentHandler ex
END,
CONTINUE
}
-
+
private String uri;
private String localName;
private String qName;
@@ -63,15 +63,15 @@ public class BoilerpipeContentHandler ex
public RecordedElement(String uri, String localName, String qName, Attributes attrs) {
this(uri, localName, qName, attrs, ElementType.START);
}
-
+
public RecordedElement(String uri, String localName, String qName) {
this(uri, localName, qName, null, ElementType.END);
}
-
+
public RecordedElement() {
this(null, null, null, null, ElementType.CONTINUE);
}
-
+
protected RecordedElement(String uri, String localName, String qName, Attributes attrs, RecordedElement.ElementType elementType) {
this.uri = uri;
this.localName = localName;
@@ -85,7 +85,7 @@ public class BoilerpipeContentHandler ex
public String toString() {
return String.format("<%s> of type %s", localName, elementType);
};
-
+
public String getUri() {
return uri;
}
@@ -105,12 +105,12 @@ public class BoilerpipeContentHandler ex
public List<char[]> getCharacters() {
return characters;
}
-
+
public RecordedElement.ElementType getElementType() {
return elementType;
}
}
-
+
/**
* The newline character that gets inserted after block elements.
*/
@@ -118,17 +118,18 @@ public class BoilerpipeContentHandler ex
private ContentHandler delegate;
private BoilerpipeExtractor extractor;
-
+
private boolean includeMarkup;
private boolean inHeader;
private boolean inFooter;
private int headerCharOffset;
private List<RecordedElement> elements;
-
+ private TextDocument td;
+
/**
* Creates a new boilerpipe-based content extractor, using the
* {@link DefaultExtractor} extraction rules and "delegate" as the content handler.
- *
+ *
* @param delegate
* The {@link ContentHandler} object
*/
@@ -150,13 +151,14 @@ public class BoilerpipeContentHandler ex
* Creates a new boilerpipe-based content extractor, using the given
* extraction rules. The extracted main content will be passed to the
* <delegate> content handler.
- *
+ *
* @param delegate
* The {@link ContentHandler} object
* @param extractor
* Extraction rules to use, e.g. {@link ArticleExtractor}
*/
public BoilerpipeContentHandler(ContentHandler delegate, BoilerpipeExtractor extractor) {
+ this.td = null;
this.delegate = delegate;
this.extractor = extractor;
}
@@ -164,36 +166,45 @@ public class BoilerpipeContentHandler ex
public void setIncludeMarkup(boolean includeMarkup) {
this.includeMarkup = includeMarkup;
}
-
+
public boolean isIncludeMarkup() {
return includeMarkup;
}
-
+
+ /**
+ * Retrieves the built TextDocument
+ *
+ * @return TextDocument
+ */
+ public TextDocument getTextDocument() {
+ return td;
+ }
+
@Override
public void startDocument() throws SAXException {
super.startDocument();
-
+
delegate.startDocument();
-
+
inHeader = true;
inFooter = false;
headerCharOffset = 0;
-
+
if (includeMarkup) {
elements = new ArrayList<RecordedElement>();
}
};
-
+
@Override
public void startPrefixMapping(String prefix, String uri) throws SAXException {
super.startPrefixMapping(prefix, uri);
delegate.startPrefixMapping(prefix, uri);
};
-
+
@Override
public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
super.startElement(uri, localName, qName, atts);
-
+
if (inHeader) {
delegate.startElement(uri, localName, qName, atts);
} else if (inFooter) {
@@ -205,11 +216,11 @@ public class BoilerpipeContentHandler ex
delegate.startElement(uri, localName, qName, atts);
}
};
-
+
@Override
public void characters(char[] chars, int offset, int length) throws SAXException {
super.characters(chars, offset, length);
-
+
if (inHeader) {
delegate.characters(chars, offset, length);
headerCharOffset++;
@@ -217,17 +228,17 @@ public class BoilerpipeContentHandler ex
// Do nothing
} else if (includeMarkup) {
RecordedElement element = elements.get(elements.size() - 1);
-
+
char[] characters = new char[length];
System.arraycopy(chars, offset, characters, 0, length);
element.getCharacters().add(characters);
}
};
-
+
@Override
public void endElement(String uri, String localName, String qName) throws SAXException {
super.endElement(uri, localName, qName);
-
+
if (inHeader) {
delegate.endElement(uri, localName, qName);
inHeader = !localName.equals("head");
@@ -241,18 +252,18 @@ public class BoilerpipeContentHandler ex
elements.add(new RecordedElement());
}
};
-
+
@Override
public void endDocument() throws SAXException {
super.endDocument();
- TextDocument td = toTextDocument();
+ td = toTextDocument();
try {
extractor.process(td);
} catch (BoilerpipeProcessingException e) {
throw new SAXException(e);
}
-
+
Attributes emptyAttrs = new AttributesImpl();
// At this point we have all the information we need to either emit N paragraphs
@@ -268,7 +279,7 @@ public class BoilerpipeContentHandler ex
}
}
}
-
+
// Now have bits set for all valid character runs. Replay our recorded elements,
// but only emit character runs flagged as valid.
int curCharsIndex = headerCharOffset;
@@ -277,28 +288,28 @@ public class BoilerpipeContentHandler ex
case START:
delegate.startElement(element.getUri(), element.getLocalName(), element.getQName(), element.getAttrs());
// Fall through
-
+
case CONTINUE:
// Now emit characters that are valid. Note that boilerpipe pre-increments the character index, so
// we have to follow suit.
for (char[] chars : element.getCharacters()) {
curCharsIndex++;
-
+
if (validCharacterRuns.get(curCharsIndex)) {
delegate.characters(chars, 0, chars.length);
}
}
break;
-
+
case END:
delegate.endElement(element.getUri(), element.getLocalName(), element.getQName());
break;
-
+
default:
throw new RuntimeException("Unhandled element type: " + element.getElementType());
}
-
-
+
+
}
} else {
for (TextBlock block : td.getTextBlocks()) {
@@ -311,10 +322,10 @@ public class BoilerpipeContentHandler ex
}
}
}
-
+
delegate.endElement(XHTMLContentHandler.XHTML, "body", "body");
delegate.endElement(XHTMLContentHandler.XHTML, "html", "html");
-
+
// We defer ending any prefix mapping until here, which is why we don't pass this
// through to the delegate in an overridden method.
delegate.endPrefixMapping("");