You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@tika.apache.org by "Gabriele Kahlout (JIRA)" <ji...@apache.org> on 2011/06/18 14:39:47 UTC
[jira] [Created] (TIKA-676) Boilerpipe fails
Boilerpipe fails
----------------
Key: TIKA-676
URL: https://issues.apache.org/jira/browse/TIKA-676
Project: Tika
Issue Type: Bug
Reporter: Gabriele Kahlout
Priority: Minor
Fix For: 1.0
This is apparently a boilerpipe issue, they fixed in the [Web API edition | http://boilerpipe-web.appspot.com/].
{code}
$ curl --fail -L http://thisrecording.com/the-past | java -jar tika-app-0.9.jar -T
% Total % Received % Xferd Average Speed Time Time Time Current
Dload Upload Total Spent Left Speed
100 65688 0 65688 0 0 17650 0 --:--:-- 0:00:03 --:--:-- 18698Exception in thread "main" org.xml.sax.SAXException: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again
100 128k 0 128k 0 0 32019 0 --:--:-- 0:00:04 --:--:-- 33735
at de.l3s.boilerpipe.sax.CommonTagActions$2.start(CommonTagActions.java:108)
at de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler.startElement(BoilerpipeHTMLContentHandler.java:169)
at org.apache.tika.parser.html.BoilerpipeContentHandler.startElement(BoilerpipeContentHandler.java:195)
at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
at org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:237)
at org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:279)
at org.apache.tika.parser.html.HtmlHandler.startElementWithSafeAttributes(HtmlHandler.java:197)
at org.apache.tika.parser.html.HtmlHandler.startElement(HtmlHandler.java:135)
at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
at org.apache.tika.parser.html.XHTMLDowngradeHandler.startElement(XHTMLDowngradeHandler.java:61)
at org.ccil.cowan.tagsoup.Parser.push(Parser.java:794)
at org.ccil.cowan.tagsoup.Parser.rectify(Parser.java:1061)
at org.ccil.cowan.tagsoup.Parser.stagc(Parser.java:1016)
at org.ccil.cowan.tagsoup.HTMLScanner.scan(HTMLScanner.java:565)
at org.ccil.cowan.tagsoup.Parser.parse(Parser.java:449)
at org.apache.tika.parser.html.HtmlParser.parse(HtmlParser.java:198)
at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:197)
at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:197)
at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:135)
at org.apache.tika.cli.TikaCLI$OutputType.process(TikaCLI.java:107)
at org.apache.tika.cli.TikaCLI.process(TikaCLI.java:288)
at org.apache.tika.cli.TikaCLI.main(TikaCLI.java:94)
{code}
--
This message is automatically generated by JIRA.
For more information on JIRA, see: http://www.atlassian.com/software/jira
[jira] [Commented] (TIKA-676) Boilerpipe fails
Posted by "Christian Kohlschütter (JIRA)" <ji...@apache.org>.
[ https://issues.apache.org/jira/browse/TIKA-676?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=13060694#comment-13060694 ]
Christian Kohlschütter commented on TIKA-676:
---------------------------------------------
Could you please try again with boilerpipe-core 1.2.0? (just released today, bringing it up-to-date with the boilerpipe-web API)
> Boilerpipe fails
> ----------------
>
> Key: TIKA-676
> URL: https://issues.apache.org/jira/browse/TIKA-676
> Project: Tika
> Issue Type: Bug
> Reporter: Gabriele Kahlout
> Priority: Minor
> Fix For: 1.0
>
>
> This is apparently a [boilerpipe issue |http://code.google.com/p/boilerpipe/issues/detail?id=24 ], they fixed in the [Web API edition | http://boilerpipe-web.appspot.com/].
> {code}
> $ curl --fail -L http://thisrecording.com/the-past | java -jar tika-app-0.9.jar -T
> % Total % Received % Xferd Average Speed Time Time Time Current
> Dload Upload Total Spent Left Speed
> 100 65688 0 65688 0 0 17650 0 --:--:-- 0:00:03 --:--:-- 18698Exception in thread "main" org.xml.sax.SAXException: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again
> 100 128k 0 128k 0 0 32019 0 --:--:-- 0:00:04 --:--:-- 33735
> at de.l3s.boilerpipe.sax.CommonTagActions$2.start(CommonTagActions.java:108)
> at de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler.startElement(BoilerpipeHTMLContentHandler.java:169)
> at org.apache.tika.parser.html.BoilerpipeContentHandler.startElement(BoilerpipeContentHandler.java:195)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:237)
> at org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:279)
> at org.apache.tika.parser.html.HtmlHandler.startElementWithSafeAttributes(HtmlHandler.java:197)
> at org.apache.tika.parser.html.HtmlHandler.startElement(HtmlHandler.java:135)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.parser.html.XHTMLDowngradeHandler.startElement(XHTMLDowngradeHandler.java:61)
> at org.ccil.cowan.tagsoup.Parser.push(Parser.java:794)
> at org.ccil.cowan.tagsoup.Parser.rectify(Parser.java:1061)
> at org.ccil.cowan.tagsoup.Parser.stagc(Parser.java:1016)
> at org.ccil.cowan.tagsoup.HTMLScanner.scan(HTMLScanner.java:565)
> at org.ccil.cowan.tagsoup.Parser.parse(Parser.java:449)
> at org.apache.tika.parser.html.HtmlParser.parse(HtmlParser.java:198)
> at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:197)
> at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:197)
> at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:135)
> at org.apache.tika.cli.TikaCLI$OutputType.process(TikaCLI.java:107)
> at org.apache.tika.cli.TikaCLI.process(TikaCLI.java:288)
> at org.apache.tika.cli.TikaCLI.main(TikaCLI.java:94)
> {code}
--
This message is automatically generated by JIRA.
For more information on JIRA, see: http://www.atlassian.com/software/jira
[jira] [Commented] (TIKA-676) Boilerpipe fails
Posted by "Markus Jelsma (Commented) (JIRA)" <ji...@apache.org>.
[ https://issues.apache.org/jira/browse/TIKA-676?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=13174770#comment-13174770 ]
Markus Jelsma commented on TIKA-676:
------------------------------------
The latest artifact is still not published on Maven Central. Can we publish it there through some other means?
> Boilerpipe fails
> ----------------
>
> Key: TIKA-676
> URL: https://issues.apache.org/jira/browse/TIKA-676
> Project: Tika
> Issue Type: Bug
> Reporter: Gabriele Kahlout
> Priority: Minor
>
> This is apparently a [boilerpipe issue |http://code.google.com/p/boilerpipe/issues/detail?id=24 ], they fixed in the [Web API edition | http://boilerpipe-web.appspot.com/].
> {code}
> $ curl --fail -L http://thisrecording.com/the-past | java -jar tika-app-0.9.jar -T
> % Total % Received % Xferd Average Speed Time Time Time Current
> Dload Upload Total Spent Left Speed
> 100 65688 0 65688 0 0 17650 0 --:--:-- 0:00:03 --:--:-- 18698Exception in thread "main" org.xml.sax.SAXException: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again
> 100 128k 0 128k 0 0 32019 0 --:--:-- 0:00:04 --:--:-- 33735
> at de.l3s.boilerpipe.sax.CommonTagActions$2.start(CommonTagActions.java:108)
> at de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler.startElement(BoilerpipeHTMLContentHandler.java:169)
> at org.apache.tika.parser.html.BoilerpipeContentHandler.startElement(BoilerpipeContentHandler.java:195)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:237)
> at org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:279)
> at org.apache.tika.parser.html.HtmlHandler.startElementWithSafeAttributes(HtmlHandler.java:197)
> at org.apache.tika.parser.html.HtmlHandler.startElement(HtmlHandler.java:135)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.parser.html.XHTMLDowngradeHandler.startElement(XHTMLDowngradeHandler.java:61)
> at org.ccil.cowan.tagsoup.Parser.push(Parser.java:794)
> at org.ccil.cowan.tagsoup.Parser.rectify(Parser.java:1061)
> at org.ccil.cowan.tagsoup.Parser.stagc(Parser.java:1016)
> at org.ccil.cowan.tagsoup.HTMLScanner.scan(HTMLScanner.java:565)
> at org.ccil.cowan.tagsoup.Parser.parse(Parser.java:449)
> at org.apache.tika.parser.html.HtmlParser.parse(HtmlParser.java:198)
> at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:197)
> at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:197)
> at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:135)
> at org.apache.tika.cli.TikaCLI$OutputType.process(TikaCLI.java:107)
> at org.apache.tika.cli.TikaCLI.process(TikaCLI.java:288)
> at org.apache.tika.cli.TikaCLI.main(TikaCLI.java:94)
> {code}
--
This message is automatically generated by JIRA.
If you think it was sent incorrectly, please contact your JIRA administrators: https://issues.apache.org/jira/secure/ContactAdministrators!default.jspa
For more information on JIRA, see: http://www.atlassian.com/software/jira
[jira] [Updated] (TIKA-676) Boilerpipe fails
Posted by "Chris A. Mattmann (JIRA)" <ji...@apache.org>.
[ https://issues.apache.org/jira/browse/TIKA-676?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Chris A. Mattmann updated TIKA-676:
-----------------------------------
Component/s: parser
> Boilerpipe fails
> ----------------
>
> Key: TIKA-676
> URL: https://issues.apache.org/jira/browse/TIKA-676
> Project: Tika
> Issue Type: Bug
> Components: parser
> Reporter: Gabriele Kahlout
> Priority: Minor
>
> This is apparently a [boilerpipe issue |http://code.google.com/p/boilerpipe/issues/detail?id=24 ], they fixed in the [Web API edition | http://boilerpipe-web.appspot.com/].
> {code}
> $ curl --fail -L http://thisrecording.com/the-past | java -jar tika-app-0.9.jar -T
> % Total % Received % Xferd Average Speed Time Time Time Current
> Dload Upload Total Spent Left Speed
> 100 65688 0 65688 0 0 17650 0 --:--:-- 0:00:03 --:--:-- 18698Exception in thread "main" org.xml.sax.SAXException: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again
> 100 128k 0 128k 0 0 32019 0 --:--:-- 0:00:04 --:--:-- 33735
> at de.l3s.boilerpipe.sax.CommonTagActions$2.start(CommonTagActions.java:108)
> at de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler.startElement(BoilerpipeHTMLContentHandler.java:169)
> at org.apache.tika.parser.html.BoilerpipeContentHandler.startElement(BoilerpipeContentHandler.java:195)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:237)
> at org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:279)
> at org.apache.tika.parser.html.HtmlHandler.startElementWithSafeAttributes(HtmlHandler.java:197)
> at org.apache.tika.parser.html.HtmlHandler.startElement(HtmlHandler.java:135)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.parser.html.XHTMLDowngradeHandler.startElement(XHTMLDowngradeHandler.java:61)
> at org.ccil.cowan.tagsoup.Parser.push(Parser.java:794)
> at org.ccil.cowan.tagsoup.Parser.rectify(Parser.java:1061)
> at org.ccil.cowan.tagsoup.Parser.stagc(Parser.java:1016)
> at org.ccil.cowan.tagsoup.HTMLScanner.scan(HTMLScanner.java:565)
> at org.ccil.cowan.tagsoup.Parser.parse(Parser.java:449)
> at org.apache.tika.parser.html.HtmlParser.parse(HtmlParser.java:198)
> at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:197)
> at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:197)
> at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:135)
> at org.apache.tika.cli.TikaCLI$OutputType.process(TikaCLI.java:107)
> at org.apache.tika.cli.TikaCLI.process(TikaCLI.java:288)
> at org.apache.tika.cli.TikaCLI.main(TikaCLI.java:94)
> {code}
--
This message is automatically generated by JIRA.
If you think it was sent incorrectly, please contact your JIRA administrators: https://issues.apache.org/jira/secure/ContactAdministrators!default.jspa
For more information on JIRA, see: http://www.atlassian.com/software/jira
[jira] [Updated] (TIKA-676) Boilerpipe fails
Posted by "Jukka Zitting (JIRA)" <ji...@apache.org>.
[ https://issues.apache.org/jira/browse/TIKA-676?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Jukka Zitting updated TIKA-676:
-------------------------------
Fix Version/s: (was: 0.10)
Removed the 0.10 target version. We'll ship the fix once it's available.
> Boilerpipe fails
> ----------------
>
> Key: TIKA-676
> URL: https://issues.apache.org/jira/browse/TIKA-676
> Project: Tika
> Issue Type: Bug
> Reporter: Gabriele Kahlout
> Priority: Minor
>
> This is apparently a [boilerpipe issue |http://code.google.com/p/boilerpipe/issues/detail?id=24 ], they fixed in the [Web API edition | http://boilerpipe-web.appspot.com/].
> {code}
> $ curl --fail -L http://thisrecording.com/the-past | java -jar tika-app-0.9.jar -T
> % Total % Received % Xferd Average Speed Time Time Time Current
> Dload Upload Total Spent Left Speed
> 100 65688 0 65688 0 0 17650 0 --:--:-- 0:00:03 --:--:-- 18698Exception in thread "main" org.xml.sax.SAXException: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again
> 100 128k 0 128k 0 0 32019 0 --:--:-- 0:00:04 --:--:-- 33735
> at de.l3s.boilerpipe.sax.CommonTagActions$2.start(CommonTagActions.java:108)
> at de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler.startElement(BoilerpipeHTMLContentHandler.java:169)
> at org.apache.tika.parser.html.BoilerpipeContentHandler.startElement(BoilerpipeContentHandler.java:195)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:237)
> at org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:279)
> at org.apache.tika.parser.html.HtmlHandler.startElementWithSafeAttributes(HtmlHandler.java:197)
> at org.apache.tika.parser.html.HtmlHandler.startElement(HtmlHandler.java:135)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.parser.html.XHTMLDowngradeHandler.startElement(XHTMLDowngradeHandler.java:61)
> at org.ccil.cowan.tagsoup.Parser.push(Parser.java:794)
> at org.ccil.cowan.tagsoup.Parser.rectify(Parser.java:1061)
> at org.ccil.cowan.tagsoup.Parser.stagc(Parser.java:1016)
> at org.ccil.cowan.tagsoup.HTMLScanner.scan(HTMLScanner.java:565)
> at org.ccil.cowan.tagsoup.Parser.parse(Parser.java:449)
> at org.apache.tika.parser.html.HtmlParser.parse(HtmlParser.java:198)
> at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:197)
> at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:197)
> at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:135)
> at org.apache.tika.cli.TikaCLI$OutputType.process(TikaCLI.java:107)
> at org.apache.tika.cli.TikaCLI.process(TikaCLI.java:288)
> at org.apache.tika.cli.TikaCLI.main(TikaCLI.java:94)
> {code}
--
This message is automatically generated by JIRA.
For more information on JIRA, see: http://www.atlassian.com/software/jira
[jira] [Commented] (TIKA-676) Boilerpipe fails
Posted by "Markus Jelsma (JIRA)" <ji...@apache.org>.
[ https://issues.apache.org/jira/browse/TIKA-676?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=13065220#comment-13065220 ]
Markus Jelsma commented on TIKA-676:
------------------------------------
Good work! Upgrading to BoilerPipe 1.2.0 fixes the issue, although it still complains about the NekoHTML bug. I had to upgrade manually, though, as Maven won't find the 1.2.0 artifact. It would be great if it could be published; then the tika-parser pom can also be updated.
> Boilerpipe fails
> ----------------
>
> Key: TIKA-676
> URL: https://issues.apache.org/jira/browse/TIKA-676
> Project: Tika
> Issue Type: Bug
> Reporter: Gabriele Kahlout
> Priority: Minor
> Fix For: 1.0
>
>
> This is apparently a [boilerpipe issue |http://code.google.com/p/boilerpipe/issues/detail?id=24 ], they fixed in the [Web API edition | http://boilerpipe-web.appspot.com/].
> {code}
> $ curl --fail -L http://thisrecording.com/the-past | java -jar tika-app-0.9.jar -T
> % Total % Received % Xferd Average Speed Time Time Time Current
> Dload Upload Total Spent Left Speed
> 100 65688 0 65688 0 0 17650 0 --:--:-- 0:00:03 --:--:-- 18698Exception in thread "main" org.xml.sax.SAXException: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again
> 100 128k 0 128k 0 0 32019 0 --:--:-- 0:00:04 --:--:-- 33735
> at de.l3s.boilerpipe.sax.CommonTagActions$2.start(CommonTagActions.java:108)
> at de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler.startElement(BoilerpipeHTMLContentHandler.java:169)
> at org.apache.tika.parser.html.BoilerpipeContentHandler.startElement(BoilerpipeContentHandler.java:195)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:237)
> at org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:279)
> at org.apache.tika.parser.html.HtmlHandler.startElementWithSafeAttributes(HtmlHandler.java:197)
> at org.apache.tika.parser.html.HtmlHandler.startElement(HtmlHandler.java:135)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.parser.html.XHTMLDowngradeHandler.startElement(XHTMLDowngradeHandler.java:61)
> at org.ccil.cowan.tagsoup.Parser.push(Parser.java:794)
> at org.ccil.cowan.tagsoup.Parser.rectify(Parser.java:1061)
> at org.ccil.cowan.tagsoup.Parser.stagc(Parser.java:1016)
> at org.ccil.cowan.tagsoup.HTMLScanner.scan(HTMLScanner.java:565)
> at org.ccil.cowan.tagsoup.Parser.parse(Parser.java:449)
> at org.apache.tika.parser.html.HtmlParser.parse(HtmlParser.java:198)
> at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:197)
> at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:197)
> at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:135)
> at org.apache.tika.cli.TikaCLI$OutputType.process(TikaCLI.java:107)
> at org.apache.tika.cli.TikaCLI.process(TikaCLI.java:288)
> at org.apache.tika.cli.TikaCLI.main(TikaCLI.java:94)
> {code}
--
This message is automatically generated by JIRA.
For more information on JIRA, see: http://www.atlassian.com/software/jira
[jira] [Updated] (TIKA-676) Boilerpipe fails
Posted by "Gabriele Kahlout (JIRA)" <ji...@apache.org>.
[ https://issues.apache.org/jira/browse/TIKA-676?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Gabriele Kahlout updated TIKA-676:
----------------------------------
Description:
This is apparently a [boilerpipe issue |http://code.google.com/p/boilerpipe/issues/detail?id=24 ], they fixed in the [Web API edition | http://boilerpipe-web.appspot.com/].
{code}
$ curl --fail -L http://thisrecording.com/the-past | java -jar tika-app-0.9.jar -T
% Total % Received % Xferd Average Speed Time Time Time Current
Dload Upload Total Spent Left Speed
100 65688 0 65688 0 0 17650 0 --:--:-- 0:00:03 --:--:-- 18698Exception in thread "main" org.xml.sax.SAXException: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again
100 128k 0 128k 0 0 32019 0 --:--:-- 0:00:04 --:--:-- 33735
at de.l3s.boilerpipe.sax.CommonTagActions$2.start(CommonTagActions.java:108)
at de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler.startElement(BoilerpipeHTMLContentHandler.java:169)
at org.apache.tika.parser.html.BoilerpipeContentHandler.startElement(BoilerpipeContentHandler.java:195)
at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
at org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:237)
at org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:279)
at org.apache.tika.parser.html.HtmlHandler.startElementWithSafeAttributes(HtmlHandler.java:197)
at org.apache.tika.parser.html.HtmlHandler.startElement(HtmlHandler.java:135)
at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
at org.apache.tika.parser.html.XHTMLDowngradeHandler.startElement(XHTMLDowngradeHandler.java:61)
at org.ccil.cowan.tagsoup.Parser.push(Parser.java:794)
at org.ccil.cowan.tagsoup.Parser.rectify(Parser.java:1061)
at org.ccil.cowan.tagsoup.Parser.stagc(Parser.java:1016)
at org.ccil.cowan.tagsoup.HTMLScanner.scan(HTMLScanner.java:565)
at org.ccil.cowan.tagsoup.Parser.parse(Parser.java:449)
at org.apache.tika.parser.html.HtmlParser.parse(HtmlParser.java:198)
at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:197)
at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:197)
at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:135)
at org.apache.tika.cli.TikaCLI$OutputType.process(TikaCLI.java:107)
at org.apache.tika.cli.TikaCLI.process(TikaCLI.java:288)
at org.apache.tika.cli.TikaCLI.main(TikaCLI.java:94)
{code}
was:
This is apparently a boilerpipe issue, they fixed in the [Web API edition | http://boilerpipe-web.appspot.com/].
{code}
$ curl --fail -L http://thisrecording.com/the-past | java -jar tika-app-0.9.jar -T
% Total % Received % Xferd Average Speed Time Time Time Current
Dload Upload Total Spent Left Speed
100 65688 0 65688 0 0 17650 0 --:--:-- 0:00:03 --:--:-- 18698Exception in thread "main" org.xml.sax.SAXException: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again
100 128k 0 128k 0 0 32019 0 --:--:-- 0:00:04 --:--:-- 33735
at de.l3s.boilerpipe.sax.CommonTagActions$2.start(CommonTagActions.java:108)
at de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler.startElement(BoilerpipeHTMLContentHandler.java:169)
at org.apache.tika.parser.html.BoilerpipeContentHandler.startElement(BoilerpipeContentHandler.java:195)
at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
at org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:237)
at org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:279)
at org.apache.tika.parser.html.HtmlHandler.startElementWithSafeAttributes(HtmlHandler.java:197)
at org.apache.tika.parser.html.HtmlHandler.startElement(HtmlHandler.java:135)
at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
at org.apache.tika.parser.html.XHTMLDowngradeHandler.startElement(XHTMLDowngradeHandler.java:61)
at org.ccil.cowan.tagsoup.Parser.push(Parser.java:794)
at org.ccil.cowan.tagsoup.Parser.rectify(Parser.java:1061)
at org.ccil.cowan.tagsoup.Parser.stagc(Parser.java:1016)
at org.ccil.cowan.tagsoup.HTMLScanner.scan(HTMLScanner.java:565)
at org.ccil.cowan.tagsoup.Parser.parse(Parser.java:449)
at org.apache.tika.parser.html.HtmlParser.parse(HtmlParser.java:198)
at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:197)
at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:197)
at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:135)
at org.apache.tika.cli.TikaCLI$OutputType.process(TikaCLI.java:107)
at org.apache.tika.cli.TikaCLI.process(TikaCLI.java:288)
at org.apache.tika.cli.TikaCLI.main(TikaCLI.java:94)
{code}
> Boilerpipe fails
> ----------------
>
> Key: TIKA-676
> URL: https://issues.apache.org/jira/browse/TIKA-676
> Project: Tika
> Issue Type: Bug
> Reporter: Gabriele Kahlout
> Priority: Minor
> Fix For: 1.0
>
>
> This is apparently a [boilerpipe issue |http://code.google.com/p/boilerpipe/issues/detail?id=24 ], they fixed in the [Web API edition | http://boilerpipe-web.appspot.com/].
> {code}
> $ curl --fail -L http://thisrecording.com/the-past | java -jar tika-app-0.9.jar -T
> % Total % Received % Xferd Average Speed Time Time Time Current
> Dload Upload Total Spent Left Speed
> 100 65688 0 65688 0 0 17650 0 --:--:-- 0:00:03 --:--:-- 18698Exception in thread "main" org.xml.sax.SAXException: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again
> 100 128k 0 128k 0 0 32019 0 --:--:-- 0:00:04 --:--:-- 33735
> at de.l3s.boilerpipe.sax.CommonTagActions$2.start(CommonTagActions.java:108)
> at de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler.startElement(BoilerpipeHTMLContentHandler.java:169)
> at org.apache.tika.parser.html.BoilerpipeContentHandler.startElement(BoilerpipeContentHandler.java:195)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:237)
> at org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:279)
> at org.apache.tika.parser.html.HtmlHandler.startElementWithSafeAttributes(HtmlHandler.java:197)
> at org.apache.tika.parser.html.HtmlHandler.startElement(HtmlHandler.java:135)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.parser.html.XHTMLDowngradeHandler.startElement(XHTMLDowngradeHandler.java:61)
> at org.ccil.cowan.tagsoup.Parser.push(Parser.java:794)
> at org.ccil.cowan.tagsoup.Parser.rectify(Parser.java:1061)
> at org.ccil.cowan.tagsoup.Parser.stagc(Parser.java:1016)
> at org.ccil.cowan.tagsoup.HTMLScanner.scan(HTMLScanner.java:565)
> at org.ccil.cowan.tagsoup.Parser.parse(Parser.java:449)
> at org.apache.tika.parser.html.HtmlParser.parse(HtmlParser.java:198)
> at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:197)
> at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:197)
> at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:135)
> at org.apache.tika.cli.TikaCLI$OutputType.process(TikaCLI.java:107)
> at org.apache.tika.cli.TikaCLI.process(TikaCLI.java:288)
> at org.apache.tika.cli.TikaCLI.main(TikaCLI.java:94)
> {code}
--
This message is automatically generated by JIRA.
For more information on JIRA, see: http://www.atlassian.com/software/jira
[jira] [Commented] (TIKA-676) Boilerpipe fails
Posted by "Jukka Zitting (JIRA)" <ji...@apache.org>.
[ https://issues.apache.org/jira/browse/TIKA-676?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=13088377#comment-13088377 ]
Jukka Zitting commented on TIKA-676:
------------------------------------
We can only update the boilerpipe dependency once the new version hits Maven Central.
> Boilerpipe fails
> ----------------
>
> Key: TIKA-676
> URL: https://issues.apache.org/jira/browse/TIKA-676
> Project: Tika
> Issue Type: Bug
> Reporter: Gabriele Kahlout
> Priority: Minor
> Fix For: 1.0
>
>
> This is apparently a [boilerpipe issue |http://code.google.com/p/boilerpipe/issues/detail?id=24 ], they fixed in the [Web API edition | http://boilerpipe-web.appspot.com/].
> {code}
> $ curl --fail -L http://thisrecording.com/the-past | java -jar tika-app-0.9.jar -T
> % Total % Received % Xferd Average Speed Time Time Time Current
> Dload Upload Total Spent Left Speed
> 100 65688 0 65688 0 0 17650 0 --:--:-- 0:00:03 --:--:-- 18698Exception in thread "main" org.xml.sax.SAXException: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again
> 100 128k 0 128k 0 0 32019 0 --:--:-- 0:00:04 --:--:-- 33735
> at de.l3s.boilerpipe.sax.CommonTagActions$2.start(CommonTagActions.java:108)
> at de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler.startElement(BoilerpipeHTMLContentHandler.java:169)
> at org.apache.tika.parser.html.BoilerpipeContentHandler.startElement(BoilerpipeContentHandler.java:195)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:237)
> at org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:279)
> at org.apache.tika.parser.html.HtmlHandler.startElementWithSafeAttributes(HtmlHandler.java:197)
> at org.apache.tika.parser.html.HtmlHandler.startElement(HtmlHandler.java:135)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.parser.html.XHTMLDowngradeHandler.startElement(XHTMLDowngradeHandler.java:61)
> at org.ccil.cowan.tagsoup.Parser.push(Parser.java:794)
> at org.ccil.cowan.tagsoup.Parser.rectify(Parser.java:1061)
> at org.ccil.cowan.tagsoup.Parser.stagc(Parser.java:1016)
> at org.ccil.cowan.tagsoup.HTMLScanner.scan(HTMLScanner.java:565)
> at org.ccil.cowan.tagsoup.Parser.parse(Parser.java:449)
> at org.apache.tika.parser.html.HtmlParser.parse(HtmlParser.java:198)
> at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:197)
> at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:197)
> at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:135)
> at org.apache.tika.cli.TikaCLI$OutputType.process(TikaCLI.java:107)
> at org.apache.tika.cli.TikaCLI.process(TikaCLI.java:288)
> at org.apache.tika.cli.TikaCLI.main(TikaCLI.java:94)
> {code}
--
This message is automatically generated by JIRA.
For more information on JIRA, see: http://www.atlassian.com/software/jira
[jira] [Updated] (TIKA-676) Boilerpipe fails
Posted by "Markus Jelsma (JIRA)" <ji...@apache.org>.
[ https://issues.apache.org/jira/browse/TIKA-676?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Markus Jelsma updated TIKA-676:
-------------------------------
Comment: was deleted
(was: BTW, i also confirmed that BoilerPipe 1.2.0 fixes an EmptyStackException issue for other pages:
{code}
2011-07-14 14:18:39,635 ERROR tika.TikaParser - Error parsing http://www.botje.nl
java.util.EmptyStackException
at java.util.Stack.peek(Stack.java:85)
at java.util.Stack.pop(Stack.java:67)
at org.apache.nutch.parse.tika.DOMBuilder.endElement(DOMBuilder.java:349)
at org.apache.tika.parser.html.BoilerpipeContentHandler.endDocument(BoilerpipeContentHandler.java:315)
at org.apache.tika.sax.ContentHandlerDecorator.endDocument(ContentHandlerDecorator.java:115)
at org.apache.tika.sax.XHTMLContentHandler.endDocument(XHTMLContentHandler.java:212)
at org.apache.tika.sax.TextContentHandler.endDocument(TextContentHandler.java:57)
at org.apache.tika.sax.ContentHandlerDecorator.endDocument(ContentHandlerDecorator.java:115)
at org.ccil.cowan.tagsoup.Parser.eof(Parser.java:639)
at org.ccil.cowan.tagsoup.HTMLScanner.scan(HTMLScanner.java:589)
at org.ccil.cowan.tagsoup.Parser.parse(Parser.java:449)
at org.apache.tika.parser.html.HtmlParser.parse(HtmlParser.java:198)
at org.apache.nutch.parse.tika.TikaParser.getParse(TikaParser.java:115)
at org.apache.nutch.parse.ParseCallable.call(ParseCallable.java:35)
at org.apache.nutch.parse.ParseCallable.call(ParseCallable.java:24)
at java.util.concurrent.FutureTask$Sync.innerRun(FutureTask.java:303)
at java.util.concurrent.FutureTask.run(FutureTask.java:138)
at java.lang.Thread.run(Thread.java:662)
{code})
> Boilerpipe fails
> ----------------
>
> Key: TIKA-676
> URL: https://issues.apache.org/jira/browse/TIKA-676
> Project: Tika
> Issue Type: Bug
> Reporter: Gabriele Kahlout
> Priority: Minor
> Fix For: 1.0
>
>
> This is apparently a [boilerpipe issue |http://code.google.com/p/boilerpipe/issues/detail?id=24 ], they fixed in the [Web API edition | http://boilerpipe-web.appspot.com/].
> {code}
> $ curl --fail -L http://thisrecording.com/the-past | java -jar tika-app-0.9.jar -T
> % Total % Received % Xferd Average Speed Time Time Time Current
> Dload Upload Total Spent Left Speed
> 100 65688 0 65688 0 0 17650 0 --:--:-- 0:00:03 --:--:-- 18698Exception in thread "main" org.xml.sax.SAXException: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again
> 100 128k 0 128k 0 0 32019 0 --:--:-- 0:00:04 --:--:-- 33735
> at de.l3s.boilerpipe.sax.CommonTagActions$2.start(CommonTagActions.java:108)
> at de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler.startElement(BoilerpipeHTMLContentHandler.java:169)
> at org.apache.tika.parser.html.BoilerpipeContentHandler.startElement(BoilerpipeContentHandler.java:195)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:237)
> at org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:279)
> at org.apache.tika.parser.html.HtmlHandler.startElementWithSafeAttributes(HtmlHandler.java:197)
> at org.apache.tika.parser.html.HtmlHandler.startElement(HtmlHandler.java:135)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.parser.html.XHTMLDowngradeHandler.startElement(XHTMLDowngradeHandler.java:61)
> at org.ccil.cowan.tagsoup.Parser.push(Parser.java:794)
> at org.ccil.cowan.tagsoup.Parser.rectify(Parser.java:1061)
> at org.ccil.cowan.tagsoup.Parser.stagc(Parser.java:1016)
> at org.ccil.cowan.tagsoup.HTMLScanner.scan(HTMLScanner.java:565)
> at org.ccil.cowan.tagsoup.Parser.parse(Parser.java:449)
> at org.apache.tika.parser.html.HtmlParser.parse(HtmlParser.java:198)
> at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:197)
> at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:197)
> at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:135)
> at org.apache.tika.cli.TikaCLI$OutputType.process(TikaCLI.java:107)
> at org.apache.tika.cli.TikaCLI.process(TikaCLI.java:288)
> at org.apache.tika.cli.TikaCLI.main(TikaCLI.java:94)
> {code}
--
This message is automatically generated by JIRA.
For more information on JIRA, see: http://www.atlassian.com/software/jira
[jira] [Commented] (TIKA-676) Boilerpipe fails
Posted by "Jukka Zitting (JIRA)" <ji...@apache.org>.
[ https://issues.apache.org/jira/browse/TIKA-676?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=13089401#comment-13089401 ]
Jukka Zitting commented on TIKA-676:
------------------------------------
See [1] for why we can't/shouldn't depend on external repositories.
[1] http://www.sonatype.com/people/2010/03/why-external-repos-are-being-phased-out-of-central/
> Boilerpipe fails
> ----------------
>
> Key: TIKA-676
> URL: https://issues.apache.org/jira/browse/TIKA-676
> Project: Tika
> Issue Type: Bug
> Reporter: Gabriele Kahlout
> Priority: Minor
> Fix For: 1.0
>
>
> This is apparently a [boilerpipe issue |http://code.google.com/p/boilerpipe/issues/detail?id=24 ], they fixed in the [Web API edition | http://boilerpipe-web.appspot.com/].
> {code}
> $ curl --fail -L http://thisrecording.com/the-past | java -jar tika-app-0.9.jar -T
> % Total % Received % Xferd Average Speed Time Time Time Current
> Dload Upload Total Spent Left Speed
> 100 65688 0 65688 0 0 17650 0 --:--:-- 0:00:03 --:--:-- 18698Exception in thread "main" org.xml.sax.SAXException: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again
> 100 128k 0 128k 0 0 32019 0 --:--:-- 0:00:04 --:--:-- 33735
> at de.l3s.boilerpipe.sax.CommonTagActions$2.start(CommonTagActions.java:108)
> at de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler.startElement(BoilerpipeHTMLContentHandler.java:169)
> at org.apache.tika.parser.html.BoilerpipeContentHandler.startElement(BoilerpipeContentHandler.java:195)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:237)
> at org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:279)
> at org.apache.tika.parser.html.HtmlHandler.startElementWithSafeAttributes(HtmlHandler.java:197)
> at org.apache.tika.parser.html.HtmlHandler.startElement(HtmlHandler.java:135)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.parser.html.XHTMLDowngradeHandler.startElement(XHTMLDowngradeHandler.java:61)
> at org.ccil.cowan.tagsoup.Parser.push(Parser.java:794)
> at org.ccil.cowan.tagsoup.Parser.rectify(Parser.java:1061)
> at org.ccil.cowan.tagsoup.Parser.stagc(Parser.java:1016)
> at org.ccil.cowan.tagsoup.HTMLScanner.scan(HTMLScanner.java:565)
> at org.ccil.cowan.tagsoup.Parser.parse(Parser.java:449)
> at org.apache.tika.parser.html.HtmlParser.parse(HtmlParser.java:198)
> at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:197)
> at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:197)
> at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:135)
> at org.apache.tika.cli.TikaCLI$OutputType.process(TikaCLI.java:107)
> at org.apache.tika.cli.TikaCLI.process(TikaCLI.java:288)
> at org.apache.tika.cli.TikaCLI.main(TikaCLI.java:94)
> {code}
--
This message is automatically generated by JIRA.
For more information on JIRA, see: http://www.atlassian.com/software/jira
[jira] [Commented] (TIKA-676) Boilerpipe fails
Posted by "Markus Jelsma (JIRA)" <ji...@apache.org>.
[ https://issues.apache.org/jira/browse/TIKA-676?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=13089408#comment-13089408 ]
Markus Jelsma commented on TIKA-676:
------------------------------------
Makes sense, thanks!
> Boilerpipe fails
> ----------------
>
> Key: TIKA-676
> URL: https://issues.apache.org/jira/browse/TIKA-676
> Project: Tika
> Issue Type: Bug
> Reporter: Gabriele Kahlout
> Priority: Minor
> Fix For: 1.0
>
>
> This is apparently a [boilerpipe issue |http://code.google.com/p/boilerpipe/issues/detail?id=24 ], they fixed in the [Web API edition | http://boilerpipe-web.appspot.com/].
> {code}
> $ curl --fail -L http://thisrecording.com/the-past | java -jar tika-app-0.9.jar -T
> % Total % Received % Xferd Average Speed Time Time Time Current
> Dload Upload Total Spent Left Speed
> 100 65688 0 65688 0 0 17650 0 --:--:-- 0:00:03 --:--:-- 18698Exception in thread "main" org.xml.sax.SAXException: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again
> 100 128k 0 128k 0 0 32019 0 --:--:-- 0:00:04 --:--:-- 33735
> at de.l3s.boilerpipe.sax.CommonTagActions$2.start(CommonTagActions.java:108)
> at de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler.startElement(BoilerpipeHTMLContentHandler.java:169)
> at org.apache.tika.parser.html.BoilerpipeContentHandler.startElement(BoilerpipeContentHandler.java:195)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:237)
> at org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:279)
> at org.apache.tika.parser.html.HtmlHandler.startElementWithSafeAttributes(HtmlHandler.java:197)
> at org.apache.tika.parser.html.HtmlHandler.startElement(HtmlHandler.java:135)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.parser.html.XHTMLDowngradeHandler.startElement(XHTMLDowngradeHandler.java:61)
> at org.ccil.cowan.tagsoup.Parser.push(Parser.java:794)
> at org.ccil.cowan.tagsoup.Parser.rectify(Parser.java:1061)
> at org.ccil.cowan.tagsoup.Parser.stagc(Parser.java:1016)
> at org.ccil.cowan.tagsoup.HTMLScanner.scan(HTMLScanner.java:565)
> at org.ccil.cowan.tagsoup.Parser.parse(Parser.java:449)
> at org.apache.tika.parser.html.HtmlParser.parse(HtmlParser.java:198)
> at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:197)
> at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:197)
> at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:135)
> at org.apache.tika.cli.TikaCLI$OutputType.process(TikaCLI.java:107)
> at org.apache.tika.cli.TikaCLI.process(TikaCLI.java:288)
> at org.apache.tika.cli.TikaCLI.main(TikaCLI.java:94)
> {code}
--
This message is automatically generated by JIRA.
For more information on JIRA, see: http://www.atlassian.com/software/jira
[jira] [Commented] (TIKA-676) Boilerpipe fails
Posted by "Jukka Zitting (JIRA)" <ji...@apache.org>.
[ https://issues.apache.org/jira/browse/TIKA-676?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=13404510#comment-13404510 ]
Jukka Zitting commented on TIKA-676:
------------------------------------
bq. Can we publish it there through some other means?
Yes, see https://docs.sonatype.org/display/Repository/Uploading+3rd-party+Artifacts+to+The+Central+Repository for instructions.
> Boilerpipe fails
> ----------------
>
> Key: TIKA-676
> URL: https://issues.apache.org/jira/browse/TIKA-676
> Project: Tika
> Issue Type: Bug
> Reporter: Gabriele Kahlout
> Priority: Minor
>
> This is apparently a [boilerpipe issue |http://code.google.com/p/boilerpipe/issues/detail?id=24 ], they fixed in the [Web API edition | http://boilerpipe-web.appspot.com/].
> {code}
> $ curl --fail -L http://thisrecording.com/the-past | java -jar tika-app-0.9.jar -T
> % Total % Received % Xferd Average Speed Time Time Time Current
> Dload Upload Total Spent Left Speed
> 100 65688 0 65688 0 0 17650 0 --:--:-- 0:00:03 --:--:-- 18698Exception in thread "main" org.xml.sax.SAXException: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again
> 100 128k 0 128k 0 0 32019 0 --:--:-- 0:00:04 --:--:-- 33735
> at de.l3s.boilerpipe.sax.CommonTagActions$2.start(CommonTagActions.java:108)
> at de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler.startElement(BoilerpipeHTMLContentHandler.java:169)
> at org.apache.tika.parser.html.BoilerpipeContentHandler.startElement(BoilerpipeContentHandler.java:195)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:237)
> at org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:279)
> at org.apache.tika.parser.html.HtmlHandler.startElementWithSafeAttributes(HtmlHandler.java:197)
> at org.apache.tika.parser.html.HtmlHandler.startElement(HtmlHandler.java:135)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.parser.html.XHTMLDowngradeHandler.startElement(XHTMLDowngradeHandler.java:61)
> at org.ccil.cowan.tagsoup.Parser.push(Parser.java:794)
> at org.ccil.cowan.tagsoup.Parser.rectify(Parser.java:1061)
> at org.ccil.cowan.tagsoup.Parser.stagc(Parser.java:1016)
> at org.ccil.cowan.tagsoup.HTMLScanner.scan(HTMLScanner.java:565)
> at org.ccil.cowan.tagsoup.Parser.parse(Parser.java:449)
> at org.apache.tika.parser.html.HtmlParser.parse(HtmlParser.java:198)
> at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:197)
> at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:197)
> at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:135)
> at org.apache.tika.cli.TikaCLI$OutputType.process(TikaCLI.java:107)
> at org.apache.tika.cli.TikaCLI.process(TikaCLI.java:288)
> at org.apache.tika.cli.TikaCLI.main(TikaCLI.java:94)
> {code}
--
This message is automatically generated by JIRA.
If you think it was sent incorrectly, please contact your JIRA administrators: https://issues.apache.org/jira/secure/ContactAdministrators!default.jspa
For more information on JIRA, see: http://www.atlassian.com/software/jira
[jira] [Commented] (TIKA-676) Boilerpipe fails
Posted by "Markus Jelsma (JIRA)" <ji...@apache.org>.
[ https://issues.apache.org/jira/browse/TIKA-676?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=13086370#comment-13086370 ]
Markus Jelsma commented on TIKA-676:
------------------------------------
Is this going to be integrated with Tika 1.0? Is the BP 1.2.0 artifact going to be published?
> Boilerpipe fails
> ----------------
>
> Key: TIKA-676
> URL: https://issues.apache.org/jira/browse/TIKA-676
> Project: Tika
> Issue Type: Bug
> Reporter: Gabriele Kahlout
> Priority: Minor
> Fix For: 1.0
>
>
> This is apparently a [boilerpipe issue |http://code.google.com/p/boilerpipe/issues/detail?id=24 ], they fixed in the [Web API edition | http://boilerpipe-web.appspot.com/].
> {code}
> $ curl --fail -L http://thisrecording.com/the-past | java -jar tika-app-0.9.jar -T
> % Total % Received % Xferd Average Speed Time Time Time Current
> Dload Upload Total Spent Left Speed
> 100 65688 0 65688 0 0 17650 0 --:--:-- 0:00:03 --:--:-- 18698Exception in thread "main" org.xml.sax.SAXException: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again
> 100 128k 0 128k 0 0 32019 0 --:--:-- 0:00:04 --:--:-- 33735
> at de.l3s.boilerpipe.sax.CommonTagActions$2.start(CommonTagActions.java:108)
> at de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler.startElement(BoilerpipeHTMLContentHandler.java:169)
> at org.apache.tika.parser.html.BoilerpipeContentHandler.startElement(BoilerpipeContentHandler.java:195)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:237)
> at org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:279)
> at org.apache.tika.parser.html.HtmlHandler.startElementWithSafeAttributes(HtmlHandler.java:197)
> at org.apache.tika.parser.html.HtmlHandler.startElement(HtmlHandler.java:135)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.parser.html.XHTMLDowngradeHandler.startElement(XHTMLDowngradeHandler.java:61)
> at org.ccil.cowan.tagsoup.Parser.push(Parser.java:794)
> at org.ccil.cowan.tagsoup.Parser.rectify(Parser.java:1061)
> at org.ccil.cowan.tagsoup.Parser.stagc(Parser.java:1016)
> at org.ccil.cowan.tagsoup.HTMLScanner.scan(HTMLScanner.java:565)
> at org.ccil.cowan.tagsoup.Parser.parse(Parser.java:449)
> at org.apache.tika.parser.html.HtmlParser.parse(HtmlParser.java:198)
> at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:197)
> at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:197)
> at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:135)
> at org.apache.tika.cli.TikaCLI$OutputType.process(TikaCLI.java:107)
> at org.apache.tika.cli.TikaCLI.process(TikaCLI.java:288)
> at org.apache.tika.cli.TikaCLI.main(TikaCLI.java:94)
> {code}
--
This message is automatically generated by JIRA.
For more information on JIRA, see: http://www.atlassian.com/software/jira
[jira] [Commented] (TIKA-676) Boilerpipe fails
Posted by "Markus Jelsma (JIRA)" <ji...@apache.org>.
[ https://issues.apache.org/jira/browse/TIKA-676?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=13065229#comment-13065229 ]
Markus Jelsma commented on TIKA-676:
------------------------------------
BTW, I also confirmed that Boilerpipe 1.2.0 fixes an EmptyStackException issue for other pages:
{code}
2011-07-14 14:18:39,635 ERROR tika.TikaParser - Error parsing http://www.botje.nl
java.util.EmptyStackException
at java.util.Stack.peek(Stack.java:85)
at java.util.Stack.pop(Stack.java:67)
at org.apache.nutch.parse.tika.DOMBuilder.endElement(DOMBuilder.java:349)
at org.apache.tika.parser.html.BoilerpipeContentHandler.endDocument(BoilerpipeContentHandler.java:315)
at org.apache.tika.sax.ContentHandlerDecorator.endDocument(ContentHandlerDecorator.java:115)
at org.apache.tika.sax.XHTMLContentHandler.endDocument(XHTMLContentHandler.java:212)
at org.apache.tika.sax.TextContentHandler.endDocument(TextContentHandler.java:57)
at org.apache.tika.sax.ContentHandlerDecorator.endDocument(ContentHandlerDecorator.java:115)
at org.ccil.cowan.tagsoup.Parser.eof(Parser.java:639)
at org.ccil.cowan.tagsoup.HTMLScanner.scan(HTMLScanner.java:589)
at org.ccil.cowan.tagsoup.Parser.parse(Parser.java:449)
at org.apache.tika.parser.html.HtmlParser.parse(HtmlParser.java:198)
at org.apache.nutch.parse.tika.TikaParser.getParse(TikaParser.java:115)
at org.apache.nutch.parse.ParseCallable.call(ParseCallable.java:35)
at org.apache.nutch.parse.ParseCallable.call(ParseCallable.java:24)
at java.util.concurrent.FutureTask$Sync.innerRun(FutureTask.java:303)
at java.util.concurrent.FutureTask.run(FutureTask.java:138)
at java.lang.Thread.run(Thread.java:662)
{code}
> Boilerpipe fails
> ----------------
>
> Key: TIKA-676
> URL: https://issues.apache.org/jira/browse/TIKA-676
> Project: Tika
> Issue Type: Bug
> Reporter: Gabriele Kahlout
> Priority: Minor
> Fix For: 1.0
>
>
> This is apparently a [boilerpipe issue |http://code.google.com/p/boilerpipe/issues/detail?id=24 ], they fixed in the [Web API edition | http://boilerpipe-web.appspot.com/].
> {code}
> $ curl --fail -L http://thisrecording.com/the-past | java -jar tika-app-0.9.jar -T
> % Total % Received % Xferd Average Speed Time Time Time Current
> Dload Upload Total Spent Left Speed
> 100 65688 0 65688 0 0 17650 0 --:--:-- 0:00:03 --:--:-- 18698Exception in thread "main" org.xml.sax.SAXException: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again
> 100 128k 0 128k 0 0 32019 0 --:--:-- 0:00:04 --:--:-- 33735
> at de.l3s.boilerpipe.sax.CommonTagActions$2.start(CommonTagActions.java:108)
> at de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler.startElement(BoilerpipeHTMLContentHandler.java:169)
> at org.apache.tika.parser.html.BoilerpipeContentHandler.startElement(BoilerpipeContentHandler.java:195)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:237)
> at org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:279)
> at org.apache.tika.parser.html.HtmlHandler.startElementWithSafeAttributes(HtmlHandler.java:197)
> at org.apache.tika.parser.html.HtmlHandler.startElement(HtmlHandler.java:135)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.parser.html.XHTMLDowngradeHandler.startElement(XHTMLDowngradeHandler.java:61)
> at org.ccil.cowan.tagsoup.Parser.push(Parser.java:794)
> at org.ccil.cowan.tagsoup.Parser.rectify(Parser.java:1061)
> at org.ccil.cowan.tagsoup.Parser.stagc(Parser.java:1016)
> at org.ccil.cowan.tagsoup.HTMLScanner.scan(HTMLScanner.java:565)
> at org.ccil.cowan.tagsoup.Parser.parse(Parser.java:449)
> at org.apache.tika.parser.html.HtmlParser.parse(HtmlParser.java:198)
> at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:197)
> at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:197)
> at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:135)
> at org.apache.tika.cli.TikaCLI$OutputType.process(TikaCLI.java:107)
> at org.apache.tika.cli.TikaCLI.process(TikaCLI.java:288)
> at org.apache.tika.cli.TikaCLI.main(TikaCLI.java:94)
> {code}
--
This message is automatically generated by JIRA.
For more information on JIRA, see: http://www.atlassian.com/software/jira
[jira] [Commented] (TIKA-676) Boilerpipe fails
Posted by "Markus Jelsma (JIRA)" <ji...@apache.org>.
[ https://issues.apache.org/jira/browse/TIKA-676?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=13089385#comment-13089385 ]
Markus Jelsma commented on TIKA-676:
------------------------------------
I've asked Christian to push it to Maven Central, but he also pointed to the repo at Google Code:
http://boilerpipe.googlecode.com/svn/repo/de/l3s/boilerpipe/boilerpipe/
> Boilerpipe fails
> ----------------
>
> Key: TIKA-676
> URL: https://issues.apache.org/jira/browse/TIKA-676
> Project: Tika
> Issue Type: Bug
> Reporter: Gabriele Kahlout
> Priority: Minor
> Fix For: 1.0
>
>
> This is apparently a [boilerpipe issue |http://code.google.com/p/boilerpipe/issues/detail?id=24 ], they fixed in the [Web API edition | http://boilerpipe-web.appspot.com/].
> {code}
> $ curl --fail -L http://thisrecording.com/the-past | java -jar tika-app-0.9.jar -T
> % Total % Received % Xferd Average Speed Time Time Time Current
> Dload Upload Total Spent Left Speed
> 100 65688 0 65688 0 0 17650 0 --:--:-- 0:00:03 --:--:-- 18698Exception in thread "main" org.xml.sax.SAXException: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again
> 100 128k 0 128k 0 0 32019 0 --:--:-- 0:00:04 --:--:-- 33735
> at de.l3s.boilerpipe.sax.CommonTagActions$2.start(CommonTagActions.java:108)
> at de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler.startElement(BoilerpipeHTMLContentHandler.java:169)
> at org.apache.tika.parser.html.BoilerpipeContentHandler.startElement(BoilerpipeContentHandler.java:195)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:237)
> at org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:279)
> at org.apache.tika.parser.html.HtmlHandler.startElementWithSafeAttributes(HtmlHandler.java:197)
> at org.apache.tika.parser.html.HtmlHandler.startElement(HtmlHandler.java:135)
> at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
> at org.apache.tika.parser.html.XHTMLDowngradeHandler.startElement(XHTMLDowngradeHandler.java:61)
> at org.ccil.cowan.tagsoup.Parser.push(Parser.java:794)
> at org.ccil.cowan.tagsoup.Parser.rectify(Parser.java:1061)
> at org.ccil.cowan.tagsoup.Parser.stagc(Parser.java:1016)
> at org.ccil.cowan.tagsoup.HTMLScanner.scan(HTMLScanner.java:565)
> at org.ccil.cowan.tagsoup.Parser.parse(Parser.java:449)
> at org.apache.tika.parser.html.HtmlParser.parse(HtmlParser.java:198)
> at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:197)
> at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:197)
> at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:135)
> at org.apache.tika.cli.TikaCLI$OutputType.process(TikaCLI.java:107)
> at org.apache.tika.cli.TikaCLI.process(TikaCLI.java:288)
> at org.apache.tika.cli.TikaCLI.main(TikaCLI.java:94)
> {code}
--
This message is automatically generated by JIRA.
For more information on JIRA, see: http://www.atlassian.com/software/jira