You are viewing a plain text version of this content. The canonical link for it is here.
Posted to taglibs-dev@jakarta.apache.org by fe...@apache.org on 2004/03/24 05:18:17 UTC
cvs commit: jakarta-taglibs/scrape/xml intro.xml scrape.xml
felipeal 2004/03/23 20:18:17
Modified: scrape/src/org/apache/taglibs/scrape PageData.java
PageTag.java
scrape/xml intro.xml scrape.xml
Log:
added 'charset' attribute (see bug 24611) - thanks Ricardo Caetano for reporting this bug and sending the fix
Revision Changes Path
1.15 +22 -7 jakarta-taglibs/scrape/src/org/apache/taglibs/scrape/PageData.java
Index: PageData.java
===================================================================
RCS file: /home/cvs/jakarta-taglibs/scrape/src/org/apache/taglibs/scrape/PageData.java,v
retrieving revision 1.14
retrieving revision 1.15
diff -u -r1.14 -r1.15
--- PageData.java 29 Feb 2004 06:45:22 -0000 1.14
+++ PageData.java 24 Mar 2004 04:18:17 -0000 1.15
@@ -509,10 +509,11 @@
* @param time length of time to wait before rescrape
* @param proxy boolean value that says whether or not to use a proxy server
* @param pc PageContext for this JSP page
+ * @param cs charset to be used to scrape the page
*
*/
- public void scrapePage(String url, long time, PageContext pc) throws JspException
- {
+ public void scrapePage(String url, long time, PageContext pc, String cs)
+ throws JspException {
long currenttime = new Date().getTime(); // get the current time
// check to see if a scrape is needed
@@ -528,7 +529,7 @@
// create thread page if it doesn't exist check for a
// proxy connection
try {
- page = new Page(url, this, pc);
+ page = new Page(url, this, pc, cs);
/*if (pport != -1 && pserver != null)
page = new Page(url, this, pc, pport, pserver, auth);
//page = new Page(url, this, pc, pport, pserver, ssl, auth);
@@ -608,6 +609,8 @@
// boolean value determines if the connection is to travel via a secure
// connection
private boolean ssl = false;
+ // charset to be used to scrape the page
+ private String charset = null;
/**
* Constructor for Page
@@ -616,11 +619,12 @@
* @param page PageData object for the page to get scraped
* @param pc PageContext the taglibrary is running in used for logging
* @param secure boolean flag to determine if the connection is via http or https
+ * @param cs charset to be used to scrape the page
*
* @throws MalformedURLException -
*
*/
- Page(String url, PageData page, PageContext pc)
+ Page(String url, PageData page, PageContext pc, String cs)
//Page(String url, PageData page, PageContext pc, boolean secure)
throws MalformedURLException {
this.url = new URL(url);
@@ -630,6 +634,7 @@
this.url = new URL(url + "/");
pagedata = page;
pageContext = pc;
+ charset = cs;
//ssl = secure;
}
@@ -734,7 +739,7 @@
// read http request into buffer return value is false
// if an error occurred
- if (streamtochararray(connection.getInputStream())) {
+ if (streamtochararray(connection.getInputStream(),charset)) {
// perform the scrapes on this page
scrape();
}
@@ -758,12 +763,22 @@
* otherwise false
*
*/
- private boolean streamtochararray(InputStream in) {
+ private boolean streamtochararray(InputStream in, String charset) {
long sourcelength = 50000; // length of buffer inputstream is read into
StringBuffer temp; // buffer used to chop unused portion of source
boolean returnvalue = true; // no error in reading from input stream
// create a char stream from a byte stream
- InputStreamReader input = new InputStreamReader(in);
+ InputStreamReader input = null;
+ if ( charset == null ) {
+ input = new InputStreamReader(in);
+ } else {
+ try {
+ input = new InputStreamReader(in, charset);
+ } catch( UnsupportedEncodingException exc ) {
+ System.err.println( "WARNING: unsupported charset " + charset + ". Using default." );
+ input = new InputStreamReader(in);
+ }
+ }
boolean chop = false; // flag tells whether or not to truncate buffer
int offset = 0; // offset in the input stream to start reading from
int num; // number of chars read from the input stream
1.8 +18 -1 jakarta-taglibs/scrape/src/org/apache/taglibs/scrape/PageTag.java
Index: PageTag.java
===================================================================
RCS file: /home/cvs/jakarta-taglibs/scrape/src/org/apache/taglibs/scrape/PageTag.java,v
retrieving revision 1.7
retrieving revision 1.8
diff -u -r1.7 -r1.8
--- PageTag.java 29 Feb 2004 06:45:22 -0000 1.7
+++ PageTag.java 24 Mar 2004 04:18:17 -0000 1.8
@@ -75,6 +75,11 @@
* <required>false</required>
* <rtexprval>false</rtexprval>
* </attribute>
+ * <attribute>
+ * <name>charset</name>
+ * <required>false</required>
+ * <rtexprval>true</rtexprval>
+ * </attribute>
* </pre></p></p>
*
* @author Rich Catlett
@@ -108,6 +113,9 @@
private boolean ssl = false;
// the password to the client keystore for client side ssl authentication
private String sslpass = null;
+ // charset of the page being scraped
+ private String charset = null;
+
/**
* implementation of method from the tag interface that tells the JSP what
@@ -141,7 +149,7 @@
*/
public final int doEndTag() throws JspException {
// attempt to scrape from the page named by url
- pagedata.scrapePage(url, time, pageContext);
+ pagedata.scrapePage(url, time, pageContext, charset);
// put scrape results in the pagescope for access by result tag
putScrapes();
return EVAL_PAGE;
@@ -272,6 +280,15 @@
pagedata.setHeader(name, value);
}
+ /**
+ * set the value of the charset to be used
+ *
+ * @param value charset to be used to scrape the page
+ *
+ */
+ public final void setCharset(String value) {
+ charset = value;
+ }
/**
* method sets the scrapedata object in the hashmap scrapes in the
* application scope pagedata object
1.4 +1 -0 jakarta-taglibs/scrape/xml/intro.xml
Index: intro.xml
===================================================================
RCS file: /home/cvs/jakarta-taglibs/scrape/xml/intro.xml,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -r1.3 -r1.4
--- intro.xml 12 Mar 2004 04:14:12 -0000 1.3
+++ intro.xml 24 Mar 2004 04:18:17 -0000 1.4
@@ -70,6 +70,7 @@
<ul>
<li>Rich Catlett</li>
<li>Glenn Nielsen</li>
+ <li>Felipe Leme</li>
</ul>
</section>
1.6 +16 -0 jakarta-taglibs/scrape/xml/scrape.xml
Index: scrape.xml
===================================================================
RCS file: /home/cvs/jakarta-taglibs/scrape/xml/scrape.xml,v
retrieving revision 1.5
retrieving revision 1.6
diff -u -r1.5 -r1.6
--- scrape.xml 12 Mar 2004 04:14:12 -0000 1.5
+++ scrape.xml 24 Mar 2004 04:18:17 -0000 1.6
@@ -206,6 +206,16 @@
<availability>1.0</availability>
</attribute>
+ <attribute>
+ <name>charset</name>
+ <required>no</required>
+ <rtexprvalue>no</rtexprvalue>
+ <description>
+ Charset used by the scraped page. This attribute is useful when the page being scraped uses a different charset than the web server.
+ </description>
+ <availability>1.0</availability>
+ </attribute>
+
<example>
<usage>
<comment>
@@ -539,6 +549,12 @@
</tagtoc>
</taglib>
+
+<revision release="Nightly build" date="03/24/2004">
+ <description>
+ Added <code>charset</code> attribute (see bug 24611)
+ </description>
+</revision>
<revision release="Pre Beta" date="07/22/2001">
<description>
---------------------------------------------------------------------
To unsubscribe, e-mail: taglibs-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: taglibs-dev-help@jakarta.apache.org