You are viewing a plain text version of this content. The canonical link for it is here.
Posted to taglibs-dev@jakarta.apache.org by fe...@apache.org on 2004/03/24 05:18:17 UTC
cvs commit: jakarta-taglibs/scrape/xml intro.xml scrape.xml

felipeal    2004/03/23 20:18:17

  Modified:    scrape/src/org/apache/taglibs/scrape PageData.java
                        PageTag.java
               scrape/xml intro.xml scrape.xml
  Log:
  added 'charset' attribute (see bug 24611) - thanks Ricardo Caetano for reporting this bug and sending the fix
  
  Revision  Changes    Path
  1.15      +22 -7     jakarta-taglibs/scrape/src/org/apache/taglibs/scrape/PageData.java
  
  Index: PageData.java
  ===================================================================
  RCS file: /home/cvs/jakarta-taglibs/scrape/src/org/apache/taglibs/scrape/PageData.java,v
  retrieving revision 1.14
  retrieving revision 1.15
  diff -u -r1.14 -r1.15
  --- PageData.java	29 Feb 2004 06:45:22 -0000	1.14
  +++ PageData.java	24 Mar 2004 04:18:17 -0000	1.15
  @@ -509,10 +509,11 @@
        * @param time  length of time to wait before rescrape
        * @param proxy  boolean value that says whether or not to use a proxy server
        * @param pc  PageContext for this JSP page
  +     * @param cs charset to be used to scrape the page
        *
        */
  -   public void scrapePage(String url, long time, PageContext pc) throws JspException 
  -   {
  +   public void scrapePage(String url, long time, PageContext pc, String cs)
  +     throws JspException {
   	long currenttime = new Date().getTime();  // get the current time
   
   	// check to see if a scrape is needed
  @@ -528,7 +529,7 @@
                           // create thread page if it doesn't exist check for a
   			// proxy connection
   			try {
  -                            page = new Page(url, this, pc);
  +                            page = new Page(url, this, pc, cs);
   			    /*if (pport != -1 && pserver != null)
   				page = new Page(url, this, pc, pport, pserver, auth);
   			 //page = new Page(url, this, pc, pport, pserver, ssl, auth);
  @@ -608,6 +609,8 @@
       // boolean value determines if the connection is to travel via a secure
       // connection
       private boolean ssl = false;
  +    // charset to be used to scrape the page
  +    private String charset = null;
   
       /**
        * Constructor for Page
  @@ -616,11 +619,12 @@
        * @param page  PageData object for the page to get scraped
        * @param pc  PageContext the taglibrary is running in used for logging
        * @param secure boolean flag to determine if the connection is via http or https
  +     * @param cs charset to be used to scrape the page
        *
        * @throws MalformedURLException - 
        *
        */
  -    Page(String url, PageData page, PageContext pc) 
  +    Page(String url, PageData page, PageContext pc, String cs) 
   	//Page(String url, PageData page, PageContext pc, boolean secure) 
   	throws MalformedURLException {
   	this.url = new URL(url);
  @@ -630,6 +634,7 @@
               this.url = new URL(url + "/");
   	pagedata = page;
   	pageContext = pc;
  +    charset = cs;
   	//ssl = secure;
       }
   
  @@ -734,7 +739,7 @@
   
   			 // read http request into buffer return value is false
   			 // if an error occured
  -			 if (streamtochararray(connection.getInputStream())) {
  +			 if (streamtochararray(connection.getInputStream(),charset)) {
   			     // perform the scrapes on this page
   		       	     scrape();
   			 }
  @@ -758,12 +763,22 @@
        *         otherwise false
        *
        */
  -    private boolean streamtochararray(InputStream in) {
  +    private boolean streamtochararray(InputStream in, String charset) {
           long sourcelength = 50000; // length of buffer inputstream is read into
   	StringBuffer temp; // buffer used to chop unused portion of source
   	boolean returnvalue = true;  // no error in reading from input stream
   	// create a char stream from a byte stream
  -	InputStreamReader input = new InputStreamReader(in); 
  +	InputStreamReader input = null;
  +    if ( charset == null ) {
  +      input = new InputStreamReader(in);
  +    } else {
  +      try {
  +        input = new InputStreamReader(in, charset);
  +      } catch( UnsupportedEncodingException exc ) {
  +        System.err.println( "WARNING: unsupported charset " + charset + ". Using default." );
  +        input = new InputStreamReader(in);
  +      }
  +    }
   	boolean chop = false; // flag tells whether or not to truncate buffer
   	int offset = 0; // offset in the input stream to start reading from
   	int num; // number of chars read from the input stream
  
  
  
  1.8       +18 -1     jakarta-taglibs/scrape/src/org/apache/taglibs/scrape/PageTag.java
  
  Index: PageTag.java
  ===================================================================
  RCS file: /home/cvs/jakarta-taglibs/scrape/src/org/apache/taglibs/scrape/PageTag.java,v
  retrieving revision 1.7
  retrieving revision 1.8
  diff -u -r1.7 -r1.8
  --- PageTag.java	29 Feb 2004 06:45:22 -0000	1.7
  +++ PageTag.java	24 Mar 2004 04:18:17 -0000	1.8
  @@ -75,6 +75,11 @@
    *        &lt;required&gt;false&lt;/required&gt;
    *        &lt;rtexprval&gt;false&lt;/rtexprval&gt;
    * &lt;/attribute&gt;
  + * &lt;attribute&gt;
  + *        &lt;name&gt;charset&lt;/name&gt;
  + *        &lt;required&gt;false&lt;/required&gt;
  + *        &lt;rtexprval&gt;true&lt;/rtexprval&gt;
  + * &lt;/attribute&gt;
    * </pre></p></p>
    *
    * @author Rich Catlett
  @@ -108,6 +113,9 @@
       private boolean ssl = false;
       // the password to the client keystore for client side ssl authentication
       private String sslpass = null;
  +    // charset of the page scraped
  +    private String charset = null;
  +
   
       /**
        * implementation of method from the tag interface that tells the JSP what
  @@ -141,7 +149,7 @@
        */
       public final int doEndTag() throws JspException {
           // attempt to scrape from the page named by url
  -	pagedata.scrapePage(url, time, pageContext);
  +	pagedata.scrapePage(url, time, pageContext, charset);
   	// put scrape results in the pagescope for access by result tag
   	putScrapes();
   	return EVAL_PAGE;
  @@ -272,6 +280,15 @@
   	pagedata.setHeader(name, value);
       }
   
  +    /**
  +     * set the value of the charset to be used 
  +     *
  +     * @param value charset to be used to scrape the page
  +     *
  +     */
  +    public final void setCharset(String value) {
  +	charset = value;
  +    }
       /**
        * method sets the scrapedata object in the hashmap scrapes in the
        * application scope pagedata object
  
  
  
  1.4       +1 -0      jakarta-taglibs/scrape/xml/intro.xml
  
  Index: intro.xml
  ===================================================================
  RCS file: /home/cvs/jakarta-taglibs/scrape/xml/intro.xml,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- intro.xml	12 Mar 2004 04:14:12 -0000	1.3
  +++ intro.xml	24 Mar 2004 04:18:17 -0000	1.4
  @@ -70,6 +70,7 @@
     <ul>
     <li>Rich Catlett</li>
     <li>Glenn Nielsen</li>
  +  <li>Felipe Leme</li>
     </ul>
   
     </section>
  
  
  
  1.6       +16 -0     jakarta-taglibs/scrape/xml/scrape.xml
  
  Index: scrape.xml
  ===================================================================
  RCS file: /home/cvs/jakarta-taglibs/scrape/xml/scrape.xml,v
  retrieving revision 1.5
  retrieving revision 1.6
  diff -u -r1.5 -r1.6
  --- scrape.xml	12 Mar 2004 04:14:12 -0000	1.5
  +++ scrape.xml	24 Mar 2004 04:18:17 -0000	1.6
  @@ -206,6 +206,16 @@
           <availability>1.0</availability>
         </attribute>
   
  +      <attribute>
  +        <name>charset</name>
  +        <required>no</required>
  +        <rtexprvalue>no</rtexprvalue>
  +        <description>
  +          Charset used by the scraped page. This attribute is useful when the page being scraped uses a different charset than the web server.
  +	</description>
  +        <availability>1.0</availability>
  +      </attribute>
  +
         <example>
           <usage>
             <comment>
  @@ -539,6 +549,12 @@
     </tagtoc>
   
   </taglib>
  +
  +<revision release="Nightly build" date="03/24/2004">
  +  <description>
  +    Added <code>charset</code> attribute (see bug 24611)
  +  </description>
  +</revision>
   
   <revision release="Pre Beta" date="07/22/2001">
     <description>
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: taglibs-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: taglibs-dev-help@jakarta.apache.org