You are viewing a plain text version of this content. The canonical link for it is here.
Posted to user@nutch.apache.org by 高睿 <ga...@163.com> on 2013/02/28 06:19:13 UTC

Hsql occupies so much memory with Nutch

Hi,

I set up Nutch 2.1 with Hsql 2.2.9. It works fine. The only problem now is that hsql occupies more and more memory, even though there are only 6233 records in the table 'webpage'.
I changed the size of the columns 'text' and 'content' from 65536 to 1000000 because there was a truncation error. I also have a plugin that extracts the article content from the html and assigns it to 'content', but I can't see that it impacts the size of the webpage.

Do you guys have any experience with this?

Earlier, I tried to integrate Nutch 2.1 with MySQL, but it failed with lots of exceptions. I don't know if there is a fix for this. Thanks.

sql> select count(*) from webpage;
6233

  PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND                            
 1358 root      20   0 1562m 527m 1912 S  0.0 31.9   4:01.87 java                                

# ps -ef | grep nutch
root      1358     1  0 Feb27 ?        00:04:01 java -cp lib/hsqldb.jar org.hsqldb.server.Server -port 544 --database.0 file:/data/hsqldb-2.2.9/nutch_production --dbname.0 nutch_production

[root@ip-10-150-115-207 hsqldb]# cat ../../apache-nutch-2.1/conf/gora-sql-mapping.xml
<?xml version="1.0" encoding="UTF-8"?>
<gora-orm>

<!-- Maps org.apache.nutch.storage.WebPage (keyed by java.lang.String) onto the
     SQL table "webpage". Each <field> binds one WebPage field name to a column;
     fields without a length attribute leave the column size unspecified here. -->
<class name="org.apache.nutch.storage.WebPage" keyClass="java.lang.String" table="webpage">
  <!-- Row key is stored in column "id" with a declared length of 512. -->
  <primarykey column="id" length="512"/>
    <field name="baseUrl" column="baseUrl" length="512"/>   
    <field name="status" column="status"/>
    <field name="prevFetchTime" column="prevFetchTime"/>
    <field name="fetchTime" column="fetchTime"/>
    <field name="fetchInterval" column="fetchInterval"/>
    <field name="retriesSinceFetch" column="retriesSinceFetch"/>
    <field name="reprUrl" column="reprUrl" length="512"/>
    <!-- length was raised from 65536 to 1000000 to work around a truncation
         error (see the message text above this listing).
         NOTE(review): whether the unit is characters or bytes is not shown here. -->
    <field name="content" column="content" length="1000000"/>
    <!-- The column is named "typ", not "contentType": the only field whose
         column name differs from its field name. -->
    <field name="contentType" column="typ" length="32"/>   
    <field name="protocolStatus" column="protocolStatus"/>
    <field name="modifiedTime" column="modifiedTime"/>

    <!-- parse fields                                       -->
    <field name="title" column="title" length="512"/>
    <!-- length was raised from 65536 to 1000000 together with "content". -->
    <field name="text" column="text" length="1000000"/>
    <field name="parseStatus" column="parseStatus"/>
    <field name="signature" column="signature"/>
    <field name="prevSignature" column="prevSignature"/>

    <!-- score fields                                       -->
    <field name="score" column="score"/>
    <!-- The remaining fields are not score values; all are declared without
         an explicit length. -->
    <field name="headers" column="headers"/>
    <field name="inlinks" column="inlinks"/>
    <field name="outlinks" column="outlinks"/>
    <field name="metadata" column="metadata"/>
    <field name="markers" column="markers"/>
</class>

<!-- Maps org.apache.nutch.storage.Host (keyed by java.lang.String) onto the
     SQL table "host". Only the key column declares a length; the three
     fields are mapped to same-named columns without an explicit length. -->
<class name="org.apache.nutch.storage.Host" keyClass="java.lang.String"
table="host">
  <!-- Row key is stored in column "id" with a declared length of 512. -->
  <primarykey column="id" length="512"/>
  <field name="metadata" column="metadata"/>
  <field name="inlinks" column="inlinks"/>
  <field name="outlinks" column="outlinks"/>
</class>

</gora-orm>