You are viewing a plain text version of this content. The canonical link for it is here.
Posted to user@nutch.apache.org by 高睿 <ga...@163.com> on 2013/02/28 06:19:13 UTC
Hsql occupy so much memory with Nutch
Hi,
I set up Nutch 2.1 with Hsql 2.2.9. It works fine. The only problem now is that hsql occupies more and more memory, even though there are only 6233 records in the table 'webpage'.
I changed the size of the columns 'text' and 'content' from 65536 to 1000000 because there was a truncation error. I also have a plugin that extracts the article content from the HTML and assigns it to 'content', but I can't see that it impacts the size of the webpage.
Do you guys have any experience with this?
Earlier, I tried to integrate Nutch 2.1 with MySQL, but it failed with lots of exceptions. I don't know if there is a fix for this. Thanks.
sql> select count(*) from webpage;
6233
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
1358 root 20 0 1562m 527m 1912 S 0.0 31.9 4:01.87 java
# ps -ef | grep nutch
root 1358 1 0 Feb27 ? 00:04:01 java -cp lib/hsqldb.jar org.hsqldb.server.Server -port 544 --database.0 file:/data/hsqldb-2.2.9/nutch_production --dbname.0 nutch_production
[root@ip-10-150-115-207 hsqldb]# cat ../../apache-nutch-2.1/conf/gora-sql-mapping.xml
<?xml version="1.0" encoding="UTF-8"?>
<gora-orm>
<!-- Maps org.apache.nutch.storage.WebPage (key class java.lang.String) to the table "webpage". -->
<class name="org.apache.nutch.storage.WebPage" keyClass="java.lang.String" table="webpage">
<!-- Key column "id", declared with length 512. -->
<primarykey column="id" length="512"/>
<!-- fetch fields: each field is stored in the column named by its "column" attribute. -->
<field name="baseUrl" column="baseUrl" length="512"/>
<field name="status" column="status"/>
<field name="prevFetchTime" column="prevFetchTime"/>
<field name="fetchTime" column="fetchTime"/>
<field name="fetchInterval" column="fetchInterval"/>
<field name="retriesSinceFetch" column="retriesSinceFetch"/>
<field name="reprUrl" column="reprUrl" length="512"/>
<!-- length is 1000000 here; per the accompanying message it was raised from 65536 after a truncation error. -->
<field name="content" column="content" length="1000000"/>
<!-- Note: the column is named "typ", not "contentType"; every other field here uses its own name as the column name. -->
<field name="contentType" column="typ" length="32"/>
<field name="protocolStatus" column="protocolStatus"/>
<field name="modifiedTime" column="modifiedTime"/>
<!-- parse fields -->
<field name="title" column="title" length="512"/>
<!-- length is 1000000 here; per the accompanying message it was raised from 65536 after a truncation error. -->
<field name="text" column="text" length="1000000"/>
<field name="parseStatus" column="parseStatus"/>
<field name="signature" column="signature"/>
<field name="prevSignature" column="prevSignature"/>
<!-- score fields -->
<field name="score" column="score"/>
<!-- The remaining fields declare no explicit length. -->
<field name="headers" column="headers"/>
<field name="inlinks" column="inlinks"/>
<field name="outlinks" column="outlinks"/>
<field name="metadata" column="metadata"/>
<field name="markers" column="markers"/>
</class>
<!-- Maps org.apache.nutch.storage.Host (key class java.lang.String) to the table "host". -->
<class name="org.apache.nutch.storage.Host" keyClass="java.lang.String"
table="host">
<!-- Key column "id", declared with length 512. -->
<primarykey column="id" length="512"/>
<!-- None of the fields below declares an explicit length. -->
<field name="metadata" column="metadata"/>
<field name="inlinks" column="inlinks"/>
<field name="outlinks" column="outlinks"/>
</class>
</gora-orm>