Posted to hdfs-dev@hadoop.apache.org by Guo Leitao <le...@gmail.com> on 2013/12/20 14:32:56 UTC

Urgent help with HDFS editlog load error

We are using CDH3u4. The secondary NameNode has not been working for 8 months,
but we did not notice.

HDFS itself had been working fine, but I just restarted it and now the NameNode
will not start.
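(With hindsight, the stalled checkpoint was easy to detect. A minimal sketch is
below; the paths are taken from our setup and are only examples, and the
thresholds are arbitrary - adjust both to your dfs.name.dir and checkpoint
interval.)

    // Warn when the fsimage checkpoint is stale or edits.new keeps growing.
    // Paths and thresholds are examples only.
    import java.io.File;

    public class CheckpointCheck {
        public static void main(String[] args) {
            File fsimage  = new File("/data0/onest3/dfs/name/current/fsimage");
            File editsNew = new File("/data0/onest3/dfs/name/current/edits.new");

            // fsimage is only rewritten when a checkpoint completes, so a stale
            // mtime means the secondary NameNode has stopped checkpointing.
            long ageDays = (System.currentTimeMillis() - fsimage.lastModified())
                    / (24L * 60 * 60 * 1000);
            System.out.println("fsimage age: " + ageDays + " day(s)");
            if (ageDays > 1) {
                System.out.println("WARN: no recent checkpoint; check the secondary namenode");
            }

            // edits.new should only exist briefly during a checkpoint; a large
            // one that never goes away means the checkpoint is stuck.
            if (editsNew.exists()) {
                long mb = editsNew.length() / (1024L * 1024);
                System.out.println("edits.new size: " + mb + " MB");
                if (mb > 512) {
                    System.out.println("WARN: edits.new is large; checkpointing looks stalled");
                }
            }
        }
    }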

The logs are below; please help:

 bin/hadoop namenode
13/12/20 21:29:11 INFO namenode.NameNode: STARTUP_MSG:
/************************************************************
STARTUP_MSG: Starting NameNode
STARTUP_MSG:   host = NFJD-PSC-HP-SV001/172.17.0.4
STARTUP_MSG:   args = []
STARTUP_MSG:   version = 0.20.2-cdh3u4
STARTUP_MSG:   build =
git://ubuntu-slave01/var/lib/jenkins/workspace/CDH3u4-Full-RC/build/cdh3/hadoop20/0.20.2-cdh3u4/source
-r 214dd731e3bdb687cb55988d3f47dd9e248c5690; compiled by 'jenkins' on Mon
May  7 13:01:39 PDT 2012
************************************************************/
13/12/20 21:29:11 INFO jvm.JvmMetrics: Initializing JVM Metrics with
processName=NameNode, sessionId=null
13/12/20 21:29:11 INFO metrics.NameNodeMetrics: Initializing
NameNodeMeterics using context
object:org.apache.hadoop.metrics.spi.NullContext
13/12/20 21:29:11 INFO util.GSet: VM type       = 64-bit
13/12/20 21:29:11 INFO util.GSet: 2% max memory = 182.045 MB
13/12/20 21:29:11 INFO util.GSet: capacity      = 2^25 = 33554432 entries
13/12/20 21:29:11 INFO util.GSet: recommended=33554432, actual=33554432
13/12/20 21:29:11 INFO namenode.FSNamesystem: fsOwner=onest (auth:SIMPLE)
13/12/20 21:29:11 INFO namenode.FSNamesystem: supergroup=supergroup
13/12/20 21:29:11 INFO namenode.FSNamesystem: isPermissionEnabled=false
13/12/20 21:29:11 INFO namenode.FSNamesystem:
dfs.block.invalidate.limit=1000
13/12/20 21:29:11 WARN namenode.FSNamesystem: The dfs.support.append option
is in your configuration, however append is not supported. This
configuration option is no longer required to enable sync.
13/12/20 21:29:11 INFO namenode.FSNamesystem: isAccessTokenEnabled=false
accessKeyUpdateInterval=0 min(s), accessTokenLifetime=0 min(s)
13/12/20 21:29:12 INFO metrics.FSNamesystemMetrics: Initializing
FSNamesystemMetrics using context
object:org.apache.hadoop.metrics.spi.NullContext
13/12/20 21:29:12 INFO common.Storage: Number of files = 67471
13/12/20 21:29:14 INFO common.Storage: Number of files under construction =
60
13/12/20 21:29:14 INFO common.Storage: Image file of size 12900908 loaded
in 2 seconds.
...
...

13/12/20 21:29:14 ERROR common.Storage: Error replaying edit log at offset
104928
Recent opcode offsets: 104355 104524 104693 104702
java.lang.NullPointerException
        at
org.apache.hadoop.hdfs.server.namenode.FSDirectory.addChild(FSDirectory.java:1099)
        at
org.apache.hadoop.hdfs.server.namenode.FSDirectory.addChild(FSDirectory.java:1111)
        at
org.apache.hadoop.hdfs.server.namenode.FSDirectory.addNode(FSDirectory.java:1014)
        at
org.apache.hadoop.hdfs.server.namenode.FSDirectory.unprotectedAddFile(FSDirectory.java:217)
        at
org.apache.hadoop.hdfs.server.namenode.FSEditLog.loadFSEdits(FSEditLog.java:693)
        at
org.apache.hadoop.hdfs.server.namenode.FSImage.loadFSEdits(FSImage.java:1043)
        at
org.apache.hadoop.hdfs.server.namenode.FSImage.loadFSImage(FSImage.java:853)
        at
org.apache.hadoop.hdfs.server.namenode.FSImage.recoverTransitionRead(FSImage.java:383)
        at
org.apache.hadoop.hdfs.server.namenode.FSDirectory.loadFSImage(FSDirectory.java:110)
        at
org.apache.hadoop.hdfs.server.namenode.FSNamesystem.initialize(FSNamesystem.java:372)
        at
org.apache.hadoop.hdfs.server.namenode.FSNamesystem.<init>(FSNamesystem.java:335)
        at
org.apache.hadoop.hdfs.server.namenode.NameNode.initialize(NameNode.java:271)
        at
org.apache.hadoop.hdfs.server.namenode.NameNode.<init>(NameNode.java:467)
        at
org.apache.hadoop.hdfs.server.namenode.NameNode.createNameNode(NameNode.java:1330)
        at
org.apache.hadoop.hdfs.server.namenode.NameNode.main(NameNode.java:1339)
13/12/20 21:29:14 ERROR namenode.FSNamesystem: FSNamesystem initialization
failed.
java.io.IOException: Error replaying edit log at offset 104928
Recent opcode offsets: 104355 104524 104693 104702
        at
org.apache.hadoop.hdfs.server.namenode.MetaRecoveryContext.editLogLoaderPrompt(MetaRecoveryContext.java:84)
        at
org.apache.hadoop.hdfs.server.namenode.FSEditLog.loadFSEdits(FSEditLog.java:936)
        at
org.apache.hadoop.hdfs.server.namenode.FSImage.loadFSEdits(FSImage.java:1043)
        at
org.apache.hadoop.hdfs.server.namenode.FSImage.loadFSImage(FSImage.java:853)
        at
org.apache.hadoop.hdfs.server.namenode.FSImage.recoverTransitionRead(FSImage.java:383)
        at
org.apache.hadoop.hdfs.server.namenode.FSDirectory.loadFSImage(FSDirectory.java:110)
        at
org.apache.hadoop.hdfs.server.namenode.FSNamesystem.initialize(FSNamesystem.java:372)
        at
org.apache.hadoop.hdfs.server.namenode.FSNamesystem.<init>(FSNamesystem.java:335)
        at
org.apache.hadoop.hdfs.server.namenode.NameNode.initialize(NameNode.java:271)
        at
org.apache.hadoop.hdfs.server.namenode.NameNode.<init>(NameNode.java:467)
        at
org.apache.hadoop.hdfs.server.namenode.NameNode.createNameNode(NameNode.java:1330)
        at
org.apache.hadoop.hdfs.server.namenode.NameNode.main(NameNode.java:1339)
13/12/20 21:29:14 ERROR namenode.NameNode: java.io.IOException: Error
replaying edit log at offset 104928
Recent opcode offsets: 104355 104524 104693 104702
        at
org.apache.hadoop.hdfs.server.namenode.MetaRecoveryContext.editLogLoaderPrompt(MetaRecoveryContext.java:84)
        at
org.apache.hadoop.hdfs.server.namenode.FSEditLog.loadFSEdits(FSEditLog.java:936)
        at
org.apache.hadoop.hdfs.server.namenode.FSImage.loadFSEdits(FSImage.java:1043)
        at
org.apache.hadoop.hdfs.server.namenode.FSImage.loadFSImage(FSImage.java:853)
        at
org.apache.hadoop.hdfs.server.namenode.FSImage.recoverTransitionRead(FSImage.java:383)
        at
org.apache.hadoop.hdfs.server.namenode.FSDirectory.loadFSImage(FSDirectory.java:110)
        at
org.apache.hadoop.hdfs.server.namenode.FSNamesystem.initialize(FSNamesystem.java:372)
        at
org.apache.hadoop.hdfs.server.namenode.FSNamesystem.<init>(FSNamesystem.java:335)
        at
org.apache.hadoop.hdfs.server.namenode.NameNode.initialize(NameNode.java:271)
        at
org.apache.hadoop.hdfs.server.namenode.NameNode.<init>(NameNode.java:467)
        at
org.apache.hadoop.hdfs.server.namenode.NameNode.createNameNode(NameNode.java:1330)
        at
org.apache.hadoop.hdfs.server.namenode.NameNode.main(NameNode.java:1339)

13/12/20 21:29:14 INFO namenode.NameNode: SHUTDOWN_MSG:
/************************************************************
SHUTDOWN_MSG: Shutting down NameNode at NFJD-PSC-HP-SV001/172.17.0.4

Re: Urgent help with HDFS editlog load error

Posted by Guo Leitao <le...@gmail.com>.
We debugged the code and added a try/catch around the following code in
FSEditLog.java so that edit records which hit bad INodeFiles are skipped:



          INodeFile node = null;
          try {
            // Replay the record as before.
            node = (INodeFile) fsDir.unprotectedAddFile(path, permissions, blocks,
                replication, mtime, atime, blockSize);
          } catch (Exception e) {
            // Log and skip the corrupt record instead of aborting the replay.
            FSNamesystem.LOG.warn("INode error, " + e);
            break;
          }
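The break only leaves the current opcode's case, so the rest of the handling
for that record is skipped and the replay loop moves on to the next edit; that
is why loading no longer aborts at offset 104928.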



In the end we skipped 6,791 operations, most of them rename and setTimes (the
same guard applied to those cases is sketched after the log excerpt below), and
lost about one day of edit log data, but HBase's data was not corrupted. The
6.3 GB edits.new file was replayed into a 14 MB fsimage:



2013-12-21 09:02:31,525 INFO org.apache.hadoop.hdfs.server.namenode.FSNamesystem: Invalid opcode, reached end of edit log Number of transactions found: 55580653.  Bytes read: 6749947652
2013-12-21 09:02:31,526 INFO org.apache.hadoop.hdfs.server.common.Storage: Edits file /data0/onest3/dfs/name/current/edits.new of size 6750636687 edits # 55580653 loaded in 705 seconds.
2013-12-21 09:02:32,355 INFO org.apache.hadoop.hdfs.server.common.Storage: Image file of size 14802891 saved in 0 seconds.
2013-12-21 09:02:32,455 INFO org.apache.hadoop.hdfs.server.namenode.FSNamesystem: Finished loading FSImage in 710020 msecs
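Since rename and setTimes made up most of the skipped records, the same guard
has to cover those cases too. A minimal sketch is below; the
fsDir.unprotectedRenameTo and fsDir.unprotectedSetTimes calls and their
arguments are assumed from this code base, so reuse whatever the existing
OP_RENAME / OP_TIMES cases in loadFSEdits already pass in.

          try {
            // Same idea as the OP_ADD guard: skip a corrupt rename record
            // instead of aborting. src, dst and timestamp are whatever the
            // existing OP_RENAME case reads from the edit record.
            fsDir.unprotectedRenameTo(src, dst, timestamp);
          } catch (Exception e) {
            FSNamesystem.LOG.warn("Rename error, " + e);
            break;
          }

          try {
            // Same guard for OP_TIMES records; arguments as in the existing case.
            fsDir.unprotectedSetTimes(path, mtime, atime, force);
          } catch (Exception e) {
            FSNamesystem.LOG.warn("SetTimes error, " + e);
            break;
          }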



-Leitao



