You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@flink.apache.org by "Niels Basjes (JIRA)" <ji...@apache.org> on 2016/08/25 14:04:21 UTC

[jira] [Commented] (FLINK-4485) Finished jobs in yarn session fill /tmp filesystem

    [ https://issues.apache.org/jira/browse/FLINK-4485?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15436917#comment-15436917 ] 

Niels Basjes commented on FLINK-4485:
-------------------------------------

I just reproduced the effect on a non-secure Yarn cluster.
After having run a few jobs I see this on the node where the jobmanager runs:

{code}
[root@node1 ~]# lsof | fgrep '/tmp/blobStore'
java      15358          yarn  mem       REG                8,3  70243224   25936270 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/cache/blob_501262b25ff9158ff07ee1f4264b5e3afeaaf69f
java      15358          yarn  DEL       REG                8,3             25936269 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000027
java      15358          yarn  DEL       REG                8,3             25936268 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000026
java      15358          yarn  DEL       REG                8,3             25936267 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000025
java      15358          yarn  DEL       REG                8,3             25936266 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000024
java      15358          yarn  DEL       REG                8,3             25936265 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000023
java      15358          yarn  DEL       REG                8,3             25936264 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000022
java      15358          yarn  DEL       REG                8,3             25936263 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000021
java      15358          yarn  DEL       REG                8,3             25936258 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000020
java      15358          yarn  DEL       REG                8,3             25936257 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000019
java      15358          yarn  DEL       REG                8,3             25936260 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000018
java      15358          yarn  DEL       REG                8,3             25936259 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000017
java      15358          yarn  DEL       REG                8,3             25936256 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000016
java      15358          yarn  DEL       REG                8,3             25936255 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000015
java      15358          yarn  DEL       REG                8,3             25936254 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000014
java      15358          yarn  DEL       REG                8,3             25936253 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000013
java      15358          yarn  DEL       REG                8,3             25936252 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000012
java      15358          yarn  DEL       REG                8,3             25936251 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000011
java      15358          yarn  DEL       REG                8,3             25936250 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000010
java      15358          yarn  DEL       REG                8,3             25936249 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000009
java      15358          yarn  DEL       REG                8,3             25936248 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000008
java      15358          yarn  DEL       REG                8,3             25936247 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000007
java      15358          yarn  DEL       REG                8,3             25936246 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000006
java      15358          yarn  DEL       REG                8,3             25936244 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000005
java      15358          yarn  DEL       REG                8,3             25936222 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000004
java      15358          yarn  DEL       REG                8,3             25936221 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000003
java      15358          yarn  DEL       REG                8,3             25936220 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000002
java      15358          yarn  DEL       REG                8,3             25936215 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000001
java      15358          yarn  422r      REG                8,3  70243224   25936222 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000004 (deleted)
java      15358          yarn  581u      REG                8,3  70243224   25936265 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000023 (deleted)
java      15358          yarn  582u      REG                8,3  70243224   25936267 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000025 (deleted)
java      15358          yarn  583r      REG                8,3  70243224   25936246 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000006 (deleted)
java      15358          yarn  584r      REG                8,3  70243224   25936215 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000001 (deleted)
java      15358          yarn  590u      REG                8,3  70243224   25936266 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000024 (deleted)
java      15358          yarn  591r      REG                8,3  70243224   25936220 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000002 (deleted)
java      15358          yarn  593r      REG                8,3  70243224   25936221 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000003 (deleted)
java      15358          yarn  594u      REG                8,3  70243224   25936268 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000026 (deleted)
java      15358          yarn  595u      REG                8,3  70243224   25936270 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/cache/blob_501262b25ff9158ff07ee1f4264b5e3afeaaf69f
java      15358          yarn  597r      REG                8,3  70243224   25936255 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000015 (deleted)
java      15358          yarn  598u      REG                8,3  70243224   25936269 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000027 (deleted)
java      15358          yarn  599r      REG                8,3  70243224   25936252 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000012 (deleted)
java      15358          yarn  600r      REG                8,3  70243224   25936250 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000010 (deleted)
java      15358          yarn  601r      REG                8,3  70243224   25936254 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000014 (deleted)
java      15358          yarn  602r      REG                8,3  70243224   25936244 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000005 (deleted)
java      15358          yarn  603r      REG                8,3  70243224   25936259 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000017 (deleted)
java      15358          yarn  604r      REG                8,3  70243224   25936248 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000008 (deleted)
java      15358          yarn  605r      REG                8,3  70243224   25936260 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000018 (deleted)
java      15358          yarn  607r      REG                8,3  70243224   25936257 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000019 (deleted)
java      15358          yarn  608r      REG                8,3  70243224   25936258 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000020 (deleted)
java      15358          yarn  609r      REG                8,3  70243224   25936263 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000021 (deleted)
java      15358          yarn  610r      REG                8,3  70243224   25936264 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000022 (deleted)
java      15358          yarn  613r      REG                8,3  70243224   25936247 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000007 (deleted)
java      15358          yarn  617r      REG                8,3  70243224   25936253 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000013 (deleted)
java      15358          yarn  618r      REG                8,3  70243224   25936251 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000011 (deleted)
java      15358          yarn  619r      REG                8,3  70243224   25936249 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000009 (deleted)
java      15358          yarn  631r      REG                8,3  70243224   25936256 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000016 (deleted)
java      15454          yarn  mem       REG                8,3  70243224   25936219 /tmp/blobStore-087a0b08-ee59-4d21-8523-c78a79984a4a/cache/blob_501262b25ff9158ff07ee1f4264b5e3afeaaf69f
java      15454          yarn  490r      REG                8,3  70243224   25936219 /tmp/blobStore-087a0b08-ee59-4d21-8523-c78a79984a4a/cache/blob_501262b25ff9158ff07ee1f4264b5e3afeaaf69f
{code}

The two process ids you see here are:
{code}yarn     15358  4.9  0.3 1362160 431128 ?      Sl   15:24   1:52  |       \_ /usr/lib/jvm/jre/bin/java -Xmx424M -Dlog.file=/var/log/hadoop-yarn/containers/application_1464009968005_2639/container_1464009968005_2639_01_000001/jobmanager.log -Dlogback.configurationFile=file:logback.xml -Dlog4j.configuration=file:log4j.properties org.apache.flink.yarn.YarnApplicationMasterRunner{code}

{code}yarn     15454 10.1  0.6 1306404 801228 ?      Sl   15:24   3:51          \_ /usr/lib/jvm/jre/bin/java -Xms424m -Xmx424m -XX:MaxDirectMemorySize=424m -Dlog.file=/var/log/hadoop-yarn/containers/application_1464009968005_2639/container_1464009968005_2639_01_000002/taskmanager.log -Dlogback.configurationFile=file:./logback.xml -Dlog4j.configuration=file:./log4j.properties org.apache.flink.yarn.YarnTaskManager --configDir .{code}





> Finished jobs in yarn session fill /tmp filesystem
> --------------------------------------------------
>
>                 Key: FLINK-4485
>                 URL: https://issues.apache.org/jira/browse/FLINK-4485
>             Project: Flink
>          Issue Type: Bug
>          Components: JobManager
>    Affects Versions: 1.1.0
>            Reporter: Niels Basjes
>            Priority: Blocker
>
> On a Yarn cluster I start a yarn-session with a few containers and task slots.
> Then I fire a 'large' number of Flink batch jobs in sequence against this yarn session. Each one is the exact same job (Java code), but it gets different parameters.
> In this scenario it is exporting HBase tables to files in HDFS and the parameters are about which data from which tables and the name of the target directory.
> After running several dozen jobs, job submission started to fail, so we investigated.
> We found that the cause was that on the Yarn node which was hosting the jobmanager the /tmp file system was full (4GB was 100% full).
> However, the output of {{du -hcs /tmp}} showed only 200MB in use.
> We found that a very large file (we guess it is the jar of the job) was put in /tmp, used, and deleted, yet the file handle was not closed by the jobmanager.
> As soon as we killed the jobmanager the disk space was freed.
> The summary of the impact of this is that a yarn-session that receives enough jobs brings down the Yarn node for all users.
> See parts of the output we got from {{lsof}} below.
> {code}
> COMMAND     PID      USER   FD      TYPE             DEVICE      SIZE       NODE NAME
> java      15034   nbasjes  550r      REG             253,17  66219695        245 /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000003 (deleted)
> java      15034   nbasjes  551r      REG             253,17  66219695        252 /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000007 (deleted)
> java      15034   nbasjes  552r      REG             253,17  66219695        267 /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000012 (deleted)
> java      15034   nbasjes  553r      REG             253,17  66219695        250 /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000005 (deleted)
> java      15034   nbasjes  554r      REG             253,17  66219695        288 /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000018 (deleted)
> java      15034   nbasjes  555r      REG             253,17  66219695        298 /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000025 (deleted)
> java      15034   nbasjes  557r      REG             253,17  66219695        254 /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000008 (deleted)
> java      15034   nbasjes  558r      REG             253,17  66219695        292 /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000019 (deleted)
> java      15034   nbasjes  559r      REG             253,17  66219695        275 /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000013 (deleted)
> java      15034   nbasjes  560r      REG             253,17  66219695        159 /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000002 (deleted)
> java      15034   nbasjes  562r      REG             253,17  66219695        238 /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000001 (deleted)
> java      15034   nbasjes  568r      REG             253,17  66219695        246 /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000004 (deleted)
> java      15034   nbasjes  569r      REG             253,17  66219695        255 /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000009 (deleted)
> java      15034   nbasjes  571r      REG             253,17  66219695        299 /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000026 (deleted)
> java      15034   nbasjes  572r      REG             253,17  66219695        293 /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000020 (deleted)
> java      15034   nbasjes  574r      REG             253,17  66219695        256 /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000010 (deleted)
> java      15034   nbasjes  575r      REG             253,17  66219695        302 /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000029 (deleted)
> java      15034   nbasjes  576r      REG             253,17  66219695        294 /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000021 (deleted)
> java      15034   nbasjes  577r      REG             253,17  66219695        262 /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000011 (deleted)
> java      15034   nbasjes  578r      REG             253,17  66219695        251 /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000006 (deleted)
> java      15034   nbasjes  580r      REG             253,17  66219695        295 /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000022 (deleted)
> java      15034   nbasjes  581r      REG             253,17  66219695        300 /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000027 (deleted)
> java      15034   nbasjes  582r      REG             253,17  66219695        188 /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/cache/blob_e318d1698aa6e7dc91e5f4a9f8ba29781aebd8c4 (deleted)
> java      15034   nbasjes  585r      REG             253,17  66219695        279 /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000014 (deleted)
> java      15034   nbasjes  586r      REG             253,17  66219695        296 /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000023 (deleted)
> java      15034   nbasjes  588r      REG             253,17  66219695        301 /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000028 (deleted)
> java      15034   nbasjes  589r      REG             253,17  66219695        297 /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000024 (deleted)
> java      15034   nbasjes  598r      REG             253,17  66219695        280 /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000015 (deleted)
> java      15034   nbasjes  601r      REG             253,17  66219695        289 /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000016 (deleted)
> java      15034   nbasjes  604r      REG             253,17  66219695        284 /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000017 (deleted)
> {code}



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)