Hi all,
I'm trying to virtualize a Java based application using the OpenVZ framework. Unfortunately I'm running into a strange maximum-number-of-processes and/or threads problem that I'm currently unable to explain. Since I've run out of ideas about where to search for potential bottlenecks, I'd like to ask for some advice.
The machine (CentOS 5.4, 64bit, 64GB Ram, 2x Xeon X5450) is running ten containers as vzlist shows:
[root@office-vm01]# vzlist
CTID NPROC STATUS IP_ADDR HOSTNAME
4001 15 running 192.168.3.230 vdev-app01
4002 15 running 192.168.3.234 vdev-app02
4003 15 running 192.168.3.238 vdev-app03
4004 15 running 192.168.3.242 vdev-app04
4005 15 running 192.168.3.246 vdev-app05
4006 15 running 192.168.3.210 vdev-app06
4007 15 running 192.168.3.214 vdev-app07
4008 15 running 192.168.3.218 vdev-app08
4009 15 running 192.168.3.222 vdev-app09
4010 15 running 192.168.3.226 vdev-app10
Also /proc/user_beancounters looks healthy after the creation of the containers using the provided CentOS template from www.openvz.org:
4001: kmemsize 3548877 4540828 62914560000 69206016000 0
lockedpages 0 9333 16384 16384 0
privvmpages 8982 21381 2097152 2097152 0
shmpages 31 31 21504 21504 0
dummy 0 0 0 0 0
numproc 15 20 16384 16384 0
physpages 4581 11954 0 9223372036854775807 0
vmguarpages 0 0 1572864 9223372036854775807 0
oomguarpages 4581 11954 26112 9223372036854775807 0
numtcpsock 7 8 2048 2048 0
numflock 5 9 188 206 0
numpty 0 0 16 16 0
numsiginfo 0 3 256 256 0
tcpsndbuf 122304 0 3440640 8719680 0
tcprcvbuf 114688 754592 3440640 8719680 0
othersockbuf 11600 26160 3440640 8719680 0
dgramrcvbuf 0 8464 262144 262144 0
numothersock 13 21 360 360 0
dcachesize 0 0 71516160 71516160 0
numfile 499 635 186240 186240 0
dummy 0 0 0 0 0
dummy 0 0 0 0 0
dummy 0 0 0 0 0
numiptent 20 20 128 128 0
4006: kmemsize 3544424 4458820 62914560000 69206016000 0
lockedpages 0 9333 16384 16384 0
privvmpages 9622 21228 2097152 2097152 0
shmpages 31 31 21504 21504 0
dummy 0 0 0 0 0
numproc 15 20 16384 16384 0
physpages 4584 11954 0 9223372036854775807 0
vmguarpages 0 0 1572864 9223372036854775807 0
oomguarpages 4584 11954 26112 9223372036854775807 0
numtcpsock 7 8 2048 2048 0
numflock 5 9 188 206 0
numpty 0 0 16 16 0
numsiginfo 0 3 256 256 0
tcpsndbuf 122304 0 3440640 8719680 0
tcprcvbuf 114688 875600 3440640 8719680 0
othersockbuf 11600 25376 3440640 8719680 0
dgramrcvbuf 0 8464 262144 262144 0
numothersock 13 21 360 360 0
dcachesize 0 0 71516160 71516160 0
numfile 499 640 186240 186240 0
dummy 0 0 0 0 0
dummy 0 0 0 0 0
dummy 0 0 0 0 0
numiptent 20 20 128 128 0
The limits were changed to match the needs of the Java application, which will need ~5.5GB of RAM, so the guaranteed memory was set to ~6GB and the dynamic RAM to ~8GB.
I've picked containers 4001 and 4006 on purpose, since 4006 is the container in which I'll run into the process and/or thread limit later. Here's the configuration file of container 4001:
[root@office-vm01 openVZ]# cat /etc/vz/conf/4001.conf
ONBOOT="yes"
# UBC parameters (in form of barrier:limit)
KMEMSIZE="62914560000:69206016000"
LOCKEDPAGES="16384:16384"
PRIVVMPAGES="2097152:2097152"
SHMPAGES="21504:21504"
NUMPROC="16384:16384"
PHYSPAGES="0:9223372036854775807"
VMGUARPAGES="1572864:9223372036854775807"
OOMGUARPAGES="26112:9223372036854775807"
NUMTCPSOCK="2048:2048"
NUMFLOCK="188:206"
NUMPTY="16:16"
NUMSIGINFO="256:256"
TCPSNDBUF="3440640:8719680"
TCPRCVBUF="3440640:8719680"
OTHERSOCKBUF="3440640:8719680"
DGRAMRCVBUF="262144:262144"
NUMOTHERSOCK="360:360"
DCACHESIZE="71516160:71516160"
NUMFILE="186240:186240"
AVNUMPROC="180:180"
NUMIPTENT="128:128"
# Disk quota parameters (in form of softlimit:hardlimit)
DISKSPACE="20971520:20971520"
DISKINODES="200000:220000"
QUOTATIME="0"
# CPU fair sheduler parameter
CPUUNITS="10000"
IP_ADDRESS="192.168.3.230 192.168.3.231 192.168.3.232 192.168.3.233"
HOSTNAME="vdev-app01"
VE_ROOT="/vz/root/$VEID"
VE_PRIVATE="/vz/private/$VEID"
OSTEMPLATE="centos-5.4-x86_64"
ORIGIN_SAMPLE="vps.basic"
NAME="vdev01"
NAMESERVER="192.168.0.227 192.168.0.236"
NOATIME="yes"
Container 4006 has the exact same configuration except for the IP_ADDRESS variable. After I deployed the Java application to all ten containers, I sometimes see a failure count for 'othersockbuf' that I can't explain, since the 'maxheld' value was never as high as the 'barrier' and/or the 'limit' value:
4001: kmemsize 3614323 5913272 62914560000 69206016000 0
lockedpages 0 9333 16384 16384 0
privvmpages 8987 21381 2097152 2097152 0
shmpages 31 687 21504 21504 0
dummy 0 0 0 0 0
numproc 15 22 16384 16384 0
physpages 4599 11954 0 9223372036854775807 0
vmguarpages 0 0 1572864 9223372036854775807 0
oomguarpages 4599 11954 26112 9223372036854775807 0
numtcpsock 7 11 2048 2048 0
numflock 5 9 188 206 0
numpty 0 0 16 16 0
numsiginfo 0 3 256 256 0
tcpsndbuf 122304 0 3440640 8719680 0
tcprcvbuf 114688 754592 3440640 8719680 0
othersockbuf 11600 1083344 3440640 8719680 0
dgramrcvbuf 0 8464 262144 262144 0
numothersock 13 29 360 360 0
dcachesize 0 0 71516160 71516160 0
numfile 511 697 186240 186240 0
dummy 0 0 0 0 0
dummy 0 0 0 0 0
dummy 0 0 0 0 0
numiptent 20 20 128 128 0
4006: kmemsize 3572403 5434118 62914560000 69206016000 0
lockedpages 0 9333 16384 16384 0
privvmpages 9627 21228 2097152 2097152 0
shmpages 31 687 21504 21504 0
dummy 0 0 0 0 0
numproc 15 22 16384 16384 0
physpages 4603 11954 0 9223372036854775807 0
vmguarpages 0 0 1572864 9223372036854775807 0
oomguarpages 4603 11954 26112 9223372036854775807 0
numtcpsock 7 11 2048 2048 0
numflock 5 9 188 206 0
numpty 0 0 16 16 0
numsiginfo 0 3 256 256 0
tcpsndbuf 122304 0 3440640 8719680 0
tcprcvbuf 114688 875600 3440640 8719680 0
othersockbuf 11600 2265168 3440640 8719680 1
dgramrcvbuf 0 8464 262144 262144 0
numothersock 13 29 360 360 0
dcachesize 0 0 71516160 71516160 0
numfile 512 699 186240 186240 0
dummy 0 0 0 0 0
dummy 0 0 0 0 0
dummy 0 0 0 0 0
...