-
Notifications
You must be signed in to change notification settings - Fork 61
Description
报错日志:
[W416 15:37:25.363094293 socket.cpp:933] [c10d] The server socket on [::ffff:36.xxx.xxx.13]:40613 has timed out, will retry.
[W416 15:39:40.531073896 socket.cpp:933] [c10d] The server socket on [::ffff:36.xxx.xxx.13]:40613 has timed out, will retry.
[E416 15:40:10.861069241 socket.cpp:1023] [c10d] The client socket has timed out after 300000ms while trying to connect to (mo.sy.cn, 40613).
12/site-packages/torch/lib/libtorch_cpu.so)
frame #7: + 0xe48b73 (0x7b4d37248b73 in /home/xkxk/anaconda3/envs/llmc/lib/python3.12/site-packages/torch/lib/libtorch_python.so)
frame #8: + 0x51a21a (0x7b4d3691a21a in /home/xkxk/anaconda3/envs/llmc/lib/python3.12/site-packages/torch/lib/libtorch_python.so)
frame #9: /home/xkxk/anaconda3/envs/llmc/bin/python() [0x54d2d4]
frame #10: _PyObject_MakeTpCall + 0x2fb (0x51e38b in /home/xkxk/anaconda3/envs/llmc/bin/python)
frame #11: /home/xkxk/anaconda3/envs/llmc/bin/python() [0x578dbd]
frame #12: _PyObject_Call + 0x122 (0x55e162 in /home/xkxk/anaconda3/envs/llmc/bin/python)
frame #13: /home/xkxk/anaconda3/envs/llmc/bin/python() [0x55b227]
frame #14: /home/xkxk/anaconda3/envs/llmc/bin/python() [0x51eaab]
frame #15: + 0x5188eb (0x7b4d369188eb in /home/xkxk/anaconda3/envs/llmc/lib/python3.12/site-packages/torch/lib/libtorch_python.so)
frame #16: _PyObject_MakeTpCall + 0x2fb (0x51e38b in /home/xkxk/anaconda3/envs/llmc/bin/python)
frame #17: _PyEval_EvalFrameDefault + 0x6ce (0x528ede in /home/xkxk/anaconda3/envs/llmc/bin/python)
frame #18: _PyObject_FastCallDictTstate + 0x1e7 (0x520f07 in /home/xkxk/anaconda3/envs/llmc/bin/python)
frame #19: _PyObject_Call_Prepend + 0xe0 (0x55b540 in /home/xkxk/anaconda3/envs/llmc/bin/python)
frame #20: /home/xkxk/anaconda3/envs/llmc/bin/python() [0x630ff6]
frame #21: _PyObject_Call + 0xb5 (0x55e0f5 in /home/xkxk/anaconda3/envs/llmc/bin/python)
frame #22: _PyEval_EvalFrameDefault + 0x503a (0x52d84a in /home/xkxk/anaconda3/envs/llmc/bin/python)
frame #23: PyEval_EvalCode + 0xae (0x5e581e in /home/xkxk/anaconda3/envs/llmc/bin/python)
frame #24: /home/xkxk/anaconda3/envs/llmc/bin/python() [0x60bfd7]
frame #25: /home/xkxk/anaconda3/envs/llmc/bin/python() [0x6071c7]
frame #26: /home/xkxk/anaconda3/envs/llmc/bin/python() [0x61f452]
frame #27: _PyRun_SimpleFileObject + 0x1b0 (0x61ed90 in /home/xkxk/anaconda3/envs/llmc/bin/python)
frame #28: _PyRun_AnyFileObject + 0x43 (0x61eb83 in /home/xkxk/anaconda3/envs/llmc/bin/python)
frame #29: Py_RunMain + 0x303 (0x617c93 in /home/xkxk/anaconda3/envs/llmc/bin/python)
frame #30: Py_BytesMain + 0x39 (0x5d03c9 in /home/xkxk/anaconda3/envs/llmc/bin/python)
frame #31: + 0x2a1ca (0x7b4d4082a1ca in /usr/lib/x86_64-linux-gnu/libc.so.6)
frame #32: __libc_start_main + 0x8b (0x7b4d4082a28b in /usr/lib/x86_64-linux-gnu/libc.so.6)
frame #33: /home/xkxk/anaconda3/envs/llmc/bin/python() [0x5d01f9]
Traceback (most recent call last):
File "/home/xkxk/anaconda3/envs/llmc/bin/torchrun", line 8, in <module>
sys.exit(main())
^^^^^^
File "/home/xkxk/anaconda3/envs/llmc/lib/python3.12/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
return f(*args, **kwargs)
^^^^^^^^^^^^^^^^^^
File "/home/xkxk/anaconda3/envs/llmc/lib/python3.12/site-packages/torch/distributed/run.py", line 918, in main
run(args)
File "/home/xkxk/anaconda3/envs/llmc/lib/python3.12/site-packages/torch/distributed/run.py", line 909, in run
elastic_launch(
File "/home/xkxk/anaconda3/envs/llmc/lib/python3.12/site-packages/torch/distributed/launcher/api.py", line 138, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/xkxk/anaconda3/envs/llmc/lib/python3.12/site-packages/torch/distributed/launcher/api.py", line 260, in launch_agent
result = agent.run()
^^^^^^^^^^^
File "/home/xkxk/anaconda3/envs/llmc/lib/python3.12/site-packages/torch/distributed/elastic/metrics/api.py", line 137, in wrapper
result = f(*args, **kwargs)
^^^^^^^^^^^^^^^^^^
File "/home/xkxk/anaconda3/envs/llmc/lib/python3.12/site-packages/torch/distributed/elastic/agent/server/api.py", line 711, in run
result = self._invoke_run(role)
^^^^^^^^^^^^^^^^^^^^^^
File "/home/xkxk/anaconda3/envs/llmc/lib/python3.12/site-packages/torch/distributed/elastic/agent/server/api.py", line 864, in _invoke_run
self._initialize_workers(self._worker_group)
File "/home/xkxk/anaconda3/envs/llmc/lib/python3.12/site-packages/torch/distributed/elastic/metrics/api.py", line 137, in wrapper
result = f(*args, **kwargs)
^^^^^^^^^^^^^^^^^^
File "/home/xkxk/anaconda3/envs/llmc/lib/python3.12/site-packages/torch/distributed/elastic/agent/server/api.py", line 683, in _initialize_workers
self._rendezvous(worker_group)
File "/home/xkxk/anaconda3/envs/llmc/lib/python3.12/site-packages/torch/distributed/elastic/metrics/api.py", line 137, in wrapper
result = f(*args, **kwargs)
^^^^^^^^^^^^^^^^^^
File "/home/xkxk/anaconda3/envs/llmc/lib/python3.12/site-packages/torch/distributed/elastic/agent/server/api.py", line 500, in _rendezvous
rdzv_info = spec.rdzv_handler.next_rendezvous()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/xkxk/anaconda3/envs/llmc/lib/python3.12/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1202, in next_rendezvous
self._shared_tcp_store_server = self._create_tcp_store_server(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/xkxk/anaconda3/envs/llmc/lib/python3.12/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1118, in _create_tcp_store_server
return dist.TCPStore(
^^^^^^^^^^^^^^
torch.distributed.DistNetworkError: The client socket has timed out after 300000ms while trying to connect to (mo.sy.cn, 40613).
运行脚本
# Root of the llmc checkout; the config file and entry script are resolved
# relative to it.
llmc=/data/app/llmc

# Prepend llmc to PYTHONPATH. The ${VAR:+...} form appends the previous value
# only when it is set and non-empty, so an unset PYTHONPATH no longer leaves a
# trailing ":" (an empty search-path entry) behind.
export PYTHONPATH="${llmc}${PYTHONPATH:+:${PYTHONPATH}}"

# task_name is used for the log file name; config is passed to main.py.
task_name=smoothquant_w_a
config="${llmc}/configs/quantization/methods/SmoothQuant/smoothquant_w_a.yml"

# Two GPUs are exposed to the job; nproc_per_node below is set to 2 as well.
export CUDA_VISIBLE_DEVICES=2,3
export OMP_NUM_THREADS=1

nnodes=1
#nproc_per_node=1
nproc_per_node=2
#######################################
# Print a random port in 10000-60000 that `ss -tuln` does not currently list.
# Arguments: none
# Outputs:   the chosen port number on stdout
# Returns:   0 once a candidate is found; 1 if `shuf` fails
# Note: this is a check-then-use probe, so another process can still take the
#       port between this call and the moment the caller binds it.
#######################################
find_unused_port() {
  # Keep the candidate local so a direct (non-subshell) call does not
  # overwrite a caller variable named "port".
  local port
  while true; do
    port=$(shuf -i 10000-60000 -n 1) || return 1
    # The trailing space in the pattern anchors the match to the end of the
    # port field so e.g. 1234 does not match 12345.
    if ! ss -tuln | grep -q ":${port} "; then
      printf '%s\n' "$port"
      return 0
    fi
  done
}
# One free port is chosen and reused as both the rendezvous port and the
# task / rendezvous id.
UNUSED_PORT=$(find_unused_port)
MASTER_ADDR=127.0.0.1
MASTER_PORT=$UNUSED_PORT
task_id=$UNUSED_PORT

# Disabled alternatives kept from the original script (they must stay outside
# the command below: a comment line in the middle of a continued command ends
# it early):
#nohup
#> ${task_name}.log 2>&1 | tee ${task_name}.log &
#
# NOTE(review): the timeout in the log above is against the host's own name
# rather than MASTER_ADDR; torchrun's --local_addr option may be relevant —
# confirm against the installed torch version before adding it.
#
# Every line of the command ends with a backslash so the options are passed to
# torchrun instead of being executed as separate commands. Output (stdout and
# stderr) is shown on the terminal and copied to ${task_name}.log; the whole
# pipeline runs in the background.
torchrun \
  --nnodes "$nnodes" \
  --nproc_per_node "$nproc_per_node" \
  --rdzv_id "$task_id" \
  --rdzv_backend c10d \
  --rdzv_endpoint "${MASTER_ADDR}:${MASTER_PORT}" \
  "${llmc}/llmc/main.py" --config "$config" --task_id "$task_id" \
  2>&1 | tee "${task_name}.log" &