# Data loading image
FROM nvcr.io/nvidia/pytorch:25.06-py3
# Replace the stock "ubuntu" account with a "user" account (UID 1000, home created)
# and pre-create an empty /home/jovyan owned by that user (platform-mounted workdir).
RUN userdel ubuntu && useradd -u 1000 -m user --shell /bin/bash \
&& mkdir -p /home/jovyan && chown user:user /home/jovyan
# Install the OpenSSH server so MPI can connect between containers,
# clean the apt lists in the same layer (keeps the layer slim),
# create the sshd runtime dir and pre-generate the host keys.
RUN apt-get update \
&& apt-get install -y --no-install-recommends openssh-server \
&& rm -rf /var/lib/apt/lists/* \
&& mkdir -p /var/run/sshd \
&& ssh-keygen -A
# SSH client: skip host-key confirmation so MPI can reach peer containers unattended.
# SSH server: disable PAM, accept ssh-rsa pubkeys, listen on 2222 (unprivileged port,
# so sshd can be started by "user"), and keep the pid file under /run/sshd.
# Host keys must be 0600 for sshd to accept them; ownership is handed to "user"
# because sshd runs unprivileged in this image — NOTE(review): confirm this is
# the intended deployment model before tightening permissions.
RUN echo "StrictHostKeyChecking no" >> /etc/ssh/ssh_config.d/mlspace.conf \
&& echo "UsePAM no" > /etc/ssh/sshd_config.d/mlspace.conf \
&& echo "PubkeyAcceptedAlgorithms +ssh-rsa" >> /etc/ssh/sshd_config.d/mlspace.conf \
&& echo "Port 2222" >> /etc/ssh/sshd_config.d/mlspace.conf \
&& echo "PidFile /run/sshd/sshd.pid" >> /etc/ssh/sshd_config.d/mlspace.conf \
&& chmod 600 /etc/ssh/ssh_host_* \
&& chown user:user /etc/ssh/ssh_host_* \
&& chown user:user /run/sshd
# screen: terminal multiplexer for long-running interactive jobs;
# libaio-dev: async-I/O headers required by DeepSpeed-style offloading builds.
# Clean the apt caches in the SAME layer — a later `RUN apt-get clean`
# cannot shrink an already-committed layer.
RUN apt-get update \
&& apt-get install -y --no-install-recommends libaio-dev screen \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
# Build the NVIDIA NCCL tests with MPI support for this container's toolchain;
# binaries land in /opt/nccl-tests, the source tree is removed to keep the layer slim.
# --depth 1 avoids pulling the full history of the repo.
RUN cd /workspace \
&& git clone --depth 1 https://github.com/NVIDIA/nccl-tests.git \
&& cd nccl-tests \
&& make MPI=1 MPI_HOME=/opt/hpcx/ompi/ CUDA_HOME=/usr/local/cuda NCCL_HOME=/usr/lib/x86_64-linux-gnu/ \
&& mv build /opt/nccl-tests \
&& cd /workspace \
&& rm -rf nccl-tests
# Install prefix for the NVSHMEM build below (also exported for runtime via profile.d).
ARG NVSHMEM_DIR=/home/user/nvshmem
####
# All actions that must run on behalf of the unprivileged user belong in the
# section below the USER directive. Keep in mind that variables exported in
# earlier sections are NOT visible there, so every required export must be
# repeated in that section.
# NOTE(review): the next directive is `USER root`, not `USER user` — the actual
# switch to "user" only happens further down; confirm this ordering is intended.
####
WORKDIR /home/user
USER root
# Flash-Attention: drop any preinstalled wheel, then build the pinned release
# from source, including the Hopper (sm_90) kernels and the fused
# layer-norm / dense extensions.
ENV FLASH_ATTENTION_VERSION=2.7.2
RUN pip uninstall -y flash-attn
# Shallow single-branch clone of the exact release tag into an explicit target dir.
RUN git clone -b v${FLASH_ATTENTION_VERSION} --single-branch --depth 1 \
      https://github.com/Dao-AILab/flash-attention.git /home/user/flash-attention
# Pinned build tooling expected by flash-attention's setup scripts.
RUN pip install --no-cache-dir packaging==24.1 setuptools==69.5.1
WORKDIR /home/user/flash-attention
# MAX_JOBS caps parallel nvcc invocations to keep build memory bounded.
RUN MAX_JOBS=32 python setup.py install
# Hopper-specific kernels.
RUN cd hopper && python setup.py install
RUN cd csrc/layer_norm && pip install --no-cache-dir .
RUN cd csrc/fused_dense_lib && pip install --no-cache-dir .
WORKDIR /home/user
###############
# GDRCopy
###############
ARG GDRCOPY_HOME=/usr/local/gdrcopy
# DKMS kernel-module support needed by gdrcopy; apt lists removed in the same layer.
RUN apt-get update \
&& apt-get install -y --no-install-recommends nvidia-dkms-570 \
&& rm -rf /var/lib/apt/lists/*
# Replace the distro-provided gdrcopy packages with v2.5 built from source;
# tarball and source tree are deleted in the same layer.
RUN dpkg -r libgdrapi gdrcopy && \
wget https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v2.5.tar.gz && \
tar -zxvf v2.5.tar.gz && cd gdrcopy-2.5 && \
make prefix=${GDRCOPY_HOME} install -j$(nproc) && \
cd .. && rm v2.5.tar.gz && rm -rf gdrcopy-2.5
###############
# DeepEP
###############
# CUDA_ARCH can be "sm_80" or "sm_90" (must match the comparison in the install step).
# For sm_80: FP8, launch methods, TMA (Tensor Memory Access), and all
# internode / low-latency NVSHMEM features are disabled.
ARG CUDA_ARCH="sm_90"
# Pin DeepEP to an exact commit and apply the local cluster-tuning patch.
ARG DEEPEP_COMMIT=174c209fec3f03db86139e2a74150b6bf3447332
COPY configs.patch /tmp/configs.patch
RUN git clone https://github.com/deepseek-ai/DeepEP.git && \
cd DeepEP && \
git checkout ${DEEPEP_COMMIT} && \
git apply /tmp/configs.patch && \
cd ..
# NVSHMEM 3.2.5, patched for DeepEP: IBGDA transport + GDRCopy enabled,
# every other transport/bootstrap (SHMEM, UCX, NCCL, MPI, PMIX) disabled.
RUN wget https://developer.nvidia.com/downloads/assets/secure/nvshmem/nvshmem_src_3.2.5-1.txz && \
tar xvf nvshmem_src_3.2.5-1.txz && \
cd nvshmem_src && git apply /home/user/DeepEP/third-party/nvshmem.patch && \
# sed -i for commenting warnings from ibgda about cpu fallbacks
# NOTE(review): the line ranges below are tied to this exact 3.2.5-1 tarball —
# re-verify them if the NVSHMEM version is ever bumped.
sed -i '968,971 s/^/\/\/ /' src/modules/transport/ibgda/ibgda.cpp && \
sed -i '1815,1816 s/^/\/\/ /' src/modules/transport/ibgda/ibgda.cpp && \
sed -i '3468,3470 s/^/\/\/ /' src/modules/transport/ibgda/ibgda.cpp && \
NVSHMEM_SHMEM_SUPPORT=0 \
NVSHMEM_UCX_SUPPORT=0 \
NVSHMEM_USE_NCCL=0 \
NVSHMEM_MPI_SUPPORT=0 \
NVSHMEM_IBGDA_SUPPORT=1 \
NVSHMEM_PMIX_SUPPORT=0 \
NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
NVSHMEM_USE_GDRCOPY=1 \
cmake -S . -B build/ \
-DCMAKE_INSTALL_PREFIX=${NVSHMEM_DIR} \
-DNVSHMEM_BUILD_EXAMPLES=OFF \
-DNVSHMEM_BUILD_NVSHMEM4PY=OFF \
-DMLX5_lib=/lib/x86_64-linux-gnu/libmlx5.so.1 \
-DCMAKE_CUDA_ARCHITECTURES=90 && \
cmake --build build --target install --parallel $(nproc) && \
cd .. && \
rm -rf nvshmem_src*
# Install DeepEP into the system Python. sm_90 keeps all Hopper features;
# any other CUDA_ARCH value falls back to sm_80 with SM90 features disabled
# (see the ARG CUDA_ARCH comment above).
RUN cd DeepEP && \
export CUDA_HOME=/usr/local/cuda && \
if [ "${CUDA_ARCH}" = "sm_90" ]; then \
export TORCH_CUDA_ARCH_LIST="9.0" && \
NVSHMEM_DIR=${NVSHMEM_DIR} \
pip install -v .; \
else \
export TORCH_CUDA_ARCH_LIST="8.0" && \
DISABLE_SM90_FEATURES=1 \
pip install -v .; \
fi && \
cd ..
# Hand the source tree to "user" so it can be rebuilt into the venv later.
RUN chown -R user:user /home/user/DeepEP
# From here on, everything runs as the unprivileged "user".
USER user
WORKDIR /home/user
# Toolchain paths: cargo, GDRCopy, NVSHMEM, the HPC-X stack (SHARP/UCC/UCX/OMPI) and CUDA.
ENV PATH="/home/user/.cargo/bin:/usr/local/gdrcopy/bin:/home/user/nvshmem/bin:/opt/hpcx/sharp/bin:/opt/hpcx/clusterkit/bin:/opt/hpcx/hcoll/bin:/opt/hpcx/ucc/bin:/opt/hpcx/ucx/bin:/opt/hpcx/ompi/bin:/home/user/conda/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
ENV LD_LIBRARY_PATH="/usr/local/gdrcopy/lib:/home/user/nvshmem/lib:/opt/hpcx/nccl_rdma_sharp_plugin/lib:/opt/hpcx/ucc/lib/ucc:/opt/hpcx/ucc/lib:/opt/hpcx/ucx/lib/ucx:/opt/hpcx/ucx/lib:/opt/hpcx/sharp/lib:/opt/hpcx/hcoll/lib:/opt/hpcx/ompi/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64"
ENV NVSHMEM_DIR="/home/user/nvshmem"
# Fix: Open MPI reads OPAL_PREFIX (was misspelled "OPEL_PREFIX"); the
# profile.d export at the bottom of this file already uses the correct name.
ENV OPAL_PREFIX="/opt/hpcx/ompi"
# Install Rust via rustup (needed for setuptools-rust based wheels below).
# NOTE(review): `curl | sh` installs the latest stable toolchain unpinned —
# consider `-s -- -y --default-toolchain <version>` for reproducible builds.
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
# Dedicated virtualenv; "activated" for every later layer via PATH/VIRTUAL_ENV.
RUN python3 -m venv venv
ENV PATH="/home/user/.cargo/bin:${PATH}"
ENV PATH="/home/user/venv/bin:$PATH"
ENV VIRTUAL_ENV="/home/user/venv"
# Core build tooling inside the venv (uv is used for fast installs later).
RUN python -m pip install --upgrade pip wheel setuptools-rust build ninja uv
# SGLang pinned to v0.5.3.post3, installed editable into the venv with its
# matching sgl-kernel wheel; PIP_CONSTRAINT is cleared because the base image
# ships a constraint file that would conflict with sglang's pins.
RUN git clone https://github.com/sgl-project/sglang.git -b v0.5.3.post3 && \
cd sglang && \
python -m pip install sgl-kernel==0.3.15 && \
PIP_CONSTRAINT="" python -m pip install -e "python[all]" --extra-index-url https://download.pytorch.org/whl/cu129
# Re-install DeepEP into the venv (the earlier install targeted the system
# Python); Hopper arch with all SM90 features and aggressive PTX enabled.
RUN cd /home/user/DeepEP && \
NVSHMEM_DIR=/home/user/nvshmem \
TORCH_CUDA_ARCH_LIST="9.0" \
DISABLE_SM90_FEATURES=0 \
DISABLE_AGGRESSIVE_PTX_INSTRS=0 \
python setup.py install
# Pre-download FlashInfer cubins for Hopper at build time so the first run
# doesn't need network access.
RUN TORCH_CUDA_ARCH_LIST="9.0" FLASHINFER_CUBIN_DOWNLOAD_THREADS=8 FLASHINFER_LOGGING_LEVEL=warning python -m flashinfer --download-cubin
# Runtime/build extras: mooncake transfer engine, meson/ninja (for nixl below),
# and the pinned sglang router.
RUN PIP_CONSTRAINT="" python -m uv pip install mooncake-transfer-engine meson ninja pybind11 tomlkit sglang-router==0.2.2
# NIXL 0.7.1: pip-install the package, then rebuild the native bindings with
# meson against the HPC-X UCX and local CUDA, and install the resulting wheel.
RUN git clone -b 0.7.1 --single-branch --depth 1 https://github.com/ai-dynamo/nixl.git && \
cd nixl && \
PIP_CONSTRAINT="" python -m pip install . && \
meson setup build -Ducx_path=/opt/hpcx/ucx -Dcudapath_inc=/usr/local/cuda/include -Dcudapath_lib=/usr/local/cuda/lib64 -Ddisable_mooncake_backend=false && \
ninja -C build && \
PIP_CONSTRAINT="" python -m pip install build/src/bindings/python/nixl-meta/nixl-*-py3-none-any.whl
# FlashMLA pinned to an exact commit, built with submodules and without
# build isolation so it links against the venv's torch.
RUN git clone https://github.com/deepseek-ai/FlashMLA.git flash-mla && \
cd flash-mla && \
git checkout 1408756a88e52a25196b759eaf8db89d2b51b5a1 && \
git submodule update --init --recursive && \
PIP_CONSTRAINT="" python -m pip install -v --no-build-isolation .
# Install fast-hadamard-transform, pinned to an exact commit.
RUN git clone https://github.com/Dao-AILab/fast-hadamard-transform && \
cd fast-hadamard-transform && \
git checkout 7fd811c2b47f63b0b08d2582619f939e14dad77c && \
PIP_CONSTRAINT="" python -m pip install --no-build-isolation .
# Force an exact NCCL version last, after everything above may have pulled
# its own; --no-deps keeps the rest of the environment untouched.
RUN python -m pip install nvidia-nccl-cu12==2.28.3 --force-reinstall --no-deps
# Create the login-shell environment file and let "user" write to it;
# single layer (touch + chown on a regular file needs no -R).
USER root
WORKDIR /home/user
RUN touch /etc/profile.d/mlspace.sh \
&& chown user:user /etc/profile.d/mlspace.sh
USER user
# Populate /etc/profile.d/mlspace.sh so login shells see the same toolchain
# environment as the image defaults above (first echo truncates, rest append).
RUN echo "export LD_LIBRARY_PATH=/usr/local/gdrcopy/lib:/home/user/nvshmem/lib:/opt/hpcx/nccl_rdma_sharp_plugin/lib:/opt/hpcx/ucm/lib:/opt/hpcx/pmix/lib:/opt/hpcx/hcoll/lib64:/opt/hpcx/ompi/lib64:/opt/hpcx/ucc/lib/ucc:/opt/hpcx/ucc/lib:/opt/hpcx/ucx/lib/ucx:/opt/hpcx/ucx/lib:/opt/hpcx/sharp/lib:/opt/hpcx/hcoll/lib:/opt/hpcx/ompi/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/lib" > /etc/profile.d/mlspace.sh \
&& echo "export PATH=/home/user/.cargo/bin:/usr/local/gdrcopy/bin:/home/user/nvshmem/bin:/opt/hpcx/sharp/bin:/opt/hpcx/clusterkit/bin:/opt/hpcx/hcoll/bin:/opt/hpcx/ucc/bin:/opt/hpcx/ucx/bin:/opt/hpcx/ompi/bin:/home/user/conda/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" >> /etc/profile.d/mlspace.sh \
&& echo "export OPAL_PREFIX=/opt/hpcx/ompi" >> /etc/profile.d/mlspace.sh \
&& echo "export NVSHMEM_DIR=/home/user/nvshmem" >> /etc/profile.d/mlspace.sh \
# Fix: bare "9" is not a valid TORCH_CUDA_ARCH_LIST entry; Hopper is "9.0",
# matching the value used for the DeepEP / FlashMLA builds above.
&& echo "export TORCH_CUDA_ARCH_LIST=9.0" >> /etc/profile.d/mlspace.sh \
&& echo "export NCCL_PLUGIN_PATH=/opt/hpcx/nccl_rdma_sharp_plugin/lib" >> /etc/profile.d/mlspace.sh
ENV NVIDIA_GDRCOPY=enabled