# Загрузка данных (data loading) — stray text; commented out so the Dockerfile parses.


# NVIDIA NGC PyTorch base image (CUDA toolkit, PyTorch, HPC-X preinstalled), pinned tag.
FROM nvcr.io/nvidia/pytorch:25.06-py3

# Replace the stock 'ubuntu' account with 'user' (UID 1000) and a bash shell;
# pre-create /home/jovyan owned by that user (Jupyter-style home mount point).
RUN userdel ubuntu && useradd -u 1000 -m user --shell /bin/bash \
    && mkdir -p /home/jovyan && chown user:user /home/jovyan

# Install OpenSSH server so MPI can connect between containers.
# Generate all host keys up front (ssh-keygen -A); apt lists are removed
# in the same layer to keep the image small.
RUN apt-get update \
    && apt-get install -y --no-install-recommends openssh-server\
    && rm -rf /var/lib/apt/lists/* \
    && mkdir -p /var/run/sshd \
    && ssh-keygen -A

# Allow OpenSSH to talk to containers without asking for confirmation
# Disable OpenSSH PAM
# Set port 2222 (unprivileged, so sshd can be started by the non-root user)
# chmod 0600 /etc/ssh/ssh_host_*_key
# NOTE(review): host keys (including the *.pub files) and /run/sshd are chowned
# to 'user' — presumably so sshd can run unprivileged on this platform; confirm
# that is intended, as 600 on public keys and user-owned host keys is unusual.
RUN echo "StrictHostKeyChecking no" >> /etc/ssh/ssh_config.d/mlspace.conf \
    && echo "UsePAM no" > /etc/ssh/sshd_config.d/mlspace.conf \
    && echo "PubkeyAcceptedAlgorithms +ssh-rsa" >> /etc/ssh/sshd_config.d/mlspace.conf \
    && echo "Port 2222" >> /etc/ssh/sshd_config.d/mlspace.conf \
    && echo "PidFile /run/sshd/sshd.pid" >> /etc/ssh/sshd_config.d/mlspace.conf \
    && chmod 600 /etc/ssh/ssh_host_* \
    && chown user:user /etc/ssh/ssh_host_* \
    && chown user:user /run/sshd

# Utilities: 'screen' for detached sessions, 'libaio-dev' for async-I/O builds.
# Install and clean up in the SAME layer — the original ran 'apt-get clean' in a
# later RUN, which cannot shrink an already-committed layer.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        libaio-dev \
        screen \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Build and install nccl-tests for this container, then drop the sources.
# --depth 1: only the default-branch tip is built, so no history is needed;
# -j$(nproc): parallel build.
RUN cd /workspace \
    && git clone --depth 1 https://github.com/NVIDIA/nccl-tests.git \
    && cd nccl-tests \
    && make -j$(nproc) MPI=1 MPI_HOME=/opt/hpcx/ompi/ CUDA_HOME=/usr/local/cuda NCCL_HOME=/usr/lib/x86_64-linux-gnu/ \
    && mv build /opt/nccl-tests \
    && cd /workspace \
    && rm -rf nccl-tests

# Install prefix for the NVSHMEM build further down (also exported via ENV later).
ARG NVSHMEM_DIR=/home/user/nvshmem


####
# All actions that must run as the unprivileged user happen in the section below
# under USER. Keep in mind: variables exported in the sections above are NOT
# visible to the user in that section, so everything user-related must live there.
####

WORKDIR /home/user

# NOTE(review): despite the banner above, the following build steps still run as
# root; the switch to the unprivileged 'user' happens much further down.
USER root

# Build flash-attention from source at a pinned tag, replacing whatever the
# base image ships.
ENV FLASH_ATTENTION_VERSION=2.7.2
# Remove any preinstalled flash-attn so the source build below wins.
RUN pip uninstall -y flash-attn
RUN cd /home/user && git clone -b v${FLASH_ATTENTION_VERSION} --single-branch --depth 1 https://github.com/Dao-AILab/flash-attention.git
# Pinned build tooling for this flash-attention version.
RUN pip install packaging==24.1 setuptools==69.5.1
# MAX_JOBS caps parallel nvcc invocations to bound build memory usage.
RUN cd /home/user/flash-attention && MAX_JOBS=32 python setup.py install
# Hopper (SM90)-specific kernels from the same repo.
RUN cd /home/user/flash-attention/hopper && python setup.py install

WORKDIR /home/user/flash-attention
# Additional CUDA extensions shipped alongside flash-attention.
RUN cd csrc/layer_norm && pip install .
RUN cd csrc/fused_dense_lib && \
    pip install .

WORKDIR /home/user

###############
# GDRCopy
###############
ARG GDRCOPY_HOME=/usr/local/gdrcopy
# DKMS kernel-module support required by gdrcopy's gdrdrv driver.
# Clean the apt lists in the SAME layer (the original left them in the image).
RUN apt-get update \
    && apt-get install -y --no-install-recommends nvidia-dkms-570 \
    && rm -rf /var/lib/apt/lists/*

# Remove any distro-packaged gdrcopy, then build v2.5 from source into
# ${GDRCOPY_HOME}; sources and tarball are dropped in the same layer.
# NOTE(review): the tarball is fetched without a checksum — consider verifying it.
RUN dpkg -r libgdrapi gdrcopy && \
    wget https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v2.5.tar.gz && \
    tar -zxvf v2.5.tar.gz && cd gdrcopy-2.5 && \
    make prefix=${GDRCOPY_HOME} install -j$(nproc) && \
    cd .. && rm v2.5.tar.gz && rm -rf gdrcopy-2.5


###############
# DeepEP
###############
# can be "sm80" or "sm90"
# for sm80 FP8, launch methods, TMA (Tensor Memory Access), all internode and low-latency features from NVSHMEM are disabled
ARG CUDA_ARCH="sm_90"
# Pinned DeepEP revision for a reproducible build.
ARG DEEPEP_COMMIT=174c209fec3f03db86139e2a74150b6bf3447332

# Local patch applied on top of the pinned DeepEP commit below.
COPY configs.patch /tmp/configs.patch

RUN git clone https://github.com/deepseek-ai/DeepEP.git && \
    cd DeepEP && \
    git checkout ${DEEPEP_COMMIT} && \
    git apply /tmp/configs.patch && \
    cd ..

# NVSHMEM 3.2.5 built from source with DeepEP's nvshmem patch applied.
# Build flags: IBGDA + GDRCopy on; SHMEM/UCX/NCCL/MPI/PMIx support off.
# NOTE(review): CUDA arch is hardcoded to 90 here regardless of the CUDA_ARCH
# build arg, and the sed line numbers below are specific to this exact
# nvshmem_src_3.2.5-1 tarball — re-check them on any version bump.
RUN wget https://developer.nvidia.com/downloads/assets/secure/nvshmem/nvshmem_src_3.2.5-1.txz && \
    tar xvf nvshmem_src_3.2.5-1.txz && \
    cd nvshmem_src && git apply /home/user/DeepEP/third-party/nvshmem.patch && \
    # sed -i comments out noisy IBGDA warnings about CPU fallbacks
    sed -i '968,971 s/^/\/\/ /' src/modules/transport/ibgda/ibgda.cpp && \
    sed -i '1815,1816 s/^/\/\/ /' src/modules/transport/ibgda/ibgda.cpp && \
    sed -i '3468,3470 s/^/\/\/ /' src/modules/transport/ibgda/ibgda.cpp && \
    NVSHMEM_SHMEM_SUPPORT=0 \
    NVSHMEM_UCX_SUPPORT=0 \
    NVSHMEM_USE_NCCL=0 \
    NVSHMEM_MPI_SUPPORT=0 \
    NVSHMEM_IBGDA_SUPPORT=1 \
    NVSHMEM_PMIX_SUPPORT=0 \
    NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
    NVSHMEM_USE_GDRCOPY=1 \
    cmake -S . -B build/ \
        -DCMAKE_INSTALL_PREFIX=${NVSHMEM_DIR} \
        -DNVSHMEM_BUILD_EXAMPLES=OFF \
        -DNVSHMEM_BUILD_NVSHMEM4PY=OFF \
        -DMLX5_lib=/lib/x86_64-linux-gnu/libmlx5.so.1 \
        -DCMAKE_CUDA_ARCHITECTURES=90 && \
    cmake --build build --target install --parallel $(nproc) && \
    cd .. && \
    rm -rf nvshmem_src*


# Install DeepEP into the system Python (as root). It is rebuilt later inside
# the user's venv; the source tree is kept for that rebuild.
RUN cd DeepEP && \
    export CUDA_HOME=/usr/local/cuda && \
    if [ "${CUDA_ARCH}" = "sm_90" ]; then \
        export TORCH_CUDA_ARCH_LIST="9.0" && \
        NVSHMEM_DIR=${NVSHMEM_DIR} \
        pip install -v .; \
    else \
        export TORCH_CUDA_ARCH_LIST="8.0" && \
        DISABLE_SM90_FEATURES=1 \
        pip install -v .; \
    fi && \
    cd ..

# Hand the kept source tree to the unprivileged user for the later rebuild.
RUN chown -R user:user /home/user/DeepEP

USER user
WORKDIR /home/user

# Runtime search paths: gdrcopy, NVSHMEM, HPC-X (SHARP/HCOLL/UCC/UCX/OMPI), CUDA.
ENV PATH="/home/user/.cargo/bin:/usr/local/gdrcopy/bin:/home/user/nvshmem/bin:/opt/hpcx/sharp/bin:/opt/hpcx/clusterkit/bin:/opt/hpcx/hcoll/bin:/opt/hpcx/ucc/bin:/opt/hpcx/ucx/bin:/opt/hpcx/ompi/bin:/home/user/conda/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
ENV LD_LIBRARY_PATH="/usr/local/gdrcopy/lib:/home/user/nvshmem/lib:/opt/hpcx/nccl_rdma_sharp_plugin/lib:/opt/hpcx/ucc/lib/ucc:/opt/hpcx/ucc/lib:/opt/hpcx/ucx/lib/ucx:/opt/hpcx/ucx/lib:/opt/hpcx/sharp/lib:/opt/hpcx/hcoll/lib:/opt/hpcx/ompi/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64"
ENV NVSHMEM_DIR="/home/user/nvshmem"
# Fixed typo: was OPEL_PREFIX. Open MPI reads OPAL_PREFIX (this matches the
# profile.d export written at the end of this file).
ENV OPAL_PREFIX="/opt/hpcx/ompi"

# Rust toolchain for packages that need cargo (setuptools-rust builds below).
# NOTE(review): unpinned 'curl | sh' installer — consider pinning a rustup version.
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y

# Dedicated virtualenv for the user; put it (and cargo) first on PATH.
RUN python3 -m venv venv

ENV PATH="/home/user/.cargo/bin:${PATH}"
ENV PATH="/home/user/venv/bin:$PATH"
ENV VIRTUAL_ENV="/home/user/venv"

RUN python -m pip install --upgrade pip wheel setuptools-rust build ninja uv

# sglang at a pinned tag (editable install) plus a matching sgl-kernel.
# PIP_CONSTRAINT="" clears any inherited pip constraint file — presumably set
# by the NGC base image; verify before removing.
RUN git clone https://github.com/sgl-project/sglang.git -b v0.5.3.post3 && \
    cd sglang && \
    python -m pip install sgl-kernel==0.3.15 && \
    PIP_CONSTRAINT="" python -m pip install -e "python[all]" --extra-index-url https://download.pytorch.org/whl/cu129

# Rebuild DeepEP inside the user's venv (Hopper-only arch list; SM90 features
# and aggressive PTX instructions kept enabled).
RUN cd /home/user/DeepEP && \
    NVSHMEM_DIR=/home/user/nvshmem \
    TORCH_CUDA_ARCH_LIST="9.0" \
    DISABLE_SM90_FEATURES=0 \
    DISABLE_AGGRESSIVE_PTX_INSTRS=0 \
    python setup.py install

# Pre-download flashinfer cubins at build time so first run avoids the network.
RUN TORCH_CUDA_ARCH_LIST="9.0" FLASHINFER_CUBIN_DOWNLOAD_THREADS=8 FLASHINFER_LOGGING_LEVEL=warning python -m flashinfer --download-cubin
# Transfer/build tooling; sglang-router pinned for reproducibility.
RUN PIP_CONSTRAINT="" python -m uv pip install mooncake-transfer-engine meson ninja pybind11 tomlkit sglang-router==0.2.2

# NIXL at a pinned tag: install the Python package, build the native library
# against HPC-X UCX and CUDA via meson/ninja, then install the generated wheel.
RUN git clone -b 0.7.1 --single-branch --depth 1 https://github.com/ai-dynamo/nixl.git && \
    cd nixl && \
    PIP_CONSTRAINT="" python -m pip install . && \
    meson setup build -Ducx_path=/opt/hpcx/ucx -Dcudapath_inc=/usr/local/cuda/include -Dcudapath_lib=/usr/local/cuda/lib64 -Ddisable_mooncake_backend=false && \
    ninja -C build && \
    PIP_CONSTRAINT="" python -m pip install build/src/bindings/python/nixl-meta/nixl-*-py3-none-any.whl

# FlashMLA at a pinned commit (with submodules), built without build isolation
# so it compiles against the venv's torch.
RUN git clone https://github.com/deepseek-ai/FlashMLA.git flash-mla && \
    cd flash-mla && \
    git checkout 1408756a88e52a25196b759eaf8db89d2b51b5a1 && \
    git submodule update --init --recursive && \
    PIP_CONSTRAINT="" python -m pip install -v --no-build-isolation .

# Install fast-hadamard-transform
RUN git clone https://github.com/Dao-AILab/fast-hadamard-transform && \
    cd fast-hadamard-transform && \
    git checkout 7fd811c2b47f63b0b08d2582619f939e14dad77c && \
    PIP_CONSTRAINT="" python -m pip install --no-build-isolation . 

# Pin NCCL, replacing whatever the torch wheels pulled in; --no-deps avoids
# disturbing the rest of the environment.
RUN python -m pip install nvidia-nccl-cu12==2.28.3 --force-reinstall --no-deps

USER root
WORKDIR /home/user

# Profile script for login shells; chowned to 'user' so the unprivileged user
# can write it in the next step.
RUN touch /etc/profile.d/mlspace.sh
RUN chown -R user:user /etc/profile.d/mlspace.sh

USER user
# Export the runtime environment for login shells. The first echo truncates
# the file ('>'); the rest append. TORCH_CUDA_ARCH_LIST fixed from "9" to
# "9.0" for consistency with the rest of this file (PyTorch's extension
# builder expects MAJOR.MINOR arch specs).
RUN echo "export LD_LIBRARY_PATH=/usr/local/gdrcopy/lib:/home/user/nvshmem/lib:/opt/hpcx/nccl_rdma_sharp_plugin/lib:/opt/hpcx/ucm/lib:/opt/hpcx/pmix/lib:/opt/hpcx/hcoll/lib64:/opt/hpcx/ompi/lib64:/opt/hpcx/ucc/lib/ucc:/opt/hpcx/ucc/lib:/opt/hpcx/ucx/lib/ucx:/opt/hpcx/ucx/lib:/opt/hpcx/sharp/lib:/opt/hpcx/hcoll/lib:/opt/hpcx/ompi/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/lib">/etc/profile.d/mlspace.sh \
   && echo "export PATH=/home/user/.cargo/bin:/usr/local/gdrcopy/bin:/home/user/nvshmem/bin:/opt/hpcx/sharp/bin:/opt/hpcx/clusterkit/bin:/opt/hpcx/hcoll/bin:/opt/hpcx/ucc/bin:/opt/hpcx/ucx/bin:/opt/hpcx/ompi/bin:/home/user/conda/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" >> /etc/profile.d/mlspace.sh \
   && echo "export OPAL_PREFIX=/opt/hpcx/ompi" >> /etc/profile.d/mlspace.sh \
   && echo "export NVSHMEM_DIR=/home/user/nvshmem" >> /etc/profile.d/mlspace.sh \
   && echo "export TORCH_CUDA_ARCH_LIST=9.0" >> /etc/profile.d/mlspace.sh \
   && echo "export NCCL_PLUGIN_PATH=/opt/hpcx/nccl_rdma_sharp_plugin/lib" >> /etc/profile.d/mlspace.sh

# Enables GDRCopy support in the NVIDIA container runtime.
ENV NVIDIA_GDRCOPY=enabled