#!/usr/bin/env bash
set -euo pipefail
BASE_DIR="/mnt/d/AI/docker-gemma4"
PATCH_DIR="$BASE_DIR/nvfp4_patch"
BUILD_DIR="$BASE_DIR/build"
HF_CACHE_DIR="$BASE_DIR/hf-cache"
LOG_DIR="$BASE_DIR/logs"
PATCH_FILE="$PATCH_DIR/gemma4_patched.py"
DOCKERFILE_PATH="$BUILD_DIR/Dockerfile"
BASE_IMAGE="vllm/vllm-openai:gemma4"
PATCHED_IMAGE="vllm-gemma4-nvfp4-patched"
CONTAINER_NAME="vllm-gemma4-nvfp4"
MODEL_ID="bg-digitalservices/Gemma-4-26B-A4B-it-NVFP4"
SERVED_MODEL_NAME="gemma-4-26b-a4b-it-nvfp4"
GPU_MEMORY_UTILIZATION="0.88"
MAX_MODEL_LEN="512"
MAX_NUM_SEQS="1"
PORT="8000"   # assumed port; vllm serve defaults to 8000 — adjust as needed
if [[ -z "${HF_TOKEN:-}" ]]; then
  echo "[ERROR] HF_TOKEN environment variable is empty."
  echo "Please run the following first:"
  echo "  export HF_TOKEN='your_new_token'"
  exit 1
fi
echo "[INFO] Preparing working directories"
mkdir -p "$PATCH_DIR" "$BUILD_DIR" "$HF_CACHE_DIR" "$LOG_DIR"
if [[ ! -f "$PATCH_FILE" ]]; then
  if [[ -z "${PATCH_URL:-}" ]]; then
    echo "[ERROR] PATCH_URL is not set; point it at the patched gemma4.py before running."
    exit 1
  fi
  echo "[INFO] Downloading patch file"
  wget -O "$PATCH_FILE" "$PATCH_URL"
else
  echo "[INFO] Patch file already exists: $PATCH_FILE"
fi
echo "[INFO] Checking patch file"
ls -l "$PATCH_FILE"
file "$PATCH_FILE" || true
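# Optional sanity check (a sketch): confirm the patch parses as Python before
# baking it into the image. py_compile only checks syntax; it says nothing
# about whether the patch actually matches this vLLM build.
if command -v python3 >/dev/null 2>&1; then
  python3 -m py_compile "$PATCH_FILE" && echo "[INFO] Patch file parses as valid Python"
fi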
echo "[INFO] Preparing build folder"
cp "$PATCH_FILE" "$BUILD_DIR/gemma4_patched.py"
cat > "$DOCKERFILE_PATH" <<EOF
FROM ${BASE_IMAGE}
COPY gemma4_patched.py /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/gemma4.py
EOF
echo "[INFO] Dockerfile created: $DOCKERFILE_PATH"
echo "[INFO] Starting custom image build: $PATCHED_IMAGE"
docker build -t "$PATCHED_IMAGE" "$BUILD_DIR"
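# Optional: confirm the patched module actually landed at the path the
# Dockerfile targets (cheap check, no GPU required).
docker run --rm --entrypoint /bin/bash "$PATCHED_IMAGE" \
  -lc "ls -l /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/gemma4.py"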
if docker ps -a --format '{{.Names}}' | grep -qx "$CONTAINER_NAME"; then
  echo "[INFO] Removing existing container: $CONTAINER_NAME"
  docker rm -f "$CONTAINER_NAME" >/dev/null 2>&1 || true
fi
RUN_LOG="$LOG_DIR/gemma4_nvfp4_run.log"
echo "[INFO] Starting server"
echo "[INFO] Log file: $RUN_LOG"
docker run --rm -it \
  --name "$CONTAINER_NAME" \
  --gpus all \
  --ipc=host \
  -p "${PORT}:${PORT}" \
  -e HF_TOKEN="$HF_TOKEN" \
  -e VLLM_NVFP4_GEMM_BACKEND=marlin \
  -v "$HF_CACHE_DIR:/root/.cache/huggingface" \
  -v "$LOG_DIR:/logs" \
  --entrypoint /bin/bash \
  "$PATCHED_IMAGE" \
  -lc "vllm serve $MODEL_ID \
    --served-model-name $SERVED_MODEL_NAME \
    --host 0.0.0.0 \
    --port $PORT \
    --trust-remote-code \
    --quantization modelopt \
    --moe-backend marlin \
    --kv-cache-dtype fp8 \
    --dtype auto \
    --gpu-memory-utilization $GPU_MEMORY_UTILIZATION \
    --max-model-len $MAX_MODEL_LEN \
    --max-num-seqs $MAX_NUM_SEQS" \
  2>&1 | tee "$RUN_LOG"
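# The server runs in the foreground above. From a second shell, a minimal
# smoke test against the OpenAI-compatible API (port and served model name
# taken from the variables at the top of this script):
#
#   curl http://localhost:8000/v1/models
#
#   curl http://localhost:8000/v1/chat/completions \
#     -H 'Content-Type: application/json' \
#     -d '{"model": "gemma-4-26b-a4b-it-nvfp4",
#          "messages": [{"role": "user", "content": "Say hello."}],
#          "max_tokens": 64}'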
# Environment reported with this run:
#   WSL + Docker
#   vLLM: 0.18.2rc1.dev73+gdb7a17ecc (dev build)
#   PyTorch: 2.10.0+cu129
#   CUDA (torch): 12.9
#   Python: 3.12.13
#   Transformers: 5.4
#   NVFP4 GEMM backend: marlin; quantization: modelopt