try deploy on a10

Maurizio Dipierro 2025-03-19 16:19:59 +01:00
parent 12df581e7f
commit ce73fe1ecf
3 changed files with 69 additions and 2 deletions

Dockerfile (new file, 43 additions)

@@ -0,0 +1,43 @@
# Use the official Python image as the base image.
FROM python:3.10-slim
# Declare build argument with a default value (0 means GPU not enabled)
ARG ENABLE_CUDA=0
# Set environment variables to optimize Python behavior in the container.
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
ENV ENABLE_CUDA=${ENABLE_CUDA}
# Set the working directory inside the container.
WORKDIR /app
# Install system dependencies.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        build-essential \
        curl \
        libmagic1 \
    && rm -rf /var/lib/apt/lists/*
# Copy the requirements file.
COPY requirements.txt .
# Conditionally install dependencies:
# - If ENABLE_CUDA=1, install all dependencies.
# - Otherwise, filter out GPU-specific dependencies.
RUN pip install --upgrade pip && \
    if [ "$ENABLE_CUDA" -eq "1" ]; then \
        pip install --no-cache-dir -r requirements.txt; \
    else \
        grep -v 'llama_index.readers.docling' requirements.txt > requirements_filtered.txt && \
        pip install --no-cache-dir -r requirements_filtered.txt; \
    fi
# Copy the entire project directory into the container.
COPY . .
# Expose the port on which the Gradio app will listen.
EXPOSE 7860
# Define the default command to run the Gradio application.
CMD ["python", "app.py"]

app.py (1 addition, 2 deletions)

@@ -53,7 +53,6 @@ model = AutoModelForVision2Seq.from_pretrained("ds4sd/SmolDocling-256M-preview",
     #_attn_implementation="flash_attention_2"
 ).to("cuda")
-@spaces.GPU
 def model_inference(
     input_dict, history
 ):
@@ -151,4 +150,4 @@ demo = gr.ChatInterface(fn=model_inference, title="SmolDocling-256M: Ultra-compa
     cache_examples=False
 )
-demo.launch(debug=True)
+demo.launch(debug=True, server_name="0.0.0.0", server_port=7860)
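
Binding the Gradio server to 0.0.0.0 is what makes the container's port mapping work: the default 127.0.0.1 bind would be unreachable from outside the container. A quick smoke test from the host once the container is up (assuming the 7860:7860 mapping below):

# Should return the Gradio front-end HTML
curl -s http://localhost:7860 | head -n 5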

docker-compose.yml (new file, 25 additions)

@@ -0,0 +1,25 @@
services:
  docling_app:
    build:
      context: .
      args:
        ENABLE_CUDA: '1'
    container_name: gradio_app
    restart: unless-stopped
    ports:
      - "7860:7860" # Gradio application port
    volumes:
      - ./app/services/weaviate_service.py:/gradio_app/weaviate_service.py
    environment:
      ENABLE_CUDA: '1'
      NVIDIA_VISIBLE_DEVICES: '3'
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['3'] # Specify GPU 3
              capabilities: [gpu]
    shm_size: 1g
    depends_on:
      - rag_app
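
Assuming the rag_app service this depends on is defined alongside it and the NVIDIA Container Toolkit is installed on the host, a sketch of bringing the service up and checking that GPU 3 is visible:

# Build the image (with ENABLE_CUDA=1) and start the service in the background
docker compose up -d --build docling_app

# nvidia-smi is injected by the NVIDIA runtime; it should list exactly one GPU (device 3)
docker compose exec docling_app nvidia-smi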