try deploy on a10
commit ce73fe1ecf (parent 12df581e7f)
Dockerfile
@@ -0,0 +1,43 @@
# Use the official Python image as the base image.
FROM python:3.10-slim

# Declare a build argument with a default value (0 means GPU support is disabled).
ARG ENABLE_CUDA=0

# Set environment variables to optimize Python behavior in the container.
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
ENV ENABLE_CUDA=${ENABLE_CUDA}

# Set the working directory inside the container.
WORKDIR /app

# Install system dependencies.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        build-essential \
        curl \
        libmagic1 \
    && rm -rf /var/lib/apt/lists/*

# Copy the requirements file.
COPY requirements.txt .

# Conditionally install dependencies:
# - If ENABLE_CUDA=1, install all dependencies.
# - Otherwise, filter out GPU-specific dependencies.
RUN pip install --upgrade pip && \
    if [ "$ENABLE_CUDA" = "1" ]; then \
        pip install --no-cache-dir -r requirements.txt; \
    else \
        grep -v 'llama_index.readers.docling' requirements.txt > requirements_filtered.txt && \
        pip install --no-cache-dir -r requirements_filtered.txt; \
    fi

# Copy the entire project directory into the container.
COPY . .

# Expose the port on which the Gradio app will listen.
EXPOSE 7860

# Define the default command to run the Gradio application.
CMD ["python", "app.py"]
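Because the dependency set is selected by the ENABLE_CUDA build argument, both image variants come from the same Dockerfile. Note that the ENV line above propagates the build argument rather than hardcoding 1, so the CPU branch of the install step can actually run. A minimal build sketch (the image tags are illustrative, not from the commit):

    # GPU build: installs the full requirements.txt
    docker build --build-arg ENABLE_CUDA=1 -t docling-app:gpu .
    # CPU build: installs the grep-filtered requirements
    docker build --build-arg ENABLE_CUDA=0 -t docling-app:cpu .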
app.py
@@ -53,7 +53,6 @@ model = AutoModelForVision2Seq.from_pretrained("ds4sd/SmolDocling-256M-preview",
     #_attn_implementation="flash_attention_2"
 ).to("cuda")

-@spaces.GPU
 def model_inference(
     input_dict, history
 ):
@@ -151,4 +150,4 @@ demo = gr.ChatInterface(fn=model_inference, title="SmolDocling-256M: Ultra-compa
     cache_examples=False
 )

-demo.launch(debug=True)
+demo.launch(debug=True, server_name="0.0.0.0", server_port=7860)
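Dropping `@spaces.GPU` makes sense here: that decorator comes from the `spaces` package used on Hugging Face ZeroGPU Spaces and serves no purpose when self-hosting on the A10. The launch change matters too: binding to 0.0.0.0 is what makes the published container port reachable, since with Gradio's default 127.0.0.1 bind the server only listens inside the container's network namespace. A quick check from the host, assuming the port mapping below:

    curl -I http://localhost:7860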
docker-compose.yml
@@ -0,0 +1,24 @@
docling_app:
  build:
    context: .
    args:
      ENABLE_CUDA: '1'
  container_name: gradio_app
  restart: unless-stopped
  ports:
    - "7860:7860"  # Gradio application port
  volumes:
    - ./app/services/weaviate_service.py:/gradio_app/weaviate_service.py
  environment:
    ENABLE_CUDA: '1'
    NVIDIA_VISIBLE_DEVICES: '3'
  deploy:
    resources:
      reservations:
        devices:
          - driver: nvidia
            device_ids: ['3']  # Specify GPU 3
            capabilities: [gpu]
  shm_size: 1g
  depends_on:
    - rag_app
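This snippet omits the top-level services: key and references a rag_app service not defined here, so it presumably merges into a larger compose file. Assuming the host has the NVIDIA Container Toolkit installed (required for the deploy.resources GPU reservation to work), the service can be brought up with:

    docker compose up -d --build docling_app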