From ce73fe1ecf0b8a610418025ea034fcd375119599 Mon Sep 17 00:00:00 2001
From: Maurizio Dipierro
Date: Wed, 19 Mar 2025 16:19:59 +0100
Subject: [PATCH] try deploy on a10

---
 Dockerfile         | 43 +++++++++++++++++++++++++++++++++++++++++++
 app.py             |  3 +--
 docker-compose.yml | 25 +++++++++++++++++++++++++
 3 files changed, 69 insertions(+), 2 deletions(-)
 create mode 100644 Dockerfile
 create mode 100644 docker-compose.yml

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..a136c6c
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,43 @@
+# Use the official Python image as the base image.
+FROM python:3.10-slim
+
+# Declare build argument with a default value (0 means GPU not enabled).
+ARG ENABLE_CUDA=0
+
+# Set environment variables to optimize Python behavior in the container.
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+ENV ENABLE_CUDA=${ENABLE_CUDA}
+
+# Set the working directory inside the container.
+WORKDIR /app
+
+# Install system dependencies.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    build-essential \
+    curl \
+    libmagic1 \
+    && rm -rf /var/lib/apt/lists/*
+
+# Copy the requirements file.
+COPY requirements.txt .
+
+# Conditionally install dependencies:
+# - If ENABLE_CUDA=1, install all dependencies.
+# - Otherwise, filter out GPU-specific dependencies.
+RUN pip install --upgrade pip && \
+    if [ "$ENABLE_CUDA" -eq "1" ]; then \
+        pip install --no-cache-dir -r requirements.txt; \
+    else \
+        grep -v 'llama_index.readers.docling' requirements.txt > requirements_filtered.txt && \
+        pip install --no-cache-dir -r requirements_filtered.txt; \
+    fi
+
+# Copy the entire project directory into the container.
+COPY . .
+
+# Expose the port on which the Gradio app will listen.
+EXPOSE 7860
+# Define the default command to run the Gradio application.
+CMD ["python", "app.py"]
diff --git a/app.py b/app.py
index efe394e..10bdf28 100644
--- a/app.py
+++ b/app.py
@@ -53,7 +53,6 @@
 model = AutoModelForVision2Seq.from_pretrained("ds4sd/SmolDocling-256M-preview",
                 #_attn_implementation="flash_attention_2"
         ).to("cuda")
-@spaces.GPU
 def model_inference(
     input_dict, history
 ):
@@ -151,4 +150,4 @@ demo = gr.ChatInterface(fn=model_inference, title="SmolDocling-256M: Ultra-compa
                        cache_examples=False
                        )
 
-demo.launch(debug=True)
\ No newline at end of file
+demo.launch(debug=True, server_name="0.0.0.0", server_port=7860)
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..d9e3b77
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,25 @@
+services:
+  docling_app:
+    build:
+      context: .
+      args:
+        ENABLE_CUDA: '1'
+    container_name: gradio_app
+    restart: unless-stopped
+    ports:
+      - "7860:7860" # Gradio application port
+    volumes:
+      - ./app/services/weaviate_service.py:/gradio_app/weaviate_service.py
+    environment:
+      ENABLE_CUDA: '1'
+      NVIDIA_VISIBLE_DEVICES: '3'
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              device_ids: ['3'] # Specify GPU 3
+              capabilities: [gpu]
+    shm_size: 1g
+    depends_on:
+      - rag_app
\ No newline at end of file
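
A minimal sketch of how the ENABLE_CUDA build argument is exercised; the
docling-app image tags below are illustrative, not taken from the patch:

    # Full install; matches what the compose file passes as a build arg.
    docker build --build-arg ENABLE_CUDA=1 -t docling-app:cuda .

    # CPU build: the grep branch strips the llama_index.readers.docling
    # line from requirements.txt before installing.
    docker build --build-arg ENABLE_CUDA=0 -t docling-app:cpu .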
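
To bring the service up with the GPU reservation, a sketch assuming the
NVIDIA Container Toolkit is installed on the host and that the rag_app
dependency is defined alongside this service in the final compose file:

    docker compose up -d --build docling_app

    # The gpu capability makes the NVIDIA runtime inject nvidia-smi into
    # the container; with device_ids ['3'], only that GPU should be listed.
    docker compose exec docling_app nvidia-smi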
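
Since demo.launch() now binds server_name="0.0.0.0" on port 7860, the
published port is reachable from outside the container; a simple liveness
check, assuming the compose stack runs on the local machine:

    # Expect 200 once the SmolDocling model has finished loading.
    curl -sf -o /dev/null -w "%{http_code}\n" http://localhost:7860/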