From ce73fe1ecf0b8a610418025ea034fcd375119599 Mon Sep 17 00:00:00 2001
From: Maurizio Dipierro
Date: Wed, 19 Mar 2025 16:19:59 +0100
Subject: [PATCH] try deploy on a10

---
 Dockerfile         | 43 +++++++++++++++++++++++++++++++++++++++++++
 app.py             |  3 +--
 docker-compose.yml | 25 +++++++++++++++++++++++++
 3 files changed, 69 insertions(+), 2 deletions(-)
 create mode 100644 Dockerfile
 create mode 100644 docker-compose.yml

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..a136c6c
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,43 @@
+# Use the official Python image as the base image.
+FROM python:3.10-slim
+
+# Declare build argument with a default value (0 means GPU not enabled).
+ARG ENABLE_CUDA=0
+
+# Set environment variables to optimize Python behavior in the container.
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+ENV ENABLE_CUDA=${ENABLE_CUDA}
+
+# Set the working directory inside the container.
+WORKDIR /app
+
+# Install system dependencies.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    build-essential \
+    curl \
+    libmagic1 \
+    && rm -rf /var/lib/apt/lists/*
+
+# Copy the requirements file.
+COPY requirements.txt .
+
+# Conditionally install dependencies:
+# - If ENABLE_CUDA=1, install all dependencies.
+# - Otherwise, filter out GPU-specific dependencies.
+RUN pip install --upgrade pip && \
+    if [ "$ENABLE_CUDA" -eq "1" ]; then \
+        pip install --no-cache-dir -r requirements.txt; \
+    else \
+        grep -v 'llama_index.readers.docling' requirements.txt > requirements_filtered.txt && \
+        pip install --no-cache-dir -r requirements_filtered.txt; \
+    fi
+
+# Copy the entire project directory into the container.
+COPY . .
+
+# Expose the port on which the Gradio app will listen.
+EXPOSE 7860
+# Define the default command to run the Gradio application.
+CMD ["python", "app.py"]
diff --git a/app.py b/app.py
index efe394e..10bdf28 100644
--- a/app.py
+++ b/app.py
@@ -53,7 +53,6 @@
 model = AutoModelForVision2Seq.from_pretrained("ds4sd/SmolDocling-256M-preview",
                 #_attn_implementation="flash_attention_2"
         ).to("cuda")
-@spaces.GPU
 def model_inference(
     input_dict, history
 ):
@@ -151,4 +150,4 @@ demo = gr.ChatInterface(fn=model_inference, title="SmolDocling-256M: Ultra-compa
                        cache_examples=False
                        )
 
-demo.launch(debug=True)
\ No newline at end of file
+demo.launch(debug=True, server_name="0.0.0.0", server_port=7860)
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..d9e3b77
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,25 @@
+services:
+  docling_app:
+    build:
+      context: .
+      args:
+        ENABLE_CUDA: '1'
+    container_name: gradio_app
+    restart: unless-stopped
+    ports:
+      - "7860:7860" # Gradio application port
+    volumes:
+      - ./app/services/weaviate_service.py:/gradio_app/weaviate_service.py
+    environment:
+      ENABLE_CUDA: '1'
+      NVIDIA_VISIBLE_DEVICES: '3'
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              device_ids: ['3'] # Specify GPU 3
+              capabilities: [gpu]
+    shm_size: 1g
+    depends_on:
+      - rag_app
\ No newline at end of file
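
A minimal sketch of how the ENABLE_CUDA build argument is exercised; the
docling-app image tags below are illustrative, not taken from the patch:

    # Full install; matches what the compose file passes as a build arg.
    docker build --build-arg ENABLE_CUDA=1 -t docling-app:cuda .

    # CPU build: the grep branch strips the llama_index.readers.docling
    # line from requirements.txt before installing.
    docker build --build-arg ENABLE_CUDA=0 -t docling-app:cpu .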
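
To bring the service up with the GPU reservation, a sketch assuming the
NVIDIA Container Toolkit is installed on the host and that the rag_app
dependency is defined alongside this service in the final compose file:

    docker compose up -d --build docling_app

    # The gpu capability makes the NVIDIA runtime inject nvidia-smi into
    # the container; with device_ids ['3'], only that GPU should be listed.
    docker compose exec docling_app nvidia-smi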
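
Since demo.launch() now binds server_name="0.0.0.0" on port 7860, the
published port is reachable from outside the container; a simple liveness
check, assuming the compose stack runs on the local machine:

    # Expect 200 once the SmolDocling model has finished loading.
    curl -sf -o /dev/null -w "%{http_code}\n" http://localhost:7860/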