try deploy on a10

Maurizio Dipierro 2025-03-19 16:19:59 +01:00
parent 12df581e7f
commit ce73fe1ecf
3 changed files with 69 additions and 2 deletions

Dockerfile (new file, 43 additions)

@@ -0,0 +1,43 @@
# Use the official Python image as the base image.
FROM python:3.10-slim
# Declare build argument with a default value (0 means GPU not enabled)
ARG ENABLE_CUDA=0
# Set environment variables to optimize Python behavior in the container.
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
ENV ENABLE_CUDA=${ENABLE_CUDA}
# Set the working directory inside the container.
WORKDIR /app
# Install system dependencies.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        build-essential \
        curl \
        libmagic1 \
    && rm -rf /var/lib/apt/lists/*
# Copy the requirements file.
COPY requirements.txt .
# Conditionally install dependencies:
# - If ENABLE_CUDA=1, install all dependencies.
# - Otherwise, filter out GPU-specific dependencies.
RUN pip install --upgrade pip && \
    if [ "$ENABLE_CUDA" -eq "1" ]; then \
        pip install --no-cache-dir -r requirements.txt; \
    else \
        grep -v 'llama_index.readers.docling' requirements.txt > requirements_filtered.txt && \
        pip install --no-cache-dir -r requirements_filtered.txt; \
    fi
# Copy the entire project directory into the container.
COPY . .
# Expose the port on which the Gradio app will listen.
EXPOSE 7860
# Define the default command to run the Gradio application.
CMD ["python", "app.py"]

app.py (1 addition, 2 deletions)

@@ -53,7 +53,6 @@ model = AutoModelForVision2Seq.from_pretrained("ds4sd/SmolDocling-256M-preview",
     #_attn_implementation="flash_attention_2"
 ).to("cuda")
-@spaces.GPU
 def model_inference(
     input_dict, history
 ):
@@ -151,4 +150,4 @@ demo = gr.ChatInterface(fn=model_inference, title="SmolDocling-256M: Ultra-compa
     cache_examples=False
 )
-demo.launch(debug=True)
+demo.launch(debug=True, server_name="0.0.0.0", server_port=7860)
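
Binding the Gradio server to 0.0.0.0 is what makes the container's port mapping work: the default 127.0.0.1 bind would be unreachable from outside the container. A quick smoke test from the host once the container is up (assuming the 7860:7860 mapping below):

# Should return the Gradio front-end HTML
curl -s http://localhost:7860 | head -n 5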

docker-compose.yml (new file, 25 additions)

@@ -0,0 +1,25 @@
services:
  docling_app:
    build:
      context: .
      args:
        ENABLE_CUDA: '1'
    container_name: gradio_app
    restart: unless-stopped
    ports:
      - "7860:7860" # Gradio application port
    volumes:
      - ./app/services/weaviate_service.py:/gradio_app/weaviate_service.py
    environment:
      ENABLE_CUDA: '1'
      NVIDIA_VISIBLE_DEVICES: '3'
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['3'] # Specify GPU 3
              capabilities: [gpu]
    shm_size: 1g
    depends_on:
      - rag_app
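
Assuming the rag_app service this depends on is defined alongside it and the NVIDIA Container Toolkit is installed on the host, a sketch of bringing the service up and checking that GPU 3 is visible:

# Build the image (with ENABLE_CUDA=1) and start the service in the background
docker compose up -d --build docling_app

# nvidia-smi is injected by the NVIDIA runtime; it should list exactly one GPU (device 3)
docker compose exec docling_app nvidia-smi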