diff --git a/.gitattributes b/.gitattributes
index a6344aa..58ddd8f 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+example_images/2d0fbcc50e88065a040a537b717620e964fb4453314b71d83f3ed3425addcef6.png filter=lfs diff=lfs merge=lfs -text
+example_images/annual_rep_14.png filter=lfs diff=lfs merge=lfs -text
+example_images/annual_rep_15.png filter=lfs diff=lfs merge=lfs -text
+example_images/gazette_de_france.jpg filter=lfs diff=lfs merge=lfs -text
+example_images/paper_3.png filter=lfs diff=lfs merge=lfs -text
+example_images/redhat.png filter=lfs diff=lfs merge=lfs -text
diff --git a/README.md b/README.md
index a0630cb..22734aa 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,13 @@
 ---
-title: SmolDocling 256M Demo
-emoji: 🖼
-colorFrom: purple
-colorTo: red
+title: SmolVLM
+emoji: 📊
+colorFrom: blue
+colorTo: green
 sdk: gradio
-sdk_version: 5.0.1
+sdk_version: 5.12.0
 app_file: app.py
 pinned: false
+license: apache-2.0
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
\ No newline at end of file
diff --git a/app.py b/app.py
index 652dc45..ec243ea 100644
--- a/app.py
+++ b/app.py
@@ -1,154 +1,152 @@
 import gradio as gr
-import numpy as np
+from transformers import AutoProcessor, AutoModelForVision2Seq, TextIteratorStreamer
+from transformers.image_utils import load_image
+from threading import Thread
+import re
+import time
+import torch
+import spaces
+import ast
+import html
 import random
-
-# import spaces #[uncomment to use ZeroGPU]
-from diffusers import DiffusionPipeline
-import torch
+from PIL import Image, ImageOps
 
-device = "cuda" if torch.cuda.is_available() else "cpu"
-model_repo_id = "stabilityai/sdxl-turbo"  # Replace to the model you would like to use
+from docling_core.types.doc import DoclingDocument
+from docling_core.types.doc.document import DocTagsDocument
 
-if torch.cuda.is_available():
-    torch_dtype = torch.float16
-else:
-    torch_dtype = torch.float32
+def add_random_padding(image, min_percent=0.1, max_percent=0.10):
+    # Pad the image on all sides with the top-left corner color, so table and
+    # code crops don't sit flush against the image border.
+    image = image.convert("RGB")
 
-pipe = DiffusionPipeline.from_pretrained(model_repo_id, torch_dtype=torch_dtype)
-pipe = pipe.to(device)
+    width, height = image.size
 
-MAX_SEED = np.iinfo(np.int32).max
-MAX_IMAGE_SIZE = 1024
+    pad_w_percent = random.uniform(min_percent, max_percent)
+    pad_h_percent = random.uniform(min_percent, max_percent)
+
+    pad_w = int(width * pad_w_percent)
+    pad_h = int(height * pad_h_percent)
+
+    corner_pixel = image.getpixel((0, 0))  # Top-left corner
+    padded_image = ImageOps.expand(image, border=(pad_w, pad_h, pad_w, pad_h), fill=corner_pixel)
+
+    return padded_image
+
+def normalize_values(text, target_max=500):
+    # Rescale bracketed coordinate lists in the query (e.g. "[47, 531, 167, 565]")
+    # to the model's 0-500 grid and rewrite them as <loc_*> location tokens.
+    def normalize_list(values):
+        max_value = max(values) if values else 1
+        return [round((v / max_value) * target_max) for v in values]
+
+    def process_match(match):
+        num_list = ast.literal_eval(match.group(0))
+        normalized = normalize_list(num_list)
+        return "".join([f"<loc_{num}>" for num in normalized])
+
+    pattern = r"\[([\d\.\s,]+)\]"
+    normalized_text = re.sub(pattern, process_match, text)
+    return normalized_text
 
-# @spaces.GPU #[uncomment to use ZeroGPU]
-def infer(
-    prompt,
-    negative_prompt,
-    seed,
-    randomize_seed,
-    width,
-    height,
-    guidance_scale,
-    num_inference_steps,
-    progress=gr.Progress(track_tqdm=True),
-):
-    if randomize_seed:
-        seed = random.randint(0, MAX_SEED)
+processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
+model = AutoModelForVision2Seq.from_pretrained(
+    "ds4sd/SmolDocling-256M-preview",
+    torch_dtype=torch.bfloat16,
+    # _attn_implementation="flash_attention_2"
+).to("cuda")
 
-    generator = torch.Generator().manual_seed(seed)
 
+@spaces.GPU
+def model_inference(input_dict, history):
+    text = input_dict["text"]
+    print(input_dict["files"])
+    # Pad table/code images; load everything else as-is.
+    if len(input_dict["files"]) > 1:
+        if "OTSL" in text or "code" in text:
+            images = [add_random_padding(load_image(image)) for image in input_dict["files"]]
+        else:
+            images = [load_image(image) for image in input_dict["files"]]
 
-    image = pipe(
-        prompt=prompt,
-        negative_prompt=negative_prompt,
-        guidance_scale=guidance_scale,
-        num_inference_steps=num_inference_steps,
-        width=width,
-        height=height,
-        generator=generator,
-    ).images[0]
+    elif len(input_dict["files"]) == 1:
+        if "OTSL" in text or "code" in text:
+            images = [add_random_padding(load_image(input_dict["files"][0]))]
+        else:
+            images = [load_image(input_dict["files"][0])]
 
-    return image, seed
+    else:
+        images = []
 
+    if text == "" and not images:
+        raise gr.Error("Please input a query and optionally image(s).")
 
-examples = [
-    "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
-    "An astronaut riding a green horse",
-    "A delicious ceviche cheesecake slice",
-]
+    if text == "" and images:
+        raise gr.Error("Please input a text query alongside the image(s).")
 
-css = """
-#col-container {
-    margin: 0 auto;
-    max-width: 640px;
-}
-"""
+    if "OCR the text" in text or "Identify element" in text or "formula" in text:
+        text = normalize_values(text, target_max=500)
 
-with gr.Blocks(css=css) as demo:
-    with gr.Column(elem_id="col-container"):
-        gr.Markdown(" # Text-to-Image Gradio Template")
+    resulting_messages = [
+        {
+            "role": "user",
+            "content": [{"type": "image"} for _ in range(len(images))] + [
+                {"type": "text", "text": text}
+            ]
+        }
+    ]
+    prompt = processor.apply_chat_template(resulting_messages, add_generation_prompt=True)
+    inputs = processor(text=prompt, images=[images], return_tensors="pt").to('cuda')
 
-        with gr.Row():
-            prompt = gr.Text(
-                label="Prompt",
-                show_label=False,
-                max_lines=1,
-                placeholder="Enter your prompt",
-                container=False,
+    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=False)
+    generation_args = dict(
+        inputs,
+        streamer=streamer,
+        num_return_sequences=1,
+        no_repeat_ngram_size=10,
+        max_new_tokens=8192,
+    )
+
+    # Generate in a background thread so tokens can be streamed to the UI.
+    thread = Thread(target=model.generate, kwargs=generation_args)
+    thread.start()
+
+    yield "..."
+    buffer = ""
+    doctag_output = ""
+
+    for new_text in streamer:
+        if new_text != "":
+            buffer += html.escape(new_text)
+            doctag_output += new_text
+            yield buffer
+
+    # If the output is DocTags markup, parse it with docling-core and render Markdown.
+    if any(tag in doctag_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
+        doc = DoclingDocument(name="Document")
+        if "<chart>" in doctag_output:
+            # Charts are represented as OTSL tables; also strip the stray tag
+            # that can follow the final <loc_500> token.
+            doctag_output = doctag_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
+            doctag_output = re.sub(r'(<loc_500>)(?!.*<loc_500>)<[^>]+>', r'\1', doctag_output)
+
+        doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctag_output], images)
+        doc.load_from_doctags(doctags_doc)
+        yield f"**MD Output:**\n\n{doc.export_to_markdown()}"
+
+examples = [
+    [{"text": "Convert this page to docling.", "files": ["example_images/2d0fbcc50e88065a040a537b717620e964fb4453314b71d83f3ed3425addcef6.png"]}],
+    [{"text": "Convert this table to OTSL.", "files": ["example_images/image-2.jpg"]}],
+    [{"text": "Convert code to text.", "files": ["example_images/7666.jpg"]}],
+    [{"text": "Convert formula to latex.", "files": ["example_images/2433.jpg"]}],
+    [{"text": "Convert chart to OTSL.", "files": ["example_images/06236926002285.png"]}],
+    [{"text": "OCR the text in location [47, 531, 167, 565]", "files": ["example_images/s2w_example.png"]}],
+    [{"text": "Extract all section header elements on the page.", "files": ["example_images/paper_3.png"]}],
+    [{"text": "Identify element at location [123, 413, 1059, 1061]", "files": ["example_images/redhat.png"]}],
+    [{"text": "Convert this page to docling.", "files": ["example_images/gazette_de_france.jpg"]}],
+]
+
+demo = gr.ChatInterface(
+    fn=model_inference,
+    title="SmolDocling-256M: Ultra-compact VLM for Document Conversion 💫",
+    description="Play with [ds4sd/SmolDocling-256M-preview](https://huggingface.co/ds4sd/SmolDocling-256M-preview) in this demo. To get started, upload an image and text or try one of the examples. This demo doesn't use history for the chat, so every chat you start is a new conversation.",
+    examples=examples,
+    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple"),
+    stop_btn="Stop Generation",
+    multimodal=True,
+    cache_examples=False,
+)
 
-            )
-
-            run_button = gr.Button("Run", scale=0, variant="primary")
-
-        result = gr.Image(label="Result", show_label=False)
-
-        with gr.Accordion("Advanced Settings", open=False):
-            negative_prompt = gr.Text(
-                label="Negative prompt",
-                max_lines=1,
-                placeholder="Enter a negative prompt",
-                visible=False,
-            )
-
-            seed = gr.Slider(
-                label="Seed",
-                minimum=0,
-                maximum=MAX_SEED,
-                step=1,
-                value=0,
-            )
-
-            randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
-
-            with gr.Row():
-                width = gr.Slider(
-                    label="Width",
-                    minimum=256,
-                    maximum=MAX_IMAGE_SIZE,
-                    step=32,
-                    value=1024,  # Replace with defaults that work for your model
-                )
-
-                height = gr.Slider(
-                    label="Height",
-                    minimum=256,
-                    maximum=MAX_IMAGE_SIZE,
-                    step=32,
-                    value=1024,  # Replace with defaults that work for your model
-                )
-
-            with gr.Row():
-                guidance_scale = gr.Slider(
-                    label="Guidance scale",
-                    minimum=0.0,
-                    maximum=10.0,
-                    step=0.1,
-                    value=0.0,  # Replace with defaults that work for your model
-                )
-
-                num_inference_steps = gr.Slider(
-                    label="Number of inference steps",
-                    minimum=1,
-                    maximum=50,
-                    step=1,
-                    value=2,  # Replace with defaults that work for your model
-                )
-
-        gr.Examples(examples=examples, inputs=[prompt])
-    gr.on(
-        triggers=[run_button.click, prompt.submit],
-        fn=infer,
-        inputs=[
-            prompt,
-            negative_prompt,
-            seed,
-            randomize_seed,
-            width,
-            height,
-            guidance_scale,
-            num_inference_steps,
-        ],
-        outputs=[result, seed],
-    )
-
-if __name__ == "__main__":
-    demo.launch()
+demo.launch(debug=True, share=True)
\ No newline at end of file
diff --git a/example_images/06236926002285.png b/example_images/06236926002285.png
new file mode 100644
index 0000000..9d26ce1
Binary files /dev/null and b/example_images/06236926002285.png differ
diff --git a/example_images/2433.jpg b/example_images/2433.jpg
new file mode 100644
index 0000000..2df9545
Binary files /dev/null and b/example_images/2433.jpg differ
diff --git a/example_images/2d0fbcc50e88065a040a537b717620e964fb4453314b71d83f3ed3425addcef6.png b/example_images/2d0fbcc50e88065a040a537b717620e964fb4453314b71d83f3ed3425addcef6.png
new file mode 100644
index 0000000..a3129dc
--- /dev/null
+++ b/example_images/2d0fbcc50e88065a040a537b717620e964fb4453314b71d83f3ed3425addcef6.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:069ec77320ef4de477397e03f39e482c9f755122864894d32dd48696059182a8
+size 296082
diff --git a/example_images/7666.jpg b/example_images/7666.jpg
new file mode 100644
index 0000000..bb4e3e6
Binary files /dev/null and b/example_images/7666.jpg differ
diff --git a/example_images/annual_rep_14.png b/example_images/annual_rep_14.png
new file mode 100644
index 0000000..014424d
--- /dev/null
+++ b/example_images/annual_rep_14.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a71d26f82d73293a93128d077e630e06cab96309ea7b56249187cb41042efbd0
+size 350720
diff --git a/example_images/annual_rep_15.png b/example_images/annual_rep_15.png
new file mode 100644
index 0000000..886e2ad
--- /dev/null
+++ b/example_images/annual_rep_15.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:077c718039fba1cfd412d6716129febb1b8bdc54f77f3a2dccc6ed4176846252
+size 269819
diff --git a/example_images/examples_invoice.png b/example_images/examples_invoice.png
new file mode 100644
index 0000000..5ab7bfb
Binary files /dev/null and b/example_images/examples_invoice.png differ
diff --git a/example_images/gazette_de_france.jpg b/example_images/gazette_de_france.jpg
new file mode 100644
index 0000000..d67c719
--- /dev/null
+++ b/example_images/gazette_de_france.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:70b054b6e9484f9cf5679ba712d252aa47c7a8a8fbc80cf238538b85f7386540
+size 434881
diff --git a/example_images/image-2.jpg b/example_images/image-2.jpg
new file mode 100644
index 0000000..eefb89d
Binary files /dev/null and b/example_images/image-2.jpg differ
diff --git a/example_images/paper_3.png b/example_images/paper_3.png
new file mode 100644
index 0000000..21bd209
--- /dev/null
+++ b/example_images/paper_3.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bee89438e58beb702aa6940c002d3ff7e5dfd2bae8e697164e718f2170014d6f
+size 430757
diff --git a/example_images/redhat.png b/example_images/redhat.png
new file mode 100644
index 0000000..05ad164
--- /dev/null
+++ b/example_images/redhat.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6579bffac204ed8641e52f274c24476fbbd616257f5db5e6a01df670cd9ec0a7
+size 247542
diff --git a/example_images/s2w_example.png b/example_images/s2w_example.png
new file mode 100644
index 0000000..5dcd4b0
Binary files /dev/null and b/example_images/s2w_example.png differ
diff --git a/requirements.txt b/requirements.txt
index 73d01db..a0e32a1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,8 @@
-accelerate
-diffusers
-invisible_watermark
 torch
+accelerate
+huggingface_hub
+gradio
 transformers
-xformers
\ No newline at end of file
+spaces
+docling
+docling-core
\ No newline at end of file