From 61303b8155cb6d389cc4453ca83cc6a01934667d Mon Sep 17 00:00:00 2001 From: Ahmed Nassar Date: Mon, 17 Mar 2025 13:20:29 +0000 Subject: [PATCH] Update app.py --- app.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/app.py b/app.py index f467107..b857022 100644 --- a/app.py +++ b/app.py @@ -99,7 +99,6 @@ def model_inference( "pixel_values": inputs.pixel_values, "attention_mask": inputs.attention_mask, "num_return_sequences": 1, - "no_repeat_ngram_size": 10, "max_new_tokens": 8192, } @@ -111,24 +110,30 @@ def model_inference( yield "..." buffer = "" - doctag_output = "" + full_output = "" for new_text in streamer: - if new_text != "": - buffer += html.escape(new_text) - doctag_output += new_text + full_output += new_text + buffer += html.escape(new_text) yield buffer - if any(tag in doctag_output for tag in ["", "", "", "", ""]): - # final_output = buffer - # cleaned_output = final_output[len(inputs.input_ids):] if len(final_output) > prompt_length else final_output + cleaned_output = full_output.replace("", "").strip() + + if cleaned_output: + doctag_output = cleaned_output + yield cleaned_output + + if any(tag in doctag_output for tag in ["", "", "", "", ""]): doc = DoclingDocument(name="Document") if "" in doctag_output: doctag_output = doctag_output.replace("", "").replace("", "") doctag_output = re.sub(r'()(?!.*)<[^>]+>', r'\1', doctag_output) - + + print(doctag_output) + doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctag_output], images) doc.load_from_doctags(doctags_doc) + print(doc) yield f"**MD Output:**\n\n{doc.export_to_markdown()}" examples=[[{"text": "Convert this page to docling.", "files": ["example_images/2d0fbcc50e88065a040a537b717620e964fb4453314b71d83f3ed3425addcef6.png"]}],