Deepseek-OCR 配套工具

使用实例(待补充)

https://www.bilibili.com/video/BV1gRCRBjEqf/

集成 Deepseek-OCR 配套 API

Deepseek-OCR API示例#API 代码 (稍微完善)

python
import re
import unicodedata
import tkinter as tk
from tkinter import messagebox, Frame, Label, Button, Text, filedialog, Entry
from tkinter import ttk  # Import for Notebook
import tkinter.font as tkFont
from PIL import Image, ImageTk, ImageGrab
import requests
import base64
import io
import threading
import json
import os

# ---------- CORE PROCESSING FUNCTIONS ----------
def normalize_math_chars(text):
    """Wrap runs of Unicode math symbols in ``$...$`` after normalizing them.

    A match starts with one or more characters from the three code-point
    ranges listed in the pattern and then extends over every following
    character up to (but not including) the next backslash. The matched span
    is NFKD-normalized, U+2212 MINUS SIGN is replaced by an ASCII hyphen, and
    the result is surrounded by dollar signs. Text outside a match is
    returned unchanged.
    """
    math_run = re.compile(
        r'[\U0001D434-\U0001D467\U0001D6A8-\U0001D6E1\U0001D7EC-\U0001D7F6]+(?:[^\\]+)?'
    )

    def _wrap(found):
        # NFKD folds the styled math letters/digits to their plain forms.
        plain = unicodedata.normalize('NFKD', found.group(0)).replace("−", "-")
        return "$" + plain + "$"

    return math_run.sub(_wrap, text)

def convert_latex(text):
    r"""Rewrite LaTeX ``\[...\]`` and ``\(...\)`` delimiters as ``$...$``.

    Both delimiter styles are matched non-greedily and may span newlines
    (``[\s\S]*?``). After the delimiter rewrite, the text is passed through
    normalize_math_chars so that Unicode math symbols are wrapped as well.

    Args:
        text: The source string (typically OCR output).

    Returns:
        The converted string.
    """
    # Display-math brackets: \[ ... \]  ->  $ ... $
    text = re.sub(r'\\\[([\s\S]*?)\\\]', r'$\1$', text)
    # Inline-math parentheses: \( ... \)  ->  $ ... $
    # The replacement template must be a complete raw string: "$", the
    # captured group, "$".
    text = re.sub(r'\\\(([\s\S]*?)\\\)', r'$\1$', text)
    text = normalize_math_chars(text)
    return text

# ---------- UI HELPER FUNCTIONS ----------
def copy_to_clipboard():
    """Copy the stripped contents of the output box to the clipboard.

    Shows a warning dialog instead when the output box is empty.
    """
    converted = output_text.get("1.0", tk.END).strip()
    if not converted:
        messagebox.showwarning("Warning", "No text to copy.")
        return

    # Replace whatever is on the clipboard rather than appending to it.
    root.clipboard_clear()
    root.clipboard_append(converted)
    messagebox.showinfo("Success", "Result copied to clipboard!")

def convert_action():
    """Convert the input box text and show the result in the output box.

    Shows a warning dialog when the input box is empty (after stripping).
    """
    source = input_text.get("1.0", tk.END).strip()
    if not source:
        messagebox.showwarning("Warning", "Please input some text to convert.")
        return

    # Convert first so the old output stays in place if conversion raises.
    rendered = convert_latex(source)
    output_text.delete("1.0", tk.END)
    output_text.insert(tk.END, rendered)
    output_text.see("1.0")  # scroll back to the top of the new result

def clear_input():
    """Remove all text from the input box."""
    first_index, last_index = "1.0", tk.END
    input_text.delete(first_index, last_index)

def select_all(event):
    """Select the whole content of the text widget that received *event*.

    Returns the string 'break', which tells Tk not to run any further
    bindings for this event.
    """
    target = event.widget
    target.tag_add("sel", "1.0", "end")
    return 'break'

# ---------- IMAGE OCR ----------
# Image most recently taken from the clipboard by paste_image(); None until
# a paste succeeds. run_image_ocr() reads it.
pasted_image = None
# PhotoImage built by paste_image() for the preview shown in image_label.
photo = None

def paste_image():
    """Grab an image from the clipboard and preview it in the image label.

    On success the full-size image is stored in the module-level
    ``pasted_image`` and a thumbnail sized to the label (minus a 10 px
    margin) is displayed. Any other clipboard content produces an info or
    warning dialog; unexpected failures produce an error dialog.
    """
    global pasted_image, photo
    try:
        clip = ImageGrab.grabclipboard()
        if not isinstance(clip, Image.Image):
            if clip:
                # Something is on the clipboard, but it is not an image object.
                messagebox.showinfo("Info", "Clipboard does not contain an image.")
            else:
                messagebox.showwarning("Warning", "No image found on clipboard.")
            return

        pasted_image = clip
        preview = pasted_image.copy()  # thumbnail() resizes in place
        max_width = image_label.winfo_width() - 10
        max_height = image_label.winfo_height() - 10
        preview.thumbnail((max_width, max_height))

        photo = ImageTk.PhotoImage(preview)
        image_label.config(image=photo, text="")
        # Also keep a reference on the widget itself.
        image_label.image = photo
    except Exception as e:
        messagebox.showerror("Error", f"Failed to paste image: {e}")

def run_image_ocr():
    """Start OCR of the pasted image on a background thread.

    Warns and returns when no image has been pasted. The prompt is read from
    the prompt box; a built-in default is used when the box is empty. The OCR
    button is disabled here and re-enabled by api_call_image_ocr.
    """
    if pasted_image is None:
        messagebox.showwarning("Warning", "Please paste an image first.")
        return

    fallback_prompt = "<|grounding|>Convert the document to markdown."
    prompt = prompt_text.get("1.0", tk.END).strip() or fallback_prompt

    ocr_button.config(state=tk.DISABLED, text="Processing...")
    # Run the HTTP request off the Tk thread.
    worker = threading.Thread(
        target=api_call_image_ocr,
        args=(pasted_image, prompt),
    )
    worker.start()

def api_call_image_ocr(image, prompt):
    """Send *image* and *prompt* to the local OCR chat-completions endpoint.

    run_image_ocr starts this function on a worker thread, so every widget
    update is scheduled onto the Tk event loop through ``root.after`` instead
    of being performed directly.

    Args:
        image: A PIL image; it is converted to RGB and JPEG-encoded.
        prompt: Text sent alongside the image in the user message.

    Returns:
        None. On success the recognized text is handed to
        update_ui_with_ocr_result; on failure an error dialog is scheduled.
        The OCR button is re-enabled in either case.
    """
    try:
        # Encode the image as a base64 JPEG data URI for the JSON payload.
        buffered = io.BytesIO()
        image.convert("RGB").save(buffered, format="JPEG")
        img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
        data_uri = f"data:image/jpeg;base64,{img_str}"

        payload = {
            "model": "deepseek-ocr",
            "messages": [{"role": "user", "content": [{"type": "text", "text": prompt}, {"type": "image_url", "image_url": {"url": data_uri}}]}],
            "stream": False
        }

        response = requests.post("http://localhost:8000/v1/chat/completions", json=payload, timeout=120)
        response.raise_for_status()
        result = response.json()
        ocr_text = result['choices'][0]['message']['content']
        root.after(0, update_ui_with_ocr_result, ocr_text)
    except requests.exceptions.RequestException as e:
        # Format the message now: Python unbinds `e` when the except block
        # ends, so a callback that formats it later would raise NameError
        # and the dialog would never appear.
        message = f"API request failed: {e}\n\nIs the flexible_app.py server running?"
        root.after(0, messagebox.showerror, "Error", message)
    except Exception as e:
        message = f"An error occurred during OCR: {e}"
        root.after(0, messagebox.showerror, "Error", message)
    finally:
        root.after(0, lambda: ocr_button.config(state=tk.NORMAL, text="DeepSeek OCR!"))

def update_ui_with_ocr_result(text):
    """Replace the input box contents with *text*, then run the conversion."""
    box = input_text
    box.delete("1.0", tk.END)
    box.insert(tk.END, text)
    # Refresh the output box from the text just inserted.
    convert_action()

# ---------- PDF DOCUMENT PROCESSING ----------
def rename_files_in_path(path):
    """Replace whitespace runs in file names under *path* with a hyphen.

    * If *path* is a directory, every entry returned by ``os.listdir`` whose
      name contains whitespace is renamed in place, and the directory path
      is returned unchanged.
    * If *path* is a file whose name contains whitespace, the file is renamed
      and the new path is returned.
    * A rename is skipped (with a printed note) when the target name already
      exists.

    A missing path produces a warning dialog and any exception raised while
    renaming produces an error dialog; in both cases the original *path* is
    returned.
    """
    if not os.path.exists(path):
        messagebox.showwarning("Warning", f"Input path does not exist: {path}")
        return path

    try:
        if os.path.isdir(path):
            for entry in os.listdir(path):
                if not re.search(r'\s', entry):
                    continue  # nothing to sanitize for this entry

                source = os.path.join(path, entry)
                target = os.path.join(path, re.sub(r'\s+', '-', entry))
                if os.path.exists(target):
                    print(f"Skipping rename: target '{target}' already exists.")
                    continue

                os.rename(source, target)
            return path

        if os.path.isfile(path):
            parent, name = os.path.split(path)
            if re.search(r'\s', name):
                target = os.path.join(parent, re.sub(r'\s+', '-', name))
                if os.path.exists(target):
                    print(f"Skipping rename: target '{target}' already exists.")
                    return path

                os.rename(path, target)
                return target
    except Exception as err:
        messagebox.showerror("Error", f"An error occurred during file renaming: {err}")
        return path

    # Neither branch changed anything.
    return path

def browse_input_file():
    """Let the user pick a PDF or image file and put it in the input entry.

    The entry is left untouched when the dialog is cancelled.
    """
    accepted_types = [
        ("Processable Files", "*.pdf *.png *.jpg *.jpeg *.bmp *.webp"),
        ("PDF Files", "*.pdf"),
        ("Image Files", "*.png *.jpg *.jpeg *.bmp *.webp"),
    ]
    chosen = filedialog.askopenfilename(
        title="Select a PDF or Image File",
        filetypes=accepted_types,
    )
    if not chosen:
        return

    pdf_input_path_entry.delete(0, tk.END)
    pdf_input_path_entry.insert(0, chosen)

def browse_input_folder():
    """Let the user pick an input folder and put it in the input entry.

    The entry is left untouched when the dialog is cancelled.
    """
    chosen = filedialog.askdirectory(title="Select a Folder with PDFs")
    if not chosen:
        return

    pdf_input_path_entry.delete(0, tk.END)
    pdf_input_path_entry.insert(0, chosen)

def browse_output_folder():
    """Let the user pick an output folder and put it in the output entry.

    The entry is left untouched when the dialog is cancelled.
    """
    chosen = filedialog.askdirectory(title="Select an Output Folder")
    if not chosen:
        return

    pdf_output_path_entry.delete(0, tk.END)
    pdf_output_path_entry.insert(0, chosen)

def run_pdf_processing():
    """Validate the PDF tab inputs and start the server request on a thread.

    Steps: require both paths, sanitize whitespace in the input file name(s)
    via rename_files_in_path (writing the possibly-updated path back into
    the entry), parse the optional page range, then start
    api_call_pdf_processing on a worker thread. The prefix value is added to
    both the start and end page when they are given.
    """
    raw_input_path = pdf_input_path_entry.get().strip()
    output_path = pdf_output_path_entry.get().strip()

    if not (raw_input_path and output_path):
        messagebox.showwarning("Warning", "Please provide both input path and output folder path.")
        return

    # Sanitize file names first and show the path actually sent to the server.
    input_path = rename_files_in_path(raw_input_path)
    pdf_input_path_entry.delete(0, tk.END)
    pdf_input_path_entry.insert(0, input_path)

    start_text = pdf_start_page_entry.get().strip()
    end_text = pdf_end_page_entry.get().strip()
    prefix_text = pdf_prefix_entry.get().strip()

    try:
        prefix = int(prefix_text) if prefix_text else 0

        # An empty field means "no bound" and is sent as None.
        first_page = None
        if start_text:
            first_page = int(start_text) + prefix

        last_page = None
        if end_text:
            last_page = int(end_text) + prefix
    except ValueError:
        messagebox.showwarning("Warning", "Page numbers and prefix must be integers.")
        return

    ocr_button.config(state=tk.DISABLED, text="Processing...")
    worker = threading.Thread(
        target=api_call_pdf_processing,
        args=(input_path, output_path, first_page, last_page),
    )
    worker.start()

def api_call_pdf_processing(input_path, output_path, first_page, last_page):
    """Ask the local server to process *input_path* via ``/ocr/process_path``.

    run_pdf_processing starts this function on a worker thread, so dialogs
    and widget updates are scheduled onto the Tk event loop through
    ``root.after``.

    Args:
        input_path: File or folder path sent as ``input_path``.
        output_path: Folder path sent as ``output_path``.
        first_page: First page number, or None when not specified.
        last_page: Last page number, or None when not specified.

    Returns:
        None. A success or error dialog is scheduled, and the OCR button is
        re-enabled in either case.
    """
    try:
        payload = {
            "input_path": input_path,
            "output_path": output_path,
            "config": {
                "pdf_config": {
                    "first_page": first_page,
                    "last_page": last_page
                }
            }
        }
        response = requests.post("http://localhost:8000/ocr/process_path", json=payload, timeout=20)
        response.raise_for_status()

        res_json = response.json()
        # Build the text here so a malformed response is reported by the
        # handlers below rather than failing inside the Tk callback.
        message = f"Processing started successfully on the server.\nInput: {res_json.get('input_path')}\nOutput: {res_json.get('output_path')}"
        root.after(0, messagebox.showinfo, "Success", message)

    except requests.exceptions.RequestException as e:
        # Format the message now: Python unbinds `e` when the except block
        # ends, so a callback that formats it later would raise NameError
        # and the dialog would never appear.
        message = f"API request failed: {e}\n\nIs the flexible_app.py server running?"
        root.after(0, messagebox.showerror, "Error", message)
    except Exception as e:
        message = f"An error occurred during PDF processing: {e}"
        root.after(0, messagebox.showerror, "Error", message)
    finally:
        root.after(0, lambda: ocr_button.config(state=tk.NORMAL, text="DeepSeek OCR!"))

# ---------- OCR DISPATCHER ----------
def dispatch_ocr_action():
    """Run the OCR routine that belongs to the currently selected tab.

    The tab is identified by its label text; any other label does nothing.
    """
    handlers = {
        "Image": run_image_ocr,
        "PDF": run_pdf_processing,
    }
    active_label = notebook.tab(notebook.select(), "text")
    handler = handlers.get(active_label)
    if handler is not None:
        handler()

# ---------- UI SETUP ----------
# Top-level application window.
root = tk.Tk()
root.title("OCR and LaTeX Converter v2.0")
root.geometry("900x800")

# Register a 12 pt Consolas font as the default "*Font" option.
default_font = tkFont.Font(family="Consolas", size=12)
root.option_add("*Font", default_font)

# Container for the left and right panels.
main_frame = Frame(root)
main_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=10)

# ==========================================
#               LEFT PANEL
# ==========================================
left_frame = Frame(main_frame, width=450)
left_frame.pack(side=tk.LEFT, fill=tk.BOTH, expand=True, padx=(0, 10))
# Turn off geometry propagation so the children do not resize this frame.
left_frame.pack_propagate(False)

# --- Notebook for Tabs ---
# The tab labels added below ("Image", "PDF") are the strings that
# dispatch_ocr_action compares against.
notebook = ttk.Notebook(left_frame)
notebook.pack(fill=tk.BOTH, expand=True)

# --- IMAGE TAB ---
image_tab_frame = Frame(notebook)
notebook.add(image_tab_frame, text="Image")

# Placeholder label that paste_image() later fills with the image preview.
image_label = Label(image_tab_frame, text="Click here\n\nCtrl+V to \n\npaste image", relief=tk.RIDGE, bd=2,justify="center",anchor="center")
image_label.pack(fill=tk.BOTH, expand=True, padx=5, pady=5)
# Bind both lower- and upper-case keysyms for Ctrl+V.
image_label.bind("<Control-v>", lambda e: paste_image())
image_label.bind("<Control-V>", lambda e: paste_image())
# Clicking the label gives it keyboard focus.
image_label.bind("<Button-1>", lambda e: e.widget.focus_set())

# --- PDF TAB ---
pdf_tab_frame = Frame(notebook)
notebook.add(pdf_tab_frame, text="PDF")

# Padded container; all PDF controls are laid out in its grid.
pdf_controls_frame = Frame(pdf_tab_frame)
pdf_controls_frame.pack(fill=tk.BOTH, expand=True, padx=15, pady=20)

# Single grid column that takes all extra horizontal space.
pdf_controls_frame.columnconfigure(0, weight=1)

# --- Input Path ---
Label(pdf_controls_frame, text="Input Path:").grid(row=0, column=0, sticky="w", pady=(5, 0))

input_frame = Frame(pdf_controls_frame)
input_frame.grid(row=1, column=0, sticky="ew", pady=(0, 10))
input_frame.columnconfigure(0, weight=1) # Entry inside this frame should expand

# Read by run_pdf_processing(); written by the browse_input_* callbacks.
pdf_input_path_entry = Entry(input_frame)
pdf_input_path_entry.grid(row=0, column=0, sticky="ew")

# Frame for buttons to keep them together
input_btn_frame = Frame(input_frame)
input_btn_frame.grid(row=0, column=1, sticky="e", padx=(5, 0))
Button(input_btn_frame, text="File", command=browse_input_file).pack(side=tk.LEFT)
Button(input_btn_frame, text="Folder", command=browse_input_folder).pack(side=tk.LEFT, padx=(2, 0))

# --- Output Path ---
Label(pdf_controls_frame, text="Output Folder:").grid(row=2, column=0, sticky="w", pady=(5, 0))

output_frame = Frame(pdf_controls_frame)
output_frame.grid(row=3, column=0, sticky="ew", pady=(0, 10))
output_frame.columnconfigure(0, weight=1) # Entry inside this frame should expand

# Read by run_pdf_processing(); written by browse_output_folder().
pdf_output_path_entry = Entry(output_frame)
pdf_output_path_entry.grid(row=0, column=0, sticky="ew")
Button(output_frame, text="Browse...", command=browse_output_folder).grid(row=0, column=1, padx=(5, 0))

# --- Page Range ---
# All three fields are optional; run_pdf_processing() parses them as integers.
Label(pdf_controls_frame, text="Page Range (opt):").grid(row=4, column=0, sticky="w", pady=(5, 0))

page_frame = Frame(pdf_controls_frame)
page_frame.grid(row=5, column=0, sticky="w", pady=(0, 10))

# Prefix: value added to both Start and End by run_pdf_processing().
Label(page_frame, text="Prefix:").grid(row=0, column=0, sticky="w", pady=(0, 2))
pdf_prefix_entry = Entry(page_frame, width=10)
pdf_prefix_entry.insert(0, "0") # Default value
pdf_prefix_entry.grid(row=0, column=1, sticky="w", padx=(5, 0), pady=(0, 2))

# Start: first page; left empty it is sent as None.
Label(page_frame, text="Start:").grid(row=1, column=0, sticky="w", pady=(0, 2))
pdf_start_page_entry = Entry(page_frame, width=10)
pdf_start_page_entry.grid(row=1, column=1, sticky="w", padx=(5, 0), pady=(0, 2))

# End: last page; left empty it is sent as None.
Label(page_frame, text="End:").grid(row=2, column=0, sticky="w", pady=(0, 2))
pdf_end_page_entry = Entry(page_frame, width=10)
pdf_end_page_entry.grid(row=2, column=1, sticky="w", padx=(5, 0), pady=(0, 2))


# --- Unified OCR Button ---
# One button for both tabs; dispatch_ocr_action() picks the routine from the
# selected tab. The run_*/api_call_* functions toggle its state and label.
ocr_button = Button(left_frame, text="DeepSeek OCR!", command=dispatch_ocr_action, height=2, bg="#D0E0FF")
ocr_button.pack(fill=tk.X, pady=3)

# ==========================================
#               RIGHT PANEL
# ==========================================
right_frame = Frame(main_frame, width=450)
right_frame.pack(side=tk.RIGHT, fill=tk.BOTH, expand=True)
# Turn off geometry propagation so the children do not resize this frame.
right_frame.pack_propagate(False)

# --- Prompt Area ---
# Read by run_image_ocr(); pre-filled with the default prompt.
Label(right_frame, text="Prompt (for Image OCR):").pack(anchor="w")
prompt_text = Text(right_frame, height=2)
prompt_text.pack(fill=tk.X)
prompt_text.insert("1.0", "<|grounding|>Convert the document to markdown.")
prompt_text.bind("<Control-a>", select_all)

# --- Input Area ---
# Holds hand-typed text or the OCR result; convert_action() reads it.
Label(right_frame, text="Input / OCR Result:").pack(anchor="w", pady=(10, 0))
input_text = Text(right_frame, height=2)
input_text.pack(fill=tk.BOTH, expand=True)
input_text.bind("<Control-a>", select_all)

# --- Output Area ---
# Written by convert_action(); read by copy_to_clipboard().
Label(right_frame, text="Converted LaTeX:").pack(anchor="w", pady=(10, 0))
output_text = Text(right_frame, height=2)
output_text.pack(fill=tk.BOTH, expand=True)
output_text.bind("<Control-a>", select_all)

# --- Bottom Buttons ---
bottom_button_frame = Frame(right_frame)
bottom_button_frame.pack(fill=tk.X, pady=5)
Button(bottom_button_frame, text="Clear Input", command=clear_input).pack(side=tk.LEFT)
Button(bottom_button_frame, text="Convert", command=convert_action).pack(side=tk.LEFT, padx=5)
Button(bottom_button_frame, text="Copy Result", command=copy_to_clipboard).pack(side=tk.RIGHT)

# Enter the Tk event loop; this call blocks until the window is closed.
root.mainloop()