Skip to content

Commit

Permalink
loosebox for quotes by default, unless explicitly disabled
Browse files Browse the repository at this point in the history
  • Loading branch information
iammosespaulr committed Dec 3, 2024
1 parent 9001bc2 commit 0881a41
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 16 deletions.
18 changes: 9 additions & 9 deletions pdftext/extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,12 @@ def _load_pdf(pdf, flatten_pdf):
# Must be called on the parent pdf, before the page was retrieved
if flatten_pdf:
pdf.init_forms()

return pdf


def _get_page_range(page_range, flatten_pdf=False):
text_chars = get_pdfium_chars(pdf_doc, page_range, flatten_pdf)
def _get_page_range(page_range, flatten_pdf=False, quote_loosebox=True):
    """Extract characters for one chunk of pages and run inference on them.

    ``pdf_doc`` and ``model`` are not parameters; they are read from the
    enclosing module scope.
    NOTE(review): presumably both are set up per worker process by
    ``worker_init`` -- confirm against the full file.

    Args:
        page_range: The page indices to process; forwarded to
            ``get_pdfium_chars``.
        flatten_pdf: Forwarded unchanged to ``get_pdfium_chars``.
        quote_loosebox: Forwarded unchanged to ``get_pdfium_chars``.

    Returns:
        Whatever ``inference`` returns for the extracted characters.
    """
    chunk_chars = get_pdfium_chars(pdf_doc, page_range, flatten_pdf, quote_loosebox)
    return inference(chunk_chars, model)

Expand All @@ -44,17 +44,17 @@ def worker_init(pdf_path, flatten_pdf):
atexit.register(partial(worker_shutdown, pdf_doc))


def _get_pages(pdf_path, page_range=None, flatten_pdf=False, workers=None):
def _get_pages(pdf_path, page_range=None, flatten_pdf=False, quote_loosebox=True, workers=None):
pdf_doc = _load_pdf(pdf_path, flatten_pdf)
if page_range is None:
page_range = range(len(pdf_doc))

if workers is not None:
workers = min(workers, len(page_range) // settings.WORKER_PAGE_THRESHOLD) # It's inefficient to have too many workers, since we batch in inference
workers = min(workers, len(page_range) // settings.WORKER_PAGE_THRESHOLD) # It's inefficient to have too many workers, since we batch in inference

if workers is None or workers <= 1:
model = get_model()
text_chars = get_pdfium_chars(pdf_doc, page_range, flatten_pdf)
text_chars = get_pdfium_chars(pdf_doc, page_range, flatten_pdf, quote_loosebox)
pdf_doc.close()
return inference(text_chars, model)

Expand All @@ -65,7 +65,7 @@ def _get_pages(pdf_path, page_range=None, flatten_pdf=False, workers=None):
page_range_chunks = [page_range[i * pages_per_worker:(i + 1) * pages_per_worker] for i in range(workers)]

with ProcessPoolExecutor(max_workers=workers, initializer=worker_init, initargs=(pdf_path, flatten_pdf)) as executor:
pages = list(executor.map(_get_page_range, page_range_chunks, repeat(flatten_pdf)))
pages = list(executor.map(_get_page_range, page_range_chunks, repeat(flatten_pdf), repeat(quote_loosebox)))

ordered_pages = [page for sublist in pages for page in sublist]
return ordered_pages
Expand Down Expand Up @@ -94,8 +94,8 @@ def _process_span(span, page_width, page_height, keep_chars):
char["bbox"] = unnormalize_bbox(char["bbox"], page_width, page_height)


def dictionary_output(pdf_path, sort=False, page_range=None, keep_chars=False, flatten_pdf=False, workers=None):
pages = _get_pages(pdf_path, page_range, workers=workers, flatten_pdf=flatten_pdf)
def dictionary_output(pdf_path, sort=False, page_range=None, keep_chars=False, flatten_pdf=False, quote_loosebox=True, workers=None):
pages = _get_pages(pdf_path, page_range, workers=workers, flatten_pdf=flatten_pdf, quote_loosebox=quote_loosebox)
for page in pages:
page_width, page_height = page["width"], page["height"]
for block in page["blocks"]:
Expand Down
14 changes: 7 additions & 7 deletions pdftext/pdf/chars.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@


def update_previous_fonts(char_infos: List, i: int, prev_fontname: str, prev_fontflags: int, text_page, fontname_sample_freq: int):
min_update = max(0, i - fontname_sample_freq) # Minimum index to update
for j in range(i-1, min_update, -1): # Goes from i to min_update
min_update = max(0, i - fontname_sample_freq) # Minimum index to update
for j in range(i - 1, min_update, -1): # Walks backwards from i - 1 down to min_update + 1 (min_update itself is excluded)
fontname, fontflags = get_fontname(text_page, j)

# If we hit the region with the previous fontname, we can bail out
Expand All @@ -26,7 +26,7 @@ def flatten(page, flag=pdfium_c.FLAT_NORMALDISPLAY):
raise PdfiumError("Failed to flatten annotations / form fields.")


def get_pdfium_chars(pdf, page_range, flatten_pdf, fontname_sample_freq=settings.FONTNAME_SAMPLE_FREQ):
def get_pdfium_chars(pdf, page_range, flatten_pdf, quote_loosebox=True, fontname_sample_freq=settings.FONTNAME_SAMPLE_FREQ):
blocks = []

for page_idx in page_range:
Expand All @@ -39,7 +39,7 @@ def get_pdfium_chars(pdf, page_range, flatten_pdf, fontname_sample_freq=settings
# Flattening invalidates existing handles to the page.
# It is necessary to re-initialize the page handle after flattening.
page = pdf.get_page(page_idx)

text_page = page.get_textpage()
try:
page_rotation = page.get_rotation()
Expand Down Expand Up @@ -91,8 +91,8 @@ def get_pdfium_chars(pdf, page_range, flatten_pdf, fontname_sample_freq=settings
update_previous_fonts(char_infos, i, prev_fontname, prev_fontflags, text_page, fontname_sample_freq)

rotation = pdfium_c.FPDFText_GetCharAngle(text_page, i)
rotation = rotation * rad_to_deg # convert from radians to degrees
use_loosebox = rotation == 0 and not char == "'" # Loose doesn't work properly when charbox is rotated or when it's a quote
rotation = rotation * rad_to_deg # convert from radians to degrees
use_loosebox = rotation == 0 and (not char == "'" or quote_loosebox) # Loose doesn't work properly when the charbox is rotated; for a quote (') it is only used when quote_loosebox is enabled
coords = text_page.get_charbox(i, loose=use_loosebox)
device_coords = page_bbox_to_device_bbox(page, coords, page_width, page_height, page_rotation, normalize=True)

Expand All @@ -113,4 +113,4 @@ def get_pdfium_chars(pdf, page_range, flatten_pdf, fontname_sample_freq=settings
text_chars["chars"] = char_infos
text_chars["total_chars"] = total_chars
blocks.append(text_chars)
return blocks
return blocks

0 comments on commit 0881a41

Please sign in to comment.