-
Notifications
You must be signed in to change notification settings - Fork 79
/
Copy pathhocr-cut
executable file
·122 lines (102 loc) · 3.36 KB
/
hocr-cut
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/usr/bin/env python
import argparse
import os
import sys
from lxml import html
from PIL import Image, ImageDraw
def get_prop(node, name):
title = node.get('title')
if not title:
return None
props = title.split(';')
for prop in props:
(key, args) = prop.split(None, 1)
if key == name:
return args.strip('"')
return None
def get_bbox(node):
bbox = get_prop(node, 'bbox')
if not bbox:
return None
return tuple([int(x) for x in bbox.split()])
parser = argparse.ArgumentParser(
description=('Cut a page (horizontally) into two pages in the middle '
'such that the most of the bounding boxes are separated '
'nicely, e.g. cutting double pages or double columns')
)
parser.add_argument('file', nargs='?', default=sys.stdin)
parser.add_argument('-d', '--debug', action="store_true")
args = parser.parse_args()
doc = html.parse(args.file)
pages = doc.xpath("//*[@class='ocr_page']")
for page in pages:
try:
filename = get_prop(page, 'image')
filename = os.path.join(os.path.dirname(args.file), filename)
image = Image.open(filename)
debug_image = Image.open(filename)
dr = ImageDraw.Draw(debug_image)
image_found = True
except IOError:
print("Warning: Image not found!")
args.debug = False
image_found = False
bbox = get_bbox(page)
middle = bbox[2] / 2
left_ends = []
right_starts = []
for line in doc.xpath("//*[@class='ocr_line']"):
b = get_bbox(line)
if (b[0] > middle):
pos = "right"
elif (b[2] < middle):
pos = "left"
elif (b[2] - middle > middle - b[1]):
pos = "right"
else:
pos = "left"
if (pos == "right"):
right_starts.append(b[0])
if (args.debug):
dr.rectangle(b, fill=32)
else:
left_ends.append(b[2])
if (args.debug):
dr.rectangle(b, fill=96)
left_ends.sort()
right_starts.sort()
n = len(left_ends)
m = len(right_starts)
middle_left = left_ends[n // 2]
middle_right = right_starts[m // 2]
middle = int((middle_left + middle_right) / 2)
print("Cutting at", middle)
if (image_found):
if filename[-4] == ".":
name = filename[:-3]
suffix = filename[-3:]
else:
name = filename
suffix = ""
if (args.debug):
dr.line(
(middle_left, 0, middle_left, debug_image.size[1]),
fill=64,
width=3)
dr.line(
(middle_right, 0, middle_right, debug_image.size[1]),
fill=64,
width=3)
dr.line(
(middle, 0, middle, debug_image.size[1]), fill=128, width=5)
debug_output = name + "cut." + suffix
debug_image.save(debug_output)
print("debug output is saved in", debug_output)
left = image.crop((0, 0, middle, image.size[1]))
left_name = name + "left." + suffix
left.save(left_name)
print("left page is saved in", left_name)
right = image.crop((middle, 0, image.size[0], image.size[1]))
right_name = name + "right." + suffix
right.save(right_name)
print("right page is saved in", right_name)