Skip to content

Commit

Permalink
about to add major surgery to text overlay for 90 degree
Browse files Browse the repository at this point in the history
  • Loading branch information
virantha committed Feb 24, 2016
1 parent 35abfe4 commit 0258945
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 15 deletions.
2 changes: 1 addition & 1 deletion CHANGES_RECENT.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@
Version Date Changes
------- -------- ------

v0.8.5 2/21/16 Better ctrl-c and cleanup behavior
v0.8.4 2/18/16 Maintenance release
v0.8.3 2/18/16 Bug fix for multiprocessing on windows, ctrl-c interrupt, and integer keywords
v0.8.2 12/8/14 Fixed imagemagick invocation on windows. Parallelized preprocessing and tesseract execution
v0.8.1 12/5/14 Added --skip-preprocess option, scan_interval option, and fixed too many open files bug during page overlay
v0.8.0 10/27/14 Added preprocessing to clean up prior to tesseract, bug fixes on file names with spaces/dots
v0.7.6 9/10/14 Fixed issue 17 rotation bug
======= ======== ======
92 changes: 79 additions & 13 deletions pypdfocr/pypdfocr_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,12 +49,38 @@

class PyPdf(object):
"""Class to create pdfs from images"""
# Regexes applied to hOCR "title" attribute strings; compiled once when the
# class body is executed rather than on every call.
# Raw string literals are used so the backslash escapes (\s, \d, \., \-) reach
# the regex engine untouched and do not trigger Python's invalid-escape
# warnings. The compiled patterns are unchanged.
regex_bbox = re.compile(r'bbox((\s+\d+){4})')  # group 1: four integers
regex_baseline = re.compile(r'baseline((\s+[\d\.\-]+){2})')  # group 1: two numbers
regex_fontspec = re.compile(r'x_font\s+(.+);\s+x_fsize\s+(\d+)')  # groups: name, size
regex_textangle = re.compile(r'textangle\s+(\d+)')  # group 1: integer angle

def __init__(self, gs):
    """Call ``load_invisible_font()`` and keep a reference to ghostscript.

    :param gs: ghostscript object, stored on the instance as ``self.gs``
    """
    self.load_invisible_font()
    self.gs = gs  # Pointer to ghostscript object

def get_transform(self, rotation, tx, ty):
    """Return the six coefficients of a rotation about the point (tx, ty).

    The 3x3 matrix is composed as: translate by (-tx, -ty), rotate by
    ``rotation`` degrees, then translate back by (tx, ty).

    Approach taken from:
    http://stackoverflow.com/questions/6041244/how-to-merge-two-landscape-pdf-pages-using-pypdf/17392824#17392824
    (Unclear why PyPDF2 builtin page rotation functions don't work.)

    :param rotation: rotation angle in degrees
    :param tx: x coordinate of the pivot point
    :param ty: y coordinate of the pivot point
    :returns: tuple of the first two columns of each of the three matrix
              rows, in row order
    """
    move_to_origin = [[1, 0, 0],
                      [0, 1, 0],
                      [-tx, -ty, 1]]

    theta = math.radians(rotation)
    cos_theta = math.cos(theta)
    sin_theta = math.sin(theta)
    rotate = [[cos_theta, sin_theta, 0],
              [-sin_theta, cos_theta, 0],
              [0, 0, 1]]

    move_back = [[1, 0, 0],
                 [0, 1, 0],
                 [tx, ty, 1]]

    ctm = utils.matrixMultiply(
        utils.matrixMultiply(move_to_origin, rotate), move_back)

    # Only the first two columns carry information; the third column of
    # each input matrix is the constant (0, 0, 1).
    return tuple(ctm[row][col] for row in range(3) for col in range(2))

def mergeRotateAroundPointPage(self,page, page2, rotation, tx, ty):
# Code taken from here:
# http://stackoverflow.com/questions/6041244/how-to-merge-two-landscape-pdf-pages-using-pypdf/17392824#17392824
Expand Down Expand Up @@ -103,7 +129,7 @@ def overlay_hocr_pages(self, dpi, hocr_filenames, orig_pdf_filename):

if orig_rotation_angle != 0:
logging.info("Original Rotation: %s" % orig_pg.get("/Rotate", 0))
self.mergeRotateAroundPointPage(orig_pg, text_pg, orig_rotation_angle, text_pg.mediaBox.getWidth()/2, text_pg.mediaBox.getWidth()/2)
self.mergeRotateAroundPointPage(orig_pg, text_pg, orig_rotation_angle, text_pg.mediaBox.getWidth()/2, text_pg.mediaBox.getHeight()/2)

# None of these commands worked for me:
#orig_pg.rotateCounterClockwise(orig_rotation_angle)
Expand All @@ -126,6 +152,21 @@ def overlay_hocr_pages(self, dpi, hocr_filenames, orig_pdf_filename):
logging.info("Created OCR'ed pdf as %s" % (pdf_filename))
return pdf_filename

def _merge_and_write_single_page(self, original_page, ocr_text_page):
"""
Merge the OCR text page onto the original page and return the original page.

The original page's '/Rotate' entry (defaulting to 0) selects the merge path:
non-zero goes through mergeRotateAroundPointPage, pivoting on the centre of
the text page's mediaBox; zero uses a plain mergePage.

NOTE(review): despite the name, nothing in this method writes to a file;
the merged page object is handed back to the caller.
"""
# '/Rotate' is looked up on the page with a default of 0 and coerced to int
orig_rotation_angle = int(original_page.get('/Rotate', 0))

if orig_rotation_angle != 0:
logging.info("Original Rotation: %s" % orig_rotation_angle)
# Pivot point is (width/2, height/2) of the OCR text page's mediaBox
self.mergeRotateAroundPointPage(original_page, ocr_text_page, orig_rotation_angle, ocr_text_page.mediaBox.getWidth()/2, ocr_text_page.mediaBox.getHeight()/2)
else:
original_page.mergePage(ocr_text_page)
# NOTE(review): presumably compression runs for both branches -- confirm
original_page.compressContentStreams()
return original_page


def _get_img_dims(self, img_filename):
"""
:rval: (width, height, dpi)
Expand Down Expand Up @@ -193,8 +234,6 @@ def natural_keys(self, text):

def add_text_layer(self,pdf, hocrfile, page_num,height, dpi):
"""Draw an invisible text layer for OCR data"""
p1 = re.compile('bbox((\s+\d+){4})')
p2 = re.compile('baseline((\s+[\d\.\-]+){2})')
hocr = ElementTree()
hocr.parse(hocrfile)
logging.debug(xml.etree.ElementTree.tostring(hocr.getroot()))
Expand All @@ -211,10 +250,17 @@ def add_text_layer(self,pdf, hocrfile, page_num,height, dpi):
#for line in page.findall(".//span"):
if line.attrib['class'] != 'ocr_line':
continue
linebox = p1.search(line.attrib['title']).group(1).split()
linebox = self.regex_bbox.search(line.attrib['title']).group(1).split()
textangle = self.regex_textangle.search(line.attrib['title'])
if textangle:
textangle = self._atoi(textangle.group(1))
print("---------------BOOOOOOM--------------------")
print(textangle)
else:
textangle = 0

try:
baseline = p2.search(line.attrib['title']).group(1).split()
baseline = self.regex_baseline.search(line.attrib['title']).group(1).split()
except AttributeError:
baseline = [ 0, 0 ]

Expand All @@ -236,25 +282,45 @@ def add_text_layer(self,pdf, hocrfile, page_num,height, dpi):

if word.text is None:
continue
font_width = pdf.stringWidth(word.text.strip(), 'invisible', 8)
if font_width <= 0:
continue
box = p1.search(word.attrib['title']).group(1).split()
#font_width = pdf.stringWidth(word.text.strip(), 'invisible', 8)
#if font_width <= 0:
#continue
box = self.regex_bbox.search(word.attrib['title']).group(1).split()
box = [float(i) for i in box]
b = self.polyval(baseline, (box[0] + box[2]) / 2 - linebox[0]) + linebox[3]
text = pdf.beginText()
text.setTextRenderMode(3) # double invisible
text.setFont('invisible', 8)
#text.setTextRenderMode(3) # double invisible
text.setTextRenderMode(0)
#text.setFont('invisible', 8)
font_name, font_size = self._get_font_spec(word.attrib['title'])
#logging.debug(font_name, font_size)
text.setFont('Helvetica', font_size)
text.setTextOrigin(box[0] * 72 / dpi, height - b * 72 / dpi)
box_width = (box[2] - box[0]) * 72 / dpi
text.setHorizScale(100.0 * box_width / font_width)
#box_width = (box[2] - box[0]) * 72 / dpi
#text.setHorizScale(100.0 * box_width / font_width)
text.textLine(word.text.strip())
#logging.debug( "Pg%s: %s" % (page_num,word.text.strip()))
#pdf.saveState()
if textangle != 0:
#pdf.rotate(textangle)
text.setTextTransform(*(self.get_transform(90,0,0)))
pass
pdf.drawText(text)
#pdf.restoreState()

def polyval(self, poly, x):
    """Evaluate the first-degree polynomial ``poly[0] * x + poly[1]``.

    :param poly: indexable pair (coefficient of x, constant term)
    :param x: point at which to evaluate
    :returns: value of the polynomial at ``x``
    """
    scaled = x * poly[0]
    return scaled + poly[1]


def _get_font_spec(self, tag):
    """Extract the font name and size from an hOCR ``title`` string.

    The string is searched with ``regex_fontspec``
    (``x_font <name>; x_fsize <digits>``).

    :param tag: the ``title`` attribute text to search
    :returns: ``(fontname, fontsize)``; when the title carries no font
              spec the fallback ``("", 8)`` is returned
    """
    match = self.regex_fontspec.search(tag)
    if match is None:
        # search() yields None on a miss; previously this crashed with
        # AttributeError on .groups() and the intended default was never
        # reachable. The default size is returned directly because it is
        # already an int.
        return ("", 8)

    # A successful match always has exactly two groups: name and size.
    fontname, fontsize = match.groups()
    return (fontname, self._atoi(fontsize))

# Glyphless variation of vedaal's invisible font retrieved from
# http://www.angelfire.com/pr/pgpf/if.html, which says:
# 'Invisible font' is unrestricted freeware. Enjoy, Improve, Distribute freely
Expand Down
2 changes: 1 addition & 1 deletion pypdfocr/pypdfocr_tesseract.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ def make_hocr_from_pnm(self, img_filename):
error(self.msgs['TS_img_MISSING'] + " %s" % (img_filename))

logging.info("Running OCR on %s to create %s.html" % (img_filename, basename))
cmd = '%s "%s" "%s" -psm 1 -l %s hocr' % (self.binary, img_filename, basename, self.lang)
cmd = '%s "%s" "%s" -psm 1 -c hocr_font_info=1 -l %s hocr' % (self.binary, img_filename, basename, self.lang)
logging.info(cmd)
try:
ret_output = subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT)
Expand Down

0 comments on commit 0258945

Please sign in to comment.