Skip to content

Commit

Permalink
rewrite
Browse files Browse the repository at this point in the history
  • Loading branch information
yuzhichang committed Sep 17, 2018
1 parent f95705a commit b16e633
Show file tree
Hide file tree
Showing 61 changed files with 297 additions and 6,754 deletions.
Binary file removed data_generator/background/1.png
Binary file not shown.
Binary file removed data_generator/background/10.png
Binary file not shown.
Binary file removed data_generator/background/11.png
Binary file not shown.
Binary file removed data_generator/background/12.png
Binary file not shown.
Binary file removed data_generator/background/13.png
Binary file not shown.
Binary file removed data_generator/background/14.png
Binary file not shown.
Binary file removed data_generator/background/15.png
Binary file not shown.
Binary file removed data_generator/background/16.png
Binary file not shown.
Binary file removed data_generator/background/17.png
Binary file not shown.
Binary file removed data_generator/background/18.png
Binary file not shown.
Binary file removed data_generator/background/19.png
Binary file not shown.
Binary file removed data_generator/background/2.png
Binary file not shown.
Binary file removed data_generator/background/20.png
Binary file not shown.
Binary file removed data_generator/background/21.png
Binary file not shown.
Binary file removed data_generator/background/22.png
Binary file not shown.
Binary file removed data_generator/background/23.png
Binary file not shown.
Binary file removed data_generator/background/24.png
Binary file not shown.
Binary file removed data_generator/background/25.png
Binary file not shown.
Binary file removed data_generator/background/26.png
Binary file not shown.
Binary file removed data_generator/background/27.png
Binary file not shown.
Binary file removed data_generator/background/28.png
Binary file not shown.
Binary file removed data_generator/background/29.png
Binary file not shown.
Binary file removed data_generator/background/3.png
Binary file not shown.
Binary file removed data_generator/background/30.png
Binary file not shown.
Binary file removed data_generator/background/31.png
Binary file not shown.
Binary file removed data_generator/background/32.png
Diff not rendered.
Binary file removed data_generator/background/33.png
Diff not rendered.
Binary file removed data_generator/background/34.png
Diff not rendered.
Binary file removed data_generator/background/4.png
Diff not rendered.
Binary file removed data_generator/background/5.png
Diff not rendered.
Binary file removed data_generator/background/6.png
Diff not rendered.
Binary file removed data_generator/background/7.png
Diff not rendered.
Binary file removed data_generator/background/8.png
Diff not rendered.
Binary file removed data_generator/background/9.png
Diff not rendered.
Binary file removed data_generator/background/fapiao2.jpg
Diff not rendered.
6,498 changes: 0 additions & 6,498 deletions data_generator/dict-chinese.txt

This file was deleted.

Binary file added data_generator/fonts_Chinese/STSONG.TTF
Binary file not shown.
Binary file added data_generator/fonts_Chinese/simfang.ttf
Binary file not shown.
Binary file added data_generator/fonts_Chinese/simkai.ttf
Binary file not shown.
Binary file not shown.
Binary file removed data_generator/fonts_English/ARIALNB.TTF
Binary file not shown.
Binary file removed data_generator/fonts_English/Amazon-Ember-Bold.ttf
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file removed data_generator/fonts_English/Arial Black.TTF
Binary file not shown.
Binary file removed data_generator/fonts_English/Arial.ttf
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file removed data_generator/fonts_English/Gotham-Ultra.otf
Binary file not shown.
Binary file removed data_generator/fonts_English/GothamRnd-Bold.otf
Binary file not shown.
Binary file removed data_generator/fonts_English/Verdana.ttf
Binary file not shown.
Binary file removed data_generator/fonts_English/arialbd.ttf
Binary file not shown.
Binary file removed data_generator/fonts_English/arialbi.ttf
Binary file not shown.
Binary file removed data_generator/fonts_English/attic.ttf
Binary file not shown.
Binary file removed data_generator/fonts_English/buttzill.ttf
Binary file not shown.
Binary file removed data_generator/fonts_English/tahomab0.ttf
Binary file not shown.
Binary file removed data_generator/fonts_English/verdanab.ttf
Binary file not shown.
Binary file removed data_generator/fonts_English/verdanaz.ttf
Binary file not shown.
297 changes: 297 additions & 0 deletions data_generator/gen.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,297 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Install dependencies:
# sudo pip install pathlib2 Pillow pygame scikit-image

import os
import re
import codecs
import shutil
import random
import inspect
import glob
import pygame
import pygame.locals
import pygame.image
import pygame.freetype
import pygame.transform
import pygame.draw
import pygame.surfarray
from pygame import Color
import numpy as np
from PIL import Image, ImageOps
try:
# The goal of pathlib2 is to provide a backport of standard pathlib module.
# refers to https://pypi.org/project/pathlib2/
from pathlib2 import Path
except Exception:
# pathlib is new in version 3.4.
# refers to https://docs.python.org/3/library/pathlib.html
from pathlib import Path

# https://stackoverflow.com/questions/21129020/how-to-fix-unicodedecodeerror-ascii-codec-cant-decode-byte
# The default encoding of Python 2 is 'ascii', Python 3 is 'utf-8'.
import sys
if sys.getdefaultencoding() != 'utf-8':
reload(sys)
sys.setdefaultencoding('utf-8')

# http://scikit-image.org/docs/dev/auto_examples/xx_applications/plot_morphology.html
import matplotlib.pyplot as plt
from skimage.morphology import erosion, dilation, opening, closing, white_tophat
from skimage.morphology import disk
from skimage import io


# ---- Output layout ----
# Generated samples are written below the current user's home directory.
OUTPUT_DIR = os.path.join(str(Path.home()), "generated")
OUTPUT_BATCH = 1000  # number of images stored per numbered sub-directory
OUTPUT_NUM = 1000 * OUTPUT_BATCH  # total images; a multiple of OUTPUT_BATCH

# ---- Image geometry (pixels) ----
IMG_WIDTH = 200
IMG_HEIGHT = 32
MAX_LEFT_BLANK = 10

# ---- Text rendering limits ----
MIN_FONT_SIZE = IMG_HEIGHT // 3
MAX_FONT_SIZE = IMG_HEIGHT - 4
MAX_TEXT_ANGLE = 0  # 0 means the rendered text is never rotated
MIN_TEXT_LEN = 2
MAX_TEXT_LEN = 10

# ---- Runtime state, assigned by main() and initChineseSource() ----
mainDir = fontsChinese = chiPhrases = None


def initChineseSource(fp):
global chiPhrases
patt = re.compile('[\s,;!?,。;!?]+')
txt = codecs.open(fp, 'r', encoding='UTF-8',
errors='ignore').read()
print(txt[:1024])
chiPhrases = re.split(patt, txt)


def putTextAndRotate(txt, fontName):
    '''
    Put text, rotate, and crop. Returns an img of size IMG_WIDTH * IMG_HEIGHT.

    txt: the text to draw. It is shortened from the right until its
        rendered rectangle is smaller than IMG_WIDTH x IMG_HEIGHT.
    fontName: path of the font file passed to pygame.freetype.Font.

    Returns (surface, text): a pygame surface scaled to
    IMG_WIDTH * IMG_HEIGHT and the text that was actually drawn (possibly
    shortened). Returns (None, None) when no text fits or no valid left gap
    exists, so the caller can retry with another sample.
    There shall be some small pixel gap (0~3) with left/top/bottom border.
    '''
    SURF_WIDTH = 800
    SURF_HEIGHT = 600
    surf = pygame.Surface((SURF_WIDTH, SURF_HEIGHT))
    surf.fill(Color(255, 255, 255, 0))  # white
    # determine font size
    fontSize = random.randint(MIN_FONT_SIZE, MAX_FONT_SIZE)
    font = pygame.freetype.Font(fontName, fontSize)
    if MAX_TEXT_ANGLE != 0:
        font.rotation = random.randint(-MAX_TEXT_ANGLE, MAX_TEXT_ANGLE)
    print(
        'font: %s, fontSize: %s, rotation: %s, text: %s' % (os.path.basename(fontName), fontSize, font.rotation, txt))

    # Drop trailing characters until the rendered text fits the target size.
    txtRect = font.get_rect(txt)
    while len(txt) > 0 and (txtRect.width >= IMG_WIDTH or txtRect.height >= IMG_HEIGHT):
        print('txtRect %s is not covered by %s*%s' %
              (txtRect, IMG_HEIGHT, IMG_WIDTH))
        txt = txt[:-1]
        txtRect = font.get_rect(txt)
    if len(txt) == 0:
        print('txt become empty')
        return None, None

    # Integer division keeps the coordinates ints on Python 3 too, so the
    # render position and the crop rectangle never contain floats.
    txtX = SURF_WIDTH // 2
    txtY = SURF_HEIGHT // 2
    font.render_to(surf, (txtX, txtY), txt)

    # constraints:
    #   0 <= gapX <= 3
    #   cropX + gapX = txtX
    #   gapX >= -txtRect.left
    #   gapX + txtRect.width <= IMG_WIDTH
    #   0 <= gapY1 <= 3
    #   0 <= gapY2 <= 3
    #   gapY1 = txtY - cropY
    #   gapY1 + gapY2 + txtRect.height <= IMG_HEIGHT
    minGapX = max(0, -txtRect.left)
    maxGapX = min(IMG_WIDTH - txtRect.width, 3)
    if minGapX > maxGapX:
        # random.randint would raise ValueError on an empty range; report
        # the sample as unusable instead of aborting the whole run.
        print('no valid left gap for txtRect %s' % (txtRect,))
        return None, None
    gapX = random.randint(minGapX, maxGapX)
    maxGapY = min(3, (IMG_HEIGHT - txtRect.height) // 2)
    gapY1 = random.randint(0, maxGapY)
    gapY2 = random.randint(0, maxGapY)
    cropX = txtX - gapX
    cropY = txtY - gapY1
    ratioX = IMG_WIDTH * 1.0 / (txtRect.width + gapX)
    ratioY = IMG_HEIGHT * 1.0 / (txtRect.height + gapY1 + gapY2)
    assert(ratioX >= 1.0)
    assert(ratioY >= 1.0)
    # Crop a region with the target aspect ratio that is just large enough
    # for the text plus gaps, then scale it up to the target size.
    ratio = min(ratioX, ratioY)
    cropWidth = int(IMG_WIDTH * 1.0 / ratio)
    cropHeight = int(IMG_HEIGHT * 1.0 / ratio)
    cropRect = (cropX, cropY, cropWidth, cropHeight)
    print(
        'txtRect: %s, gapX: %s, gapY1: %s, gapY2: %s, cropRect: %s, ratioX: %s, ratioY: %s' % (txtRect, gapX, gapY1, gapY2, cropRect, ratioX, ratioY))
    surf = surf.subsurface(cropRect)
    surf = pygame.transform.scale(surf, (IMG_WIDTH, IMG_HEIGHT))
    return surf, txt


# http://www.xiaoliangbai.com/2016/09/09/more-on-image-noise-generation
# https://www.cnblogs.com/gongxijun/p/6114232.html
def noise_generator(noise_type, image):
    """
    Return a noisy version of a given image.

    Input parameters:
        noise_type: string
            'gauss'     Gaussian-distribution based noise
            'poisson'   Poisson-distribution based noise
            's&p'       Salt and Pepper noise, values forced to 255 or 0
            'speckle'   Multiplicative noise using out = image + n*image
                        where n is standard-normal noise
            Any other value returns the input image object unchanged.
        image: ndarray (input image data) of any rank. It is never
            modified in place.

    Returns an ndarray with the same shape as image. 'gauss' returns uint8,
    's&p' keeps the input dtype, 'poisson' and 'speckle' return floats.
    """
    if noise_type == "gauss":
        mean = 0.0
        var = 0.01
        sigma = var ** 0.5
        noisy = image + np.random.normal(mean, sigma, image.shape)
        # Clip before the cast so out-of-range values saturate instead of
        # wrapping around in uint8.
        return np.clip(noisy, 0, 255).astype('uint8')
    elif noise_type == "s&p":
        s_vs_p = 0.5
        amount = 0.004
        # Work on a copy so the caller's array is left untouched.
        out = image.copy()
        # Generate Salt noise. The index must be a tuple of per-axis
        # coordinate arrays: a list would be read as one index array along
        # axis 0. randint's upper bound is exclusive, so pass the axis length.
        num_salt = int(np.ceil(amount * image.size * s_vs_p))
        coords = tuple(np.random.randint(0, dim, num_salt)
                       for dim in image.shape)
        out[coords] = 255
        # Generate Pepper noise
        num_pepper = int(np.ceil(amount * image.size * (1. - s_vs_p)))
        coords = tuple(np.random.randint(0, dim, num_pepper)
                       for dim in image.shape)
        out[coords] = 0
        return out
    elif noise_type == "poisson":
        # Scale by the number of distinct values rounded up to a power of two.
        vals = len(np.unique(image))
        vals = 2 ** np.ceil(np.log2(vals))
        noisy = np.random.poisson(image * vals) / float(vals)
        return noisy
    elif noise_type == "speckle":
        gauss = np.random.randn(*image.shape)
        noisy = image + image * gauss
        return noisy
    else:
        return image


def addNoiseAndGray(surf):
    '''
    Turn a rendered surface into a noisy, eroded grayscale array.

    surf: pygame surface holding the rendered text.

    Steps: read the surface's RGB pixels, add salt-and-pepper noise,
    convert to grayscale through PIL, then apply a morphological erosion
    whose disk radius is randomly 0 or 1.

    Returns the eroded grayscale image as an ndarray.
    '''
    # https://stackoverflow.com/questions/34673424/how-to-get-numpy-array-of-rgb-colors-from-pygame-surface
    # Swap the first two axes so the array shall be IMG_HEIGHT * IMG_WIDTH.
    pixels = pygame.surfarray.array3d(surf).swapaxes(0, 1)
    noisy = noise_generator('s&p', pixels)

    gray = ImageOps.grayscale(Image.fromarray(np.uint8(noisy)))
    grayArray = np.asarray(np.uint8(gray))

    footprint = disk(random.randint(0, 1))
    return erosion(grayArray, footprint)


def generateText(leng):
    '''
    Pick a random font and a random text slice of exactly leng characters.

    leng: required number of characters in the returned text.

    A font is drawn from fontsChinese first. Phrases are then drawn from
    chiPhrases until one has at least leng characters. The slice may start
    at position 0, or at any position where the character there or the one
    before it has a code point >= 256; so a slice never begins between two
    characters that both have code points below 256.

    Returns (font, text).
    '''
    font = random.choice(fontsChinese)
    while True:
        phrase = random.choice(chiPhrases)
        if len(phrase) < leng:
            continue
        candidates = [
            start for start in range(len(phrase) - leng + 1)
            if start == 0
            or ord(phrase[start]) >= 256
            or ord(phrase[start - 1]) >= 256
        ]
        if not candidates:
            continue
        begin = random.choice(candidates)
        return font, phrase[begin:begin + leng]


def generate(total):
    '''
    Yield total (image, text) samples.

    total: number of samples to produce.

    Each round draws a random text length between MIN_TEXT_LEN and
    MAX_TEXT_LEN, picks a font and text, and renders it. Rounds whose
    rendering returns no surface are retried and do not count towards
    total. The rendered surface is passed through addNoiseAndGray before
    being yielded together with the text that was drawn.

    mainDir and fontsChinese must have been set before iteration starts.
    '''
    global mainDir, fontsChinese
    assert(mainDir is not None)
    assert(fontsChinese is not None)
    produced = 0
    while produced < total:
        leng = random.randint(MIN_TEXT_LEN, MAX_TEXT_LEN)
        font, text = generateText(leng)
        surf, text = putTextAndRotate(text, font)
        if surf is None:
            # Rendering failed; try again with a fresh sample.
            continue
        img = addNoiseAndGray(surf)
        yield img, text
        produced += 1


def main():
    '''
    Generate OUTPUT_NUM labelled text images below OUTPUT_DIR.

    Fonts are read from the fonts_Chinese directory next to this script
    (.ttf / .otf files). Corpus files are matched by the glob pattern
    newsgroup/corpus-*.txt, which is relative to the current working
    directory; the samples are spread evenly over them.

    Images are written as OUTPUT_DIR/<batch>/<i>.jpg with OUTPUT_BATCH
    images per batch directory. OUTPUT_DIR/labels.txt receives one line per
    image: the relative image path, a tab, and the text.

    Any existing OUTPUT_DIR is deleted first.

    Raises RuntimeError when no font file or no corpus file is found.
    '''
    global mainDir, fontsChinese
    pygame.init()

    # http://stackoverflow.com/questions/50499/how-do-i-get-the-path-and-name-of-the-file-that-is-currently-executing
    selfPath = os.path.abspath(inspect.getfile(inspect.currentframe()))
    mainDir, _ = os.path.split(selfPath)
    dirFonts = os.path.join(mainDir, 'fonts_Chinese')
    fontsChinese = [
        os.path.join(dirFonts, fn)
        for fn in os.listdir(dirFonts)
        if os.path.splitext(fn)[1].lower() in ('.ttf', '.otf')
    ]
    if not fontsChinese:
        raise RuntimeError('no .ttf/.otf font found in %s' % dirFonts)

    chiFiles = sorted(glob.glob('newsgroup/corpus-*.txt'))
    if not chiFiles:
        raise RuntimeError(
            'no corpus file matches newsgroup/corpus-*.txt')
    # Integer share per corpus file, so the modulo test below is exact on
    # Python 3 as well; at least 1 to avoid a modulo by zero.
    outputPerChiFile = max(1, OUTPUT_NUM // len(chiFiles))
    initChineseSource(chiFiles[0])
    chiIdx = 1

    # Inputs are validated above, so the previous output is only wiped once
    # the run can actually start.
    shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
    os.makedirs(OUTPUT_DIR)

    outDir = None
    # Explicit UTF-8 so the labels do not depend on the locale; the context
    # manager closes the file even when generation fails.
    with codecs.open(os.path.join(OUTPUT_DIR, "labels.txt"), 'w',
                     encoding='utf-8') as labels:
        for i, (im, text) in enumerate(generate(OUTPUT_NUM)):
            batch = i // OUTPUT_BATCH
            if i % OUTPUT_BATCH == 0:
                outDir = os.path.join(OUTPUT_DIR, str(batch))
                os.makedirs(outDir)
            # Move on to the next corpus file after each share; the bound
            # check covers an OUTPUT_NUM that is not an exact multiple of
            # the number of corpus files.
            if (i != 0 and i % outputPerChiFile == 0
                    and chiIdx < len(chiFiles)):
                initChineseSource(chiFiles[chiIdx])
                chiIdx += 1
            outf = os.path.join(outDir, '%s.jpg' % i)
            io.imsave(outf, im)  # scikit-image
            labels.write('%s/%s.jpg\t%s\n' % (batch, i, text))
            print('done %s.jpg, text: %s' % (i, text))


if __name__ == '__main__':
    main()
Loading

0 comments on commit b16e633

Please sign in to comment.