Skip to content

Commit

Permalink
fix: code format (yihong0618#181)
Browse files Browse the repository at this point in the history
* fix: code format

* fix: lint
  • Loading branch information
yihong0618 authored Mar 17, 2023
1 parent 6195ede commit 529749e
Show file tree
Hide file tree
Showing 8 changed files with 136 additions and 123 deletions.
14 changes: 9 additions & 5 deletions book_maker/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,6 @@ def main():
"--batch_size",
dest="batch_size",
type=int,
default=10,
help="how many lines will be translated by aggregated translation(This options currently only applies to txt files)",
)

Expand Down Expand Up @@ -260,12 +259,17 @@ def main():
model_api_base=model_api_base,
is_test=options.test,
test_num=options.test_num,
translate_tags=options.translate_tags,
allow_navigable_strings=options.allow_navigable_strings,
accumulated_num=options.accumulated_num,
prompt_config=parse_prompt_arg(options.prompt_arg),
batch_size=options.batch_size,
)
# other options
if options.allow_navigable_strings:
e.allow_navigable_strings = True
if options.translate_tags:
e.translate_tags = options.translate_tags
if options.accumulated_num > 1:
e.accumulated_num = options.accumulated_num
if options.batch_size:
e.batch_size = options.batch_size
e.make_bilingual_book()


Expand Down
136 changes: 28 additions & 108 deletions book_maker/loader/epub_loader.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import os
import re
import pickle
import tiktoken
import string
import sys
from copy import copy
from pathlib import Path
Expand All @@ -12,98 +11,17 @@
from rich import print
from tqdm import tqdm

from book_maker.utils import prompt_config_to_kwargs
from book_maker.utils import num_tokens_from_text, prompt_config_to_kwargs

from .base_loader import BaseBookLoader


class EPUBBookLoaderHelper:
    """Translate soup tags, inserting each translation right after its source tag."""

    def __init__(self, translate_model, accumulated_num):
        # translate_model must expose translate(text) and translate_list(tags).
        self.translate_model = translate_model
        # Threshold callers use to decide when to flush a batch of paragraphs.
        self.accumulated_num = accumulated_num

    def deal_new(self, p, wait_p_list):
        """Flush any pending batched tags, then translate *p* on its own."""
        self.deal_old(wait_p_list)
        new_p = copy(p)
        new_p.string = self.translate_model.translate(p.text)
        p.insert_after(new_p)

    def deal_old(self, wait_p_list):
        """Batch-translate the pending tags, inserting each result after its tag.

        The pending list is cleared afterwards.  If the model returns fewer
        translations than tags, the surplus tags are silently skipped
        (zip stops at the shorter sequence, matching the old bounds check).
        """
        if not wait_p_list:
            return

        result_txt_list = self.translate_model.translate_list(wait_p_list)

        # zip pairs each tag with its translation and naturally drops any
        # tags beyond len(result_txt_list), replacing the manual index check.
        for p, translated in zip(wait_p_list, result_txt_list):
            new_p = copy(p)
            new_p.string = translated
            p.insert_after(new_p)

        wait_p_list.clear()


# ref: https://platform.openai.com/docs/guides/chat/introduction
def num_tokens_from_text(text, model="gpt-3.5-turbo-0301"):
    """Return the chat-token count of *text* sent as a single user message.

    The count includes the per-message framing overhead of the
    gpt-3.5-turbo-0301 chat format, not just the encoded text.

    Raises:
        NotImplementedError: for models whose message token layout is unknown.
    """
    messages = (
        {
            "role": "user",
            "content": text,
        },
    )

    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # Unknown model name: fall back to the encoding used by chat models.
        encoding = tiktoken.get_encoding("cl100k_base")
    if model == "gpt-3.5-turbo-0301":  # note: future models may deviate from this
        num_tokens = 0
        for message in messages:
            num_tokens += (
                4  # every message follows <im_start>{role/name}\n{content}<im_end>\n
            )
            for key, value in message.items():
                num_tokens += len(encoding.encode(value))
                if key == "name":  # if there's a name, the role is omitted
                    num_tokens += -1  # role is always required and always 1 token
        num_tokens += 2  # every reply is primed with <im_start>assistant
        return num_tokens
    else:
        # Fixed: message used to name num_tokens_from_messages(), a function
        # that does not exist in this project (copy-paste from the cookbook).
        raise NotImplementedError(
            f"""num_tokens_from_text() is not presently implemented for model {model}.
See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens."""
        )


def is_link(text):
    """Return True if the stripped text begins with an http/https URL."""
    pattern = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    return re.match(pattern, text.strip()) is not None


def is_tail_Link(text, num=100):
    """Return True for short text (< num chars once stripped) ending in a URL."""
    stripped = text.strip()
    pattern = re.compile(
        r".*http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+$"
    )
    # Length gate first: both operands are pure, so the reordered `and`
    # yields exactly the same boolean as the original expression.
    return len(stripped) < num and pattern.match(stripped) is not None


def is_source(text):
    """Return True if the stripped text is a "Source: " attribution line."""
    stripped = text.strip()
    # Slice comparison is equivalent to startswith("Source: ").
    return stripped[:8] == "Source: "


def is_list(text, num=80):
    """Return True for a short (< num chars) line starting with "Listing <digits>"."""
    text = text.strip()
    # bool(...) so the function always returns a bool like its sibling
    # predicates, instead of leaking the re.Match object / None.
    return bool(re.match(r"^Listing\s*\d+", text)) and len(text) < num


def is_figure(text, num=80):
    """Return True for a short (< num chars) caption starting with "Figure <digits>"."""
    text = text.strip()
    # bool(...) so the function always returns a bool like its sibling
    # predicates, instead of leaking the re.Match object / None.
    return bool(re.match(r"^Figure\s*\d+", text)) and len(text) < num
from .helper import (
EPUBBookLoaderHelper,
is_text_figure,
is_text_link,
is_text_list,
is_text_source,
is_text_tail_link,
)


class EPUBBookLoader(BaseBookLoader):
Expand All @@ -114,14 +32,9 @@ def __init__(
key,
resume,
language,
batch_size,
model_api_base=None,
is_test=False,
test_num=5,
translate_tags="p",
allow_navigable_strings=False,
accumulated_num=1,
prompt_template=None,
prompt_config=None,
):
self.epub_name = epub_name
Expand All @@ -134,9 +47,9 @@ def __init__(
)
self.is_test = is_test
self.test_num = test_num
self.translate_tags = translate_tags
self.allow_navigable_strings = allow_navigable_strings
self.accumulated_num = accumulated_num
self.translate_tags = "p"
self.allow_navigable_strings = False
self.accumulated_num = 1
self.helper = EPUBBookLoaderHelper(self.translate_model, self.accumulated_num)

try:
Expand Down Expand Up @@ -165,7 +78,12 @@ def _load_spine(self):

@staticmethod
def _is_special_text(text):
return text.isdigit() or text.isspace() or is_link(text)
return (
text.isdigit()
or text.isspace()
or is_text_link(text)
or all(char in string.punctuation for char in text)
)

def _make_new_book(self, book):
new_book = epub.EpubBook()
Expand Down Expand Up @@ -206,13 +124,15 @@ def translate_paragraphs_acc(self, p_list, send_num):
temp_p = copy(p)
for sup in temp_p.find_all("sup"):
sup.extract()
if (
not p.text
or self._is_special_text(temp_p.text)
or is_source(temp_p.text)
or is_list(temp_p.text)
or is_figure(temp_p.text)
or is_tail_Link(temp_p.text)
if any(
[
not p.text,
self._is_special_text(temp_p.text),
is_text_source(temp_p.text),
is_text_list(temp_p.text),
is_text_figure(temp_p.text),
is_text_tail_link(temp_p.text),
]
):
if i == len(p_list) - 1:
self.helper.deal_old(wait_p_list)
Expand Down
58 changes: 58 additions & 0 deletions book_maker/loader/helper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import re
from copy import copy


class EPUBBookLoaderHelper:
    """Translate soup tags, inserting each translation right after its source tag."""

    def __init__(self, translate_model, accumulated_num):
        # translate_model must expose translate(text) and translate_list(tags).
        self.translate_model = translate_model
        # Threshold callers use to decide when to flush a batch of paragraphs.
        self.accumulated_num = accumulated_num

    def deal_new(self, p, wait_p_list):
        """Flush any pending batched tags, then translate *p* on its own."""
        self.deal_old(wait_p_list)
        new_p = copy(p)
        new_p.string = self.translate_model.translate(p.text)
        p.insert_after(new_p)

    def deal_old(self, wait_p_list):
        """Batch-translate the pending tags, inserting each result after its tag.

        The pending list is cleared afterwards.  If the model returns fewer
        translations than tags, the surplus tags are silently skipped
        (zip stops at the shorter sequence, matching the old bounds check).
        """
        if not wait_p_list:
            return

        result_txt_list = self.translate_model.translate_list(wait_p_list)

        # zip pairs each tag with its translation and naturally drops any
        # tags beyond len(result_txt_list), replacing the manual index check.
        for p, translated in zip(wait_p_list, result_txt_list):
            new_p = copy(p)
            new_p.string = translated
            p.insert_after(new_p)

        wait_p_list.clear()


def is_text_link(text):
    """Return True if the stripped text begins with an http/https URL."""
    pattern = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    return re.match(pattern, text.strip()) is not None


def is_text_tail_link(text, num=100):
    """Return True for short text (< num chars once stripped) ending in a URL."""
    stripped = text.strip()
    pattern = re.compile(
        r".*http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+$"
    )
    # Length gate first: both operands are pure, so the reordered `and`
    # yields exactly the same boolean as the original expression.
    return len(stripped) < num and pattern.match(stripped) is not None


def is_text_source(text):
    """Return True if the stripped text is a "Source: " attribution line."""
    stripped = text.strip()
    # Slice comparison is equivalent to startswith("Source: ").
    return stripped[:8] == "Source: "


def is_text_list(text, num=80):
    """Return True for a short (< num chars) line starting with "Listing <digits>"."""
    text = text.strip()
    # bool(...) so the function always returns a bool like its sibling
    # predicates, instead of leaking the re.Match object / None.
    return bool(re.match(r"^Listing\s*\d+", text)) and len(text) < num


def is_text_figure(text, num=80):
    """Return True for a short (< num chars) caption starting with "Figure <digits>"."""
    text = text.strip()
    # bool(...) so the function always returns a bool like its sibling
    # predicates, instead of leaking the re.Match object / None.
    return bool(re.match(r"^Figure\s*\d+", text)) and len(text) < num
7 changes: 1 addition & 6 deletions book_maker/loader/txt_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,9 @@ def __init__(
key,
resume,
language,
batch_size,
translate_tags,
allow_navigable_strings,
model_api_base=None,
is_test=False,
test_num=5,
accumulated_num=1,
prompt_template=None,
prompt_config=None,
):
self.txt_name = txt_name
Expand All @@ -36,7 +31,7 @@ def __init__(
self.bilingual_result = []
self.bilingual_temp_result = []
self.test_num = test_num
self.batch_size = batch_size
self.batch_size = 10

try:
with open(f"{txt_name}", "r", encoding="utf-8") as f:
Expand Down
4 changes: 2 additions & 2 deletions book_maker/translator/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from book_maker.translator.caiyun_translator import Caiyun
from book_maker.translator.chatgptapi_translator import ChatGPTAPI
from book_maker.translator.deepl_translator import DeepL
from book_maker.translator.google_translator import Google
from book_maker.translator.gpt3_translator import GPT3
from book_maker.translator.caiyun_translator import Caiyun
from book_maker.translator.deepl_translator import DeepL

MODEL_DICT = {
"chatgptapi": ChatGPTAPI,
Expand Down
2 changes: 1 addition & 1 deletion book_maker/translator/chatgptapi_translator.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import time
import re
import time
from copy import copy
from os import environ

Expand Down
3 changes: 2 additions & 1 deletion book_maker/translator/deepl_translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@

import requests

from book_maker.utils import TO_LANGUAGE_CODE, LANGUAGES
from book_maker.utils import LANGUAGES, TO_LANGUAGE_CODE

from .base_translator import Base


Expand Down
35 changes: 35 additions & 0 deletions book_maker/utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import tiktoken

# Borrowed from : https://github.com/openai/whisper
LANGUAGES = {
"en": "english",
Expand Down Expand Up @@ -126,3 +128,36 @@ def prompt_config_to_kwargs(prompt_config):
prompt_template=prompt_config.get("user", None),
prompt_sys_msg=prompt_config.get("system", None),
)


# ref: https://platform.openai.com/docs/guides/chat/introduction
def num_tokens_from_text(text, model="gpt-3.5-turbo-0301"):
    """Return the chat-token count of *text* sent as a single user message.

    The count includes the per-message framing overhead of the
    gpt-3.5-turbo-0301 chat format, not just the encoded text.

    Raises:
        NotImplementedError: for models whose message token layout is unknown.
    """
    messages = (
        {
            "role": "user",
            "content": text,
        },
    )

    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # Unknown model name: fall back to the encoding used by chat models.
        encoding = tiktoken.get_encoding("cl100k_base")
    if model == "gpt-3.5-turbo-0301":  # note: future models may deviate from this
        num_tokens = 0
        for message in messages:
            num_tokens += (
                4  # every message follows <im_start>{role/name}\n{content}<im_end>\n
            )
            for key, value in message.items():
                num_tokens += len(encoding.encode(value))
                if key == "name":  # if there's a name, the role is omitted
                    num_tokens += -1  # role is always required and always 1 token
        num_tokens += 2  # every reply is primed with <im_start>assistant
        return num_tokens
    else:
        # Fixed: message used to name num_tokens_from_messages(), a function
        # that does not exist in this project (copy-paste from the cookbook).
        raise NotImplementedError(
            f"""num_tokens_from_text() is not presently implemented for model {model}.
See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens."""
        )

0 comments on commit 529749e

Please sign in to comment.