Skip to content

Commit

Permalink
Add more cases that don't require translation
Browse files Browse the repository at this point in the history
  • Loading branch information
hleft committed Mar 18, 2023
1 parent 99bcadd commit 79b2d67
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 17 deletions.
18 changes: 2 additions & 16 deletions book_maker/loader/epub_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,7 @@
from book_maker.utils import num_tokens_from_text, prompt_config_to_kwargs

from .base_loader import BaseBookLoader
from .helper import (
EPUBBookLoaderHelper,
is_text_figure,
is_text_link,
is_text_list,
is_text_source,
is_text_tail_link,
)
from .helper import EPUBBookLoaderHelper, not_trans, is_text_link


class EPUBBookLoader(BaseBookLoader):
Expand Down Expand Up @@ -144,14 +137,7 @@ def translate_paragraphs_acc(self, p_list, send_num):
for sup in temp_p.find_all("sup"):
sup.extract()
if any(
[
not p.text,
self._is_special_text(temp_p.text),
is_text_source(temp_p.text),
is_text_list(temp_p.text),
is_text_figure(temp_p.text),
is_text_tail_link(temp_p.text),
]
[not p.text, self._is_special_text(temp_p.text), not_trans(temp_p.text)]
):
if i == len(p_list) - 1:
self.helper.deal_old(wait_p_list)
Expand Down
28 changes: 27 additions & 1 deletion book_maker/loader/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def deal_old(self, wait_p_list):

def is_text_link(text):
    """Return True when *text* starts with something that looks like a URL.

    Leading and trailing whitespace is ignored. A URL is recognised by an
    ``http://`` / ``https://`` scheme or a bare ``www.`` prefix, followed by
    at least one URL-ish character. Only the start of the text is anchored
    (``match``), so a URL followed by other text still counts, while text
    that merely contains a URL later on does not.
    """
    # A single pattern literal: two adjacent raw strings here would be
    # concatenated by Python into one pattern demanding two URL prefixes in
    # a row, which no ordinary link satisfies.
    url_pattern = re.compile(
        r"(http[s]?://|www\.)+(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    )
    return bool(url_pattern.match(text.strip()))

Expand All @@ -56,3 +56,29 @@ def is_text_list(text, num=80):
def is_text_figure(text, num=80):
    """Return True when *text* looks like a short figure caption.

    After stripping surrounding whitespace the text must start with
    ``Figure`` followed by optional whitespace and at least one digit, and
    its length must be below *num* characters.

    The result is always a real ``bool`` (never a ``Match`` object or
    ``None``), in line with the other ``is_text_*`` predicates.
    """
    text = text.strip()
    if len(text) >= num:
        return False
    return re.match(r"^Figure\s*\d+", text) is not None


def is_text_digit_and_space(s):
    """Return True when every character of *s* is a digit or whitespace.

    An empty string has no offending character and therefore yields True.
    """
    return all(ch.isdigit() or ch.isspace() for ch in s)


def is_text_isbn(s):
    """Return True when *s* consists solely of an ISBN / eISBN line.

    Surrounding whitespace is ignored. Accepted shapes:

    * the label ``ISBN``, ``eISBN`` or ``EISBN``, optionally suffixed with
      ``-10`` or ``-13``;
    * an optional colon after the label;
    * a number starting with a digit, whose groups may be separated by
      whitespace or hyphens, optionally ending in an ``X`` check digit.

    Everything the plain digits-and-spaces form matched is still matched.
    """
    pattern = r"^[Ee]?ISBN(?:-1[03])?\s*:?\s*\d[\d\s-]*[Xx]?$"
    # Strip first so leading indentation or a trailing newline in the
    # paragraph text does not defeat the anchors.
    return bool(re.match(pattern, s.strip()))


def not_trans(s):
    """Return True when *s* should be left untranslated.

    The text is run through each ``is_text_*`` predicate in turn (link,
    tail link, source, list, figure, digits-and-spaces, ISBN); the first
    truthy answer settles the result, so later checks are not evaluated.
    """
    checks = (
        is_text_link,
        is_text_tail_link,
        is_text_source,
        is_text_list,
        is_text_figure,
        is_text_digit_and_space,
        is_text_isbn,
    )
    # Generator instead of a list literal: lets any() short-circuit.
    return any(check(s) for check in checks)

0 comments on commit 79b2d67

Please sign in to comment.