Add more cases that don't require translation

SakaZulu · Mar 18, 2023 · 79b2d67 · 79b2d67
1 parent 99bcadd
commit 79b2d67
Show file tree

Hide file tree

Showing 2 changed files with 29 additions and 17 deletions.
diff --git a/book_maker/loader/epub_loader.py b/book_maker/loader/epub_loader.py
@@ -14,14 +14,7 @@
 from book_maker.utils import num_tokens_from_text, prompt_config_to_kwargs
 
 from .base_loader import BaseBookLoader
-from .helper import (
-    EPUBBookLoaderHelper,
-    is_text_figure,
-    is_text_link,
-    is_text_list,
-    is_text_source,
-    is_text_tail_link,
-)
+from .helper import EPUBBookLoaderHelper, not_trans, is_text_link
 
 
 class EPUBBookLoader(BaseBookLoader):
@@ -144,14 +137,7 @@ def translate_paragraphs_acc(self, p_list, send_num):
             for sup in temp_p.find_all("sup"):
                 sup.extract()
             if any(
-                [
-                    not p.text,
-                    self._is_special_text(temp_p.text),
-                    is_text_source(temp_p.text),
-                    is_text_list(temp_p.text),
-                    is_text_figure(temp_p.text),
-                    is_text_tail_link(temp_p.text),
-                ]
+                [not p.text, self._is_special_text(temp_p.text), not_trans(temp_p.text)]
             ):
                 if i == len(p_list) - 1:
                     self.helper.deal_old(wait_p_list)

diff --git a/book_maker/loader/helper.py b/book_maker/loader/helper.py
@@ -31,7 +31,7 @@ def deal_old(self, wait_p_list):
 
 def is_text_link(text):
     url_pattern = re.compile(
-        r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
+        r"(http[s]?://|www\.)+(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
     )
     return bool(url_pattern.match(text.strip()))
 
@@ -56,3 +56,29 @@ def is_text_list(text, num=80):
 def is_text_figure(text, num=80):
     text = text.strip()
     return re.match(r"^Figure\s*\d+", text) and len(text) < num
+
+
+def is_text_digit_and_space(s):
+    """Return True when every character of ``s`` is a digit or whitespace.
+
+    An empty string also yields True, since no character breaks the rule."""
+    return all(ch.isdigit() or ch.isspace() for ch in s)
+
+
+def is_text_isbn(s):
+    """Return True if ``s`` is just an ISBN/eISBN label plus number (hyphenated forms included)."""
+    return bool(re.fullmatch(r"[Ee]?ISBN(?:-1[03])?\s*:?\s*\d[\d\s-]*[\dXx]?", s.strip()))
+
+
+def not_trans(s):
+    """Return True when ``s`` matches any check for text left untranslated.
+
+    The checks run lazily in the listed order, stopping at the first hit.
+    """
+    checks = (
+        is_text_link, is_text_tail_link, is_text_source,
+        is_text_list, is_text_figure,
+        is_text_digit_and_space, is_text_isbn,
+    )
+    # any() over a generator short-circuits; a list literal would run all seven.
+    return any(check(s) for check in checks)