automatically send post via Telegra.ph if msg needs to be split

(if env.TELEGRAPH_TOKEN is set) Signed-off-by: Rongrong <15956627+Rongronggg9@users.noreply.github.com>
xlmm · Sep 17, 2021 · 528c5d7 · 528c5d7
1 parent e977e7b
commit 528c5d7
Show file tree

Hide file tree

Showing 6 changed files with 157 additions and 46 deletions.
diff --git a/docker-compose.yml.sample b/docker-compose.yml.sample
@@ -24,12 +24,18 @@ services:
       - MANAGER=1234567890  # 替换为你的 user id
       - DELAY=300  # 订阅更新检查间隔（单位：秒）
 
-      # <------ 以下为可选参数，取消注释（删除行首的空格）并对应修改来启用它们 ------> #
-#      - T_PROXY=socks5h://host.docker.internal:1080  # Telegram Bot API 使用的代理，用 host.docker.internal 代替 localhost
-#      - R_PROXY=socks5h://host.docker.internal:1080  # 获取 RSS 订阅使用的代理， 用 host.docker.internal 代替 localhost
-#      - DEBUG=1  # 若要开启 debug 日志输出，请取消注释该行
+# ↓------ 要启用长文自动转 Telegraph，取消注释（删除 # 号）并替换为你自己申请到的 access_token ------↓ #
+# 在这里申请: https://api.telegra.ph/createAccount?short_name=RSStT&author_name=Generated%20by%20RSStT&author_url=https%3A%2F%2Fgithub.com%2FRongronggg9%2FRSS-to-Telegram-Bot
+      #- TELEGRAPH_TOKEN=1a23b456c78de90f1a23b456c78de90f1a23b456c78de90f1a23b456c78d
+# ↑------ 要启用长文自动转 Telegraph，取消注释（删除 # 号）并替换为你自己申请到的 access_token ------↑ #
 
-      # <------ 若不使用 redis 而使用 sqlite，请将此行以下全部注释，注意数据不会被自动迁移 ------> #
+# ↓------ 以下为可选参数，取消注释（删除 # 号）并对应修改来启用它们 ------↓ #
+      #- T_PROXY=socks5h://host.docker.internal:1080  # Telegram Bot API 使用的代理，用 host.docker.internal 代替 localhost
+      #- R_PROXY=socks5h://host.docker.internal:1080  # 获取 RSS 订阅使用的代理， 用 host.docker.internal 代替 localhost
+      #- DEBUG=1  # 若要开启 debug 日志输出，请取消注释该行
+# ↑------ 以上为可选参数，取消注释（删除 # 号）并对应修改来启用它们 ------↑ #
+
+# ↓------ 若不使用 redis 而使用 sqlite，请将此行以下全部注释，注意数据不会被自动迁移 ------↓ #
       - REDISHOST=redis
     depends_on:
       - redis

diff --git a/env.py b/env.py
@@ -26,6 +26,10 @@
 if not IMG_RELAY_SERVER.endswith('/'):
     IMG_RELAY_SERVER += '/'
 
+TELEGRAPH_TOKEN = os.environ.get('TELEGRAPH_TOKEN')
+if TELEGRAPH_TOKEN and TELEGRAM_PROXY:  # enable proxy for telegraph (only when telegraph is configured)
+    os.environ['HTTPS_PROXY'] = TELEGRAM_PROXY
+
 REDIS_HOST = os.environ.get('REDISHOST')
 REDIS_PORT = int(os.environ.get('REDISPORT', 6379))
 REDIS_USER = os.environ.get('REDISUSER')

diff --git a/medium.py b/medium.py
@@ -10,14 +10,11 @@
 
 logger = log.getLogger('RSStT.medium')
 
-# getPic = re.compile(r'<img[^>]*\bsrc="([^"]*)"')
-# getVideo = re.compile(r'<video[^>]*\bsrc="([^"]*)"')
-# getSize = re.compile(r'^Content-Length: (\d+)$', re.M)
 sizes = ['large', 'mw2048', 'mw1024', 'mw720', 'middle']
 sizeParser = re.compile(r'(?P<domain>^https?://\w+\.sinaimg\.\S+/)'
                         r'(?P<size>large|mw2048|mw1024|mw720|middle)'
                         r'(?P<filename>/\w+\.\w+$)')
-serverParser = re.compile(r'(?P<url_prefix>^https?:\/\/[a-zA-Z_-]+)'
+serverParser = re.compile(r'(?P<url_prefix>^https?://[a-zA-Z_-]+)'
                           r'(?P<server_id>\d)'
                           r'(?P<url_suffix>\.sinaimg\.\S+$)')
 
@@ -50,8 +47,10 @@ def _validate(self):  # warning: only design for weibo
         url = self.url
         try:
             size, width, height = get_medium_info(url)
+            if size is None:
+                raise IOError
         except Exception as e:
-            logger.debug(f'Dropped medium {url}: can not be fetched.')
+            logger.debug(f'Dropped medium {url}: can not be fetched:' + str(e))
             self.valid = False
             return
 
@@ -126,34 +125,6 @@ def telegramize(self):
         return telegram.InputMediaAnimation(self.url)  # hmm, you don't need it
 
 
-def get_medium_info(url):
-    session = requests.Session()
-    session.mount('http://', HTTPAdapter(max_retries=1))
-    session.mount('https://', HTTPAdapter(max_retries=1))
-
-    response = session.get(url, timeout=(5, 5), proxies=env.REQUESTS_PROXIES, stream=True, headers=env.REQUESTS_HEADERS)
-    size = int(response.headers.get('Content-Length', 256))
-    content_type = response.headers.get('Content-Type')
-
-    height = width = -1
-    if content_type != 'image/jpeg' and url.find('jpg') == -1 and url.find('jpeg') == -1:  # if not jpg
-        response.close()
-        return size, width, height
-
-    pic_header = response.raw.read(min(256, size))
-    response.close()
-    pointer = -1
-    for marker in (b'\xff\xc2', b'\xff\xc1', b'\xff\xc0'):
-        p = pic_header.find(marker)
-        if p != -1:
-            pointer = p
-    if pointer != -1:
-        width = int(pic_header[pointer + 7:pointer + 9].hex(), 16)
-        height = int(pic_header[pointer + 5:pointer + 7].hex(), 16)
-
-    return size, width, height
-
-
 class Media:
     def __init__(self):
         self._media: List[Medium] = []
@@ -197,3 +168,52 @@ def change_all_server(self):
         if sum(map(lambda m: m.change_server(), self._media)):
             return True
         return False
+
+    def __len__(self):
+        return len(self._media)
+
+    def __bool__(self):
+        return bool(self._media)
+
+
+def get_medium_stream(url, headers: dict = None):
+    """
+    Open a streaming GET request to a medium.
+
+    :param url: medium url
+    :param headers: extra request headers; entries of env.REQUESTS_HEADERS take precedence on conflict
+    :return: the streaming response if the status code is 200 (the caller must close it), otherwise None
+    """
+    if headers is None:
+        headers = env.REQUESTS_HEADERS
+    else:
+        headers = headers.copy()  # do not mutate the caller's dict
+        headers.update(env.REQUESTS_HEADERS)
+
+    session = requests.Session()
+    session.mount('http://', HTTPAdapter(max_retries=1))
+    session.mount('https://', HTTPAdapter(max_retries=1))
+    stream = session.get(url, timeout=(5, 5), proxies=env.REQUESTS_PROXIES, stream=True,
+                         headers=headers)
+    if stream.status_code != 200:
+        stream.close()  # the body is never consumed, so release the connection explicitly
+        return None
+    return stream
+
+
+def get_medium_info(url):
+    """
+    Fetch the size and, for JPEG images, the dimensions of a medium.
+
+    :param url: medium url
+    :return: (size, width, height). (None, None, None) if the medium cannot be fetched;
+        size falls back to 256 if there is no Content-Length header;
+        width and height are -1 if they cannot be determined
+    """
+    stream = get_medium_stream(url)
+    if stream is None:
+        return None, None, None
+
+    try:  # make sure the connection is released even if reading the response fails
+        size = int(stream.headers.get('Content-Length', 256))
+        content_type = stream.headers.get('Content-Type')
+
+        height = width = -1
+        if content_type != 'image/jpeg' and url.find('jpg') == -1 and url.find('jpeg') == -1:  # if not jpg
+            return size, width, height
+
+        pic_header = stream.raw.read(min(256, size))
+    finally:
+        stream.close()
+
+    # search the first bytes for a start-of-frame marker; a later marker in the tuple overrides an earlier one
+    pointer = -1
+    for marker in (b'\xff\xc2', b'\xff\xc1', b'\xff\xc0'):
+        p = pic_header.find(marker)
+        if p != -1:
+            pointer = p
+    if pointer != -1:
+        # height is read from the 2 bytes at offset 5 and width from the 2 bytes at offset 7 after the marker
+        width = int(pic_header[pointer + 7:pointer + 9].hex(), 16)
+        height = int(pic_header[pointer + 5:pointer + 7].hex(), 16)
+
+    return size, width, height
diff --git a/message.py b/message.py
@@ -58,9 +58,12 @@ def _send(self, chat_id: Union[str, int], reply_to_msg_id: int = None):
 
 
 class TextMsg(Message):
+    disable_preview = True
+
     @fasteners.lock.read_locked
     def _send(self, chat_id: Union[str, int], reply_to_msg_id: int = None):
-        env.bot.send_message(chat_id, self.text, parse_mode=self.parse_mode, disable_web_page_preview=True,
+        env.bot.send_message(chat_id, self.text, parse_mode=self.parse_mode,
+                             disable_web_page_preview=self.disable_preview,
                              reply_to_message_id=reply_to_msg_id, allow_sending_without_reply=True)
 
 
@@ -97,3 +100,7 @@ def _send(self, chat_id: Union[str, int], reply_to_msg_id: int = None):
 
 class BotServiceMsg(TextMsg):
     no_retry = True
+
+
+class TelegraphMsg(TextMsg):
+    # text message sent with the web page preview enabled (TextMsg disables it by default)
+    disable_preview = False
diff --git a/post.py b/post.py
@@ -1,7 +1,10 @@
 import json
 import re
 import traceback
+
+import requests
 import telegram.error
+import telegraph
 from bs4 import BeautifulSoup
 from bs4.element import NavigableString
 from typing import Optional, Union, List
@@ -27,6 +30,13 @@
 stripLineEnd = re.compile(r'[ \t\xa0]+\n')
 isEmoticon = re.compile(r'(width|height): ?[012]?\dpx')
 
+# Telegraph client, only created if an access token is configured (None disables sending via Telegraph)
+telegraph_api = telegraph.Telegraph(access_token=env.TELEGRAPH_TOKEN) if env.TELEGRAPH_TOKEN else None
+TELEGRAPH_ALLOWED_TAGS = {
+    'a', 'aside', 'b', 'blockquote', 'br', 'code', 'em', 'figcaption', 'figure',
+    'h3', 'h4', 'hr', 'i', 'iframe', 'img', 'li', 'ol', 'p', 'pre', 's',
+    'strong', 'u', 'ul', 'video'
+}
+
 # load emoji dict
 with open('emojify.json', 'r', encoding='utf-8') as emojify_json:
     emoji_dict = json.load(emojify_json)
@@ -61,22 +71,27 @@ def __init__(self,
                  link: Optional[str] = None,
                  author: Optional[str] = None,
                  plain: bool = False,
-                 service_msg: bool = False):
+                 service_msg: bool = False,
+                 telegraph_url: str = None):
         """
         :param xml: post content (xml or html)
         :param title: post title
         :param feed_title: feed title
         :param link: post link
         :param author: post author
-        :param plain: do not need to be parsed?
+        :param plain: do not need to add metadata?
+        :param service_msg: is this post a bot service msg?
+        :param telegraph_url: if set, a telegraph post will be sent
         """
         self.retries = 0
         xml = xml.replace('\n', '')
         xml = emojify(xml)
+        self.xml = xml
         self.soup = BeautifulSoup(xml, 'lxml')
         self.media: Media = Media()
         self.text = Text(self._get_item(self.soup))
         self.service_msg = service_msg
+        self.telegraph_url = telegraph_url
         self.messages: Optional[List[message.Message]] = None
         self.origin_text = self.text.copy()
         if plain:
@@ -91,6 +106,15 @@ def __init__(self,
     def send_message(self, chat_ids: Union[List[Union[str, int]], str, int], reply_to_msg_id: int = None):
         if type(chat_ids) is not list:
             chat_ids = [chat_ids]
+
+        # send telegraph post
+        if telegraph_api and not self.service_msg and not self.telegraph_url \
+                and len(self.text) >= (1024 if self.media else 4096):
+            logger.info('This post will be sent via Telegraph.')
+            if self.telegraph_ify(chat_ids, reply_to_msg_id):  # telegraph post sent successful
+                return
+            logger.warning('This post cannot be sent via Telegraph, fallback to normal message...')
+
         if not self.messages:
             self.generate_message()
         message_count = len(self.messages)
@@ -103,6 +127,8 @@ def send_message(self, chat_ids: Union[List[Union[str, int]], str, int], reply_t
                     msg.send(chat_id, reply_to_msg_id)
                     user_success_count += 1
                     tot_success_count += 1
+                except OverflowError:
+                    return  # retried too many times
                 except telegram.error.BadRequest as e:
                     error_caption = e.message
                     if error_caption.startswith('Have no rights to send a message'):
@@ -140,6 +166,42 @@ def send_message(self, chat_ids: Union[List[Union[str, int]], str, int], reply_t
 
             chat_ids.pop(0)
 
+    def telegraph_ify(self, chat_ids: Union[List[Union[str, int]], str, int], reply_to_msg_id: int = None):
+        """
+        Publish the post as a Telegraph page and send a message linking to it.
+
+        :param chat_ids: chat id(s) the message will be sent to
+        :param reply_to_msg_id: id of the message to reply to
+        :return: True if the page was created and the linking post was handed over for sending, otherwise False
+        """
+        # tags outside TELEGRAPH_ALLOWED_TAGS are unwrapped: the tag is dropped, its children are kept
+        for tag in self.soup.find_all(recursive=True):
+            if tag.name not in TELEGRAPH_ALLOWED_TAGS:
+                tag.replaceWithChildren()
+
+        if self.feed_title:
+            telegraph_author = f"{self.feed_title}"
+            if self.author and self.author not in self.feed_title:
+                telegraph_author += f' ({self.author})'
+            telegraph_author_url = self.link if self.link else ''
+        else:
+            telegraph_author = 'Generated by RSStT'
+            telegraph_author_url = 'https://github.com/Rongronggg9/RSS-to-Telegram-Bot'
+
+        telegraph_title = f'{self.title} - {telegraph_author}' if self.title else 'Generated by RSStT'
+        # the parentheses around the conditional matter: without them it would apply to the whole
+        # concatenation and the page content would be empty whenever the post has no link
+        telegraph_html_content = (str(self.soup) +
+                                  "<br><br>Generated by "
+                                  "<a href='https://github.com/Rongronggg9/RSS-to-Telegram-Bot'>RSStT</a>, "
+                                  "The copyright belongs to the source site." +
+                                  (f"<br><br><a href='{self.link}'>Source</a>" if self.link else ''))
+
+        try:
+            telegraph_page = \
+                telegraph_api.create_page(title=telegraph_title[:256], html_content=telegraph_html_content,
+                                          author_name=telegraph_author[:128],
+                                          author_url=telegraph_author_url[:512])
+        except (telegraph.TelegraphException, requests.exceptions.RequestException) as e:
+            # network failures are treated like API errors so that the caller can fall back to a normal message
+            logger.warning('Telegraph API error:' + str(e))
+            return False
+
+        telegraph_url = f"https://telegra.ph/{telegraph_page['path']}"
+        # an empty post carrying only the metadata; its title links to the Telegraph page
+        telegraph_post = Post(xml='', title=self.title, feed_title=self.feed_title,
+                              link=self.link, author=self.author, telegraph_url=telegraph_url)
+        telegraph_post.send_message(chat_ids, reply_to_msg_id)
+        return True
+
     def generate_pure_message(self):
         self.text = Text('Content decoding failed!\n内容解码失败！')
         self._add_metadata()
@@ -152,6 +214,11 @@ def generate_message(self):
             self.messages = [message.BotServiceMsg(text) for text in self.get_split_html(4096)]
             return
 
+        # Telegraph msg
+        if self.telegraph_url:
+            self.messages = [message.TelegraphMsg(text) for text in self.get_split_html(4096)]
+            return
+
         media_tuple = tuple(self.media.get_valid_media())
         media_msg_count = len(media_tuple)
 
@@ -197,18 +264,24 @@ def get_split_html(self, length_limit_head: int, head_count: int = -1, length_li
 
     def _add_metadata(self):
         plain_text = self.text.get_html(plain=True)
+        if self.feed_title:
+            author = self.author if self.author and self.author not in self.feed_title else None
+            self._add_via(self.feed_title, self.link, author)
+        if self.telegraph_url:
+            self._add_title(self.title)
+            return
         if self.title and ('微博' not in self.feed_title or env.DEBUG):
             title_tbc = self.title.replace('[图片]', '').replace('[视频]', '').strip().rstrip('.…')
             similarity = fuzz.partial_ratio(title_tbc, plain_text[0:len(self.title) + 10])
             logger.debug(f'{self.title} ({self.link}) is {similarity}% likely to be of no title.')
             if similarity < 90:
                 self._add_title(self.title)
-        if self.feed_title:
-            author = self.author if self.author and self.author not in self.feed_title else None
-            self._add_via(self.feed_title, self.link, author)
 
     def _add_title(self, title: str):
-        text_title = Text([Bold(Underline(title)), Br(), Br()])
+        if self.telegraph_url:
+            title = Link(title, param=self.telegraph_url)
+        title = Bold(Underline(title))
+        text_title = Text([title, Br(), Br()])
         if self.text.is_listed():
             self.text.content.insert(0, text_title)
             return

diff --git a/requirements.txt b/requirements.txt
@@ -9,4 +9,5 @@ redis==3.5.3
 listparser==0.18
 lxml==4.6.3
 fasteners==0.16.3
-colorlog==6.4.1
+colorlog==6.4.1
+telegraph==1.4.1