Skip to content

Commit

Permalink
automatically send post via Telegra.ph if msg needs to be split
Browse files Browse the repository at this point in the history
(if env.TELEGRAPH_TOKEN set)

Signed-off-by: Rongrong <15956627+Rongronggg9@users.noreply.github.com>
  • Loading branch information
Rongronggg9 committed Sep 17, 2021
1 parent e977e7b commit 528c5d7
Show file tree
Hide file tree
Showing 6 changed files with 157 additions and 46 deletions.
16 changes: 11 additions & 5 deletions docker-compose.yml.sample
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,18 @@ services:
- MANAGER=1234567890 # 替换为你的 user id
- DELAY=300 # 订阅更新检查间隔(单位:秒)

# <------ 以下为可选参数,取消注释(删除行首的空格)并对应修改来启用它们 ------> #
# - T_PROXY=socks5h://host.docker.internal:1080 # Telegram Bot API 使用的代理,用 host.docker.internal 代替 localhost
# - R_PROXY=socks5h://host.docker.internal:1080 # 获取 RSS 订阅使用的代理, 用 host.docker.internal 代替 localhost
# - DEBUG=1 # 若要开启 debug 日志输出,请取消注释该行
# ↓------ 要启用长文自动转 Telegraph,取消注释(删除 # 号)并替换为你自己申请到的 access_token ------↓ #
# 在这里申请: https://api.telegra.ph/createAccount?short_name=RSStT&author_name=Generated%20by%20RSStT&author_url=https%3A%2F%2Fgithub.com%2FRongronggg9%2FRSS-to-Telegram-Bot
#- TELEGRAPH_TOKEN=1a23b456c78de90f1a23b456c78de90f1a23b456c78de90f1a23b456c78d
# ↑------ 要启用长文自动转 Telegraph,取消注释(删除 # 号)并替换为你自己申请到的 access_token ------↑ #

# <------ 若不使用 redis 而使用 sqlite,请将此行以下全部注释,注意数据不会被自动迁移 ------> #
# ↓------ 以下为可选参数,取消注释(删除 # 号)并对应修改来启用它们 ------↓ #
#- T_PROXY=socks5h://host.docker.internal:1080 # Telegram Bot API 使用的代理,用 host.docker.internal 代替 localhost
#- R_PROXY=socks5h://host.docker.internal:1080 # 获取 RSS 订阅使用的代理, 用 host.docker.internal 代替 localhost
#- DEBUG=1 # 若要开启 debug 日志输出,请取消注释该行
# ↑------ 以上为可选参数,取消注释(删除 # 号)并对应修改来启用它们 ------↑ #

# ↓------ 若不使用 redis 而使用 sqlite,请将此行以下全部注释,注意数据不会被自动迁移 ------↓ #
- REDISHOST=redis
depends_on:
- redis
Expand Down
4 changes: 4 additions & 0 deletions env.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@
if not IMG_RELAY_SERVER.endswith('/'):
IMG_RELAY_SERVER += '/'

# Access token for the Telegraph API; None when the TELEGRAPH_TOKEN variable is not set.
TELEGRAPH_TOKEN = os.environ.get('TELEGRAPH_TOKEN')
# HTTPS_PROXY lives in the process environment, so only export it when Telegraph is
# actually enabled (the previous condition tested TELEGRAM_PROXY twice and ignored the token).
if TELEGRAPH_TOKEN and TELEGRAM_PROXY:  # enable proxy for telegraph
    os.environ['HTTPS_PROXY'] = TELEGRAM_PROXY

REDIS_HOST = os.environ.get('REDISHOST')
REDIS_PORT = int(os.environ.get('REDISPORT', 6379))
REDIS_USER = os.environ.get('REDISUSER')
Expand Down
86 changes: 53 additions & 33 deletions medium.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,11 @@

logger = log.getLogger('RSStT.medium')

# getPic = re.compile(r'<img[^>]*\bsrc="([^"]*)"')
# getVideo = re.compile(r'<video[^>]*\bsrc="([^"]*)"')
# getSize = re.compile(r'^Content-Length: (\d+)$', re.M)
sizes = ['large', 'mw2048', 'mw1024', 'mw720', 'middle']
sizeParser = re.compile(r'(?P<domain>^https?://\w+\.sinaimg\.\S+/)'
r'(?P<size>large|mw2048|mw1024|mw720|middle)'
r'(?P<filename>/\w+\.\w+$)')
serverParser = re.compile(r'(?P<url_prefix>^https?:\/\/[a-zA-Z_-]+)'
serverParser = re.compile(r'(?P<url_prefix>^https?://[a-zA-Z_-]+)'
r'(?P<server_id>\d)'
r'(?P<url_suffix>\.sinaimg\.\S+$)')

Expand Down Expand Up @@ -50,8 +47,10 @@ def _validate(self): # warning: only design for weibo
url = self.url
try:
size, width, height = get_medium_info(url)
if size is None:
raise IOError
except Exception as e:
logger.debug(f'Dropped medium {url}: can not be fetched.')
logger.debug(f'Dropped medium {url}: can not be fetched:' + str(e))
self.valid = False
return

Expand Down Expand Up @@ -126,34 +125,6 @@ def telegramize(self):
return telegram.InputMediaAnimation(self.url) # hmm, you don't need it


def get_medium_info(url):
session = requests.Session()
session.mount('http://', HTTPAdapter(max_retries=1))
session.mount('https://', HTTPAdapter(max_retries=1))

response = session.get(url, timeout=(5, 5), proxies=env.REQUESTS_PROXIES, stream=True, headers=env.REQUESTS_HEADERS)
size = int(response.headers.get('Content-Length', 256))
content_type = response.headers.get('Content-Type')

height = width = -1
if content_type != 'image/jpeg' and url.find('jpg') == -1 and url.find('jpeg') == -1: # if not jpg
response.close()
return size, width, height

pic_header = response.raw.read(min(256, size))
response.close()
pointer = -1
for marker in (b'\xff\xc2', b'\xff\xc1', b'\xff\xc0'):
p = pic_header.find(marker)
if p != -1:
pointer = p
if pointer != -1:
width = int(pic_header[pointer + 7:pointer + 9].hex(), 16)
height = int(pic_header[pointer + 5:pointer + 7].hex(), 16)

return size, width, height


class Media:
def __init__(self):
self._media: List[Medium] = []
Expand Down Expand Up @@ -197,3 +168,52 @@ def change_all_server(self):
if sum(map(lambda m: m.change_server(), self._media)):
return True
return False

def __len__(self):
    """Return the number of media currently held in ``self._media``."""
    return len(self._media)

def __bool__(self):
    """Return ``True`` when at least one medium is held in ``self._media``."""
    return bool(self._media)


def get_medium_stream(url, headers: dict = None):
    """Open a streaming GET request for a medium.

    :param url: URL of the medium to fetch
    :param headers: optional extra request headers; they are merged into a copy together
        with ``env.REQUESTS_HEADERS``, whose entries win on key conflicts
    :return: the open streaming response if the server answered 200, otherwise ``None``.
        A returned response must be closed by the caller.
    """
    if headers is None:
        headers = env.REQUESTS_HEADERS
    else:
        # copy first so the caller's dict is never mutated
        headers = headers.copy()
        headers.update(env.REQUESTS_HEADERS)

    session = requests.Session()
    session.mount('http://', HTTPAdapter(max_retries=1))
    session.mount('https://', HTTPAdapter(max_retries=1))
    stream = session.get(url, timeout=(5, 5), proxies=env.REQUESTS_PROXIES, stream=True,
                         headers=headers)
    if stream.status_code != 200:
        # with stream=True the connection is held until the response is closed,
        # so release it before discarding the response
        stream.close()
        return None
    return stream


def get_medium_info(url):
    """Fetch the size of a medium and, for JPEG images, its pixel dimensions.

    :param url: URL of the medium
    :return: ``(size, width, height)``. All three are ``None`` if the medium could not be
        fetched. ``size`` is the ``Content-Length`` header (256 when the header is absent);
        ``width`` and ``height`` are ``-1`` when they could not be determined.
    """
    stream = get_medium_stream(url)
    if stream is None:
        return None, None, None

    height = width = -1
    try:
        size = int(stream.headers.get('Content-Length', 256))
        content_type = stream.headers.get('Content-Type')
        if content_type != 'image/jpeg' and 'jpg' not in url and 'jpeg' not in url:  # if not jpg
            return size, width, height
        # only the first bytes are inspected; the body is never downloaded in full
        pic_header = stream.raw.read(min(256, size))
    finally:
        # always release the streamed connection, even if a header is malformed or the read fails
        stream.close()

    # Look for a JPEG start-of-frame marker; as before, the last marker type found wins.
    pointer = -1
    for marker in (b'\xff\xc2', b'\xff\xc1', b'\xff\xc0'):
        p = pic_header.find(marker)
        if p != -1:
            pointer = p
    # Segment layout: marker(2) length(2) precision(1) height(2) width(2).
    # Skip parsing when the segment is cut off by the end of the prefix that was read.
    if pointer != -1 and pointer + 9 <= len(pic_header):
        height = int.from_bytes(pic_header[pointer + 5:pointer + 7], 'big')
        width = int.from_bytes(pic_header[pointer + 7:pointer + 9], 'big')

    return size, width, height
9 changes: 8 additions & 1 deletion message.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,12 @@ def _send(self, chat_id: Union[str, int], reply_to_msg_id: int = None):


class TextMsg(Message):
disable_preview = True

@fasteners.lock.read_locked
def _send(self, chat_id: Union[str, int], reply_to_msg_id: int = None):
env.bot.send_message(chat_id, self.text, parse_mode=self.parse_mode, disable_web_page_preview=True,
env.bot.send_message(chat_id, self.text, parse_mode=self.parse_mode,
disable_web_page_preview=self.disable_preview,
reply_to_message_id=reply_to_msg_id, allow_sending_without_reply=True)


Expand Down Expand Up @@ -97,3 +100,7 @@ def _send(self, chat_id: Union[str, int], reply_to_msg_id: int = None):

class BotServiceMsg(TextMsg):
    """Text message variant with ``no_retry`` switched on."""
    # NOTE(review): presumably read by the sending logic to skip retries — confirm in Message.send
    no_retry = True


class TelegraphMsg(TextMsg):
    """Text message that is sent with the web page preview enabled."""
    # TextMsg._send passes this value as disable_web_page_preview
    disable_preview = False
85 changes: 79 additions & 6 deletions post.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import json
import re
import traceback

import requests
import telegram.error
import telegraph
from bs4 import BeautifulSoup
from bs4.element import NavigableString
from typing import Optional, Union, List
Expand All @@ -27,6 +30,13 @@
stripLineEnd = re.compile(r'[ \t\xa0]+\n')
isEmoticon = re.compile(r'(width|height): ?[012]?\dpx')

# Telegraph client, created only when a Telegraph access token is configured; None disables
# the Telegraph code path. (Previously gated on env.TOKEN, which is not the Telegraph token.)
telegraph_api = telegraph.Telegraph(access_token=env.TELEGRAPH_TOKEN) if env.TELEGRAPH_TOKEN else None
# Tags kept when building a Telegraph page; any other tag is unwrapped.
TELEGRAPH_ALLOWED_TAGS = {
    'a', 'aside', 'b', 'blockquote', 'br', 'code', 'em', 'figcaption', 'figure',
    'h3', 'h4', 'hr', 'i', 'iframe', 'img', 'li', 'ol', 'p', 'pre', 's',
    'strong', 'u', 'ul', 'video'
}

# load emoji dict
with open('emojify.json', 'r', encoding='utf-8') as emojify_json:
emoji_dict = json.load(emojify_json)
Expand Down Expand Up @@ -61,22 +71,27 @@ def __init__(self,
link: Optional[str] = None,
author: Optional[str] = None,
plain: bool = False,
service_msg: bool = False):
service_msg: bool = False,
telegraph_url: str = None):
"""
:param xml: post content (xml or html)
:param title: post title
:param feed_title: feed title
:param link: post link
:param author: post author
:param plain: do not need to be parsed?
:param plain: do not need to add metadata?
:param service_msg: is this post a bot service msg?
:param telegraph_url: if set, a telegraph post will be sent
"""
self.retries = 0
xml = xml.replace('\n', '')
xml = emojify(xml)
self.xml = xml
self.soup = BeautifulSoup(xml, 'lxml')
self.media: Media = Media()
self.text = Text(self._get_item(self.soup))
self.service_msg = service_msg
self.telegraph_url = telegraph_url
self.messages: Optional[List[message.Message]] = None
self.origin_text = self.text.copy()
if plain:
Expand All @@ -91,6 +106,15 @@ def __init__(self,
def send_message(self, chat_ids: Union[List[Union[str, int]], str, int], reply_to_msg_id: int = None):
if type(chat_ids) is not list:
chat_ids = [chat_ids]

# send telegraph post
if telegraph_api and not self.service_msg and not self.telegraph_url \
and len(self.text) >= (1024 if self.media else 4096):
logger.info('This post will be sent via Telegraph.')
if self.telegraph_ify(chat_ids, reply_to_msg_id): # telegraph post sent successful
return
logger.warning('This post cannot be sent via Telegraph, fallback to normal message...')

if not self.messages:
self.generate_message()
message_count = len(self.messages)
Expand All @@ -103,6 +127,8 @@ def send_message(self, chat_ids: Union[List[Union[str, int]], str, int], reply_t
msg.send(chat_id, reply_to_msg_id)
user_success_count += 1
tot_success_count += 1
except OverflowError:
return # retried too many times
except telegram.error.BadRequest as e:
error_caption = e.message
if error_caption.startswith('Have no rights to send a message'):
Expand Down Expand Up @@ -140,6 +166,42 @@ def send_message(self, chat_ids: Union[List[Union[str, int]], str, int], reply_t

chat_ids.pop(0)

def telegraph_ify(self, chat_ids: Union[List[Union[str, int]], str, int], reply_to_msg_id: int = None) -> bool:
    """Publish this post as a Telegraph page and send a message that links to it.

    :param chat_ids: chat id(s) the link message is sent to
    :param reply_to_msg_id: id of the message to reply to, if any
    :return: ``True`` once the page was created and the link post was handed to
        ``send_message``; ``False`` if the Telegraph API raised ``TelegraphException``
    """
    from html import escape  # local import keeps this block self-contained

    # Unwrap every tag outside TELEGRAPH_ALLOWED_TAGS, keeping its children in place.
    for tag in self.soup.find_all(recursive=True):
        if tag.name not in TELEGRAPH_ALLOWED_TAGS:
            tag.unwrap()

    if self.feed_title:
        telegraph_author = f"{self.feed_title}"
        if self.author and self.author not in self.feed_title:
            telegraph_author += f' ({self.author})'
        telegraph_author_url = self.link if self.link else ''
    else:
        telegraph_author = 'Generated by RSStT'
        telegraph_author_url = 'https://github.com/Rongronggg9/RSS-to-Telegram-Bot'

    telegraph_title = f'{self.title} - {telegraph_author}' if self.title else 'Generated by RSStT'

    # Build the page body in steps: a trailing `... if self.link else ''` on the whole
    # concatenation would replace the entire content with '' for posts without a link.
    telegraph_html_content = (str(self.soup) +
                              "<br><br>Generated by "
                              "<a href='https://github.com/Rongronggg9/RSS-to-Telegram-Bot'>RSStT</a>, "
                              "The copyright belongs to the source site.")
    if self.link:
        # the link comes from the feed, so escape it before placing it inside an attribute
        telegraph_html_content += f"<br><br><a href='{escape(self.link, quote=True)}'>Source</a>"

    try:
        # slices mirror the original length caps for title / author name / author url
        telegraph_page = \
            telegraph_api.create_page(title=telegraph_title[:256], html_content=telegraph_html_content,
                                      author_name=telegraph_author[:128],
                                      author_url=telegraph_author_url[:512])
    except telegraph.TelegraphException as e:
        logger.warning('Telegraph API error:' + str(e))
        return False

    telegraph_url = f"https://telegra.ph/{telegraph_page['path']}"
    # An empty-bodied post carrying telegraph_url renders as a title that links to the page.
    telegraph_post = Post(xml='', title=self.title, feed_title=self.feed_title,
                          link=self.link, author=self.author, telegraph_url=telegraph_url)
    telegraph_post.send_message(chat_ids, reply_to_msg_id)
    return True

def generate_pure_message(self):
self.text = Text('Content decoding failed!\n内容解码失败!')
self._add_metadata()
Expand All @@ -152,6 +214,11 @@ def generate_message(self):
self.messages = [message.BotServiceMsg(text) for text in self.get_split_html(4096)]
return

# Telegraph msg
if self.telegraph_url:
self.messages = [message.TelegraphMsg(text) for text in self.get_split_html(4096)]
return

media_tuple = tuple(self.media.get_valid_media())
media_msg_count = len(media_tuple)

Expand Down Expand Up @@ -197,18 +264,24 @@ def get_split_html(self, length_limit_head: int, head_count: int = -1, length_li

def _add_metadata(self):
plain_text = self.text.get_html(plain=True)
if self.feed_title:
author = self.author if self.author and self.author not in self.feed_title else None
self._add_via(self.feed_title, self.link, author)
if self.telegraph_url:
self._add_title(self.title)
return
if self.title and ('微博' not in self.feed_title or env.DEBUG):
title_tbc = self.title.replace('[图片]', '').replace('[视频]', '').strip().rstrip('.…')
similarity = fuzz.partial_ratio(title_tbc, plain_text[0:len(self.title) + 10])
logger.debug(f'{self.title} ({self.link}) is {similarity}% likely to be of no title.')
if similarity < 90:
self._add_title(self.title)
if self.feed_title:
author = self.author if self.author and self.author not in self.feed_title else None
self._add_via(self.feed_title, self.link, author)

def _add_title(self, title: str):
text_title = Text([Bold(Underline(title)), Br(), Br()])
if self.telegraph_url:
title = Link(title, param=self.telegraph_url)
title = Bold(Underline(title))
text_title = Text([title, Br(), Br()])
if self.text.is_listed():
self.text.content.insert(0, text_title)
return
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@ redis==3.5.3
listparser==0.18
lxml==4.6.3
fasteners==0.16.3
colorlog==6.4.1
colorlog==6.4.1
telegraph==1.4.1

0 comments on commit 528c5d7

Please sign in to comment.