Skip to content

Commit

Permalink
feat(parsing/medium): SVG/WebP/non-weibo pic fallback
Browse files Browse the repository at this point in the history
Signed-off-by: Rongrong <15956627+Rongronggg9@users.noreply.github.com>

#45
  • Loading branch information
Rongronggg9 committed Mar 4, 2022
1 parent c728fdf commit 19d2685
Show file tree
Hide file tree
Showing 6 changed files with 64 additions and 13 deletions.
1 change: 1 addition & 0 deletions docker-compose.yml.sample
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ services:
#- API_ID=1025907 # get it from https://core.telegram.org/api/obtaining_api_id
#- API_HASH=452b0359b988148995f22ff0f4229750 # get it from https://core.telegram.org/api/obtaining_api_id
#- IMG_RELAY_SERVER=https://images.weserv.nl/?url= # default: https://rsstt-img-relay.rongrong.workers.dev/
#- IMAGES_WESERV_NL=https://images.weserv.nl/ # default: https://images.weserv.nl/
#- USER_AGENT=Mozilla/5.0 (Android 12; Mobile; rv:68.0) Gecko/68.0 Firefox/96.0 # default: RSStT/2.0 RSS Reader
#- IPV6_PRIOR=1 # default: 0
#- T_PROXY=socks5://172.17.0.1:1080 # Proxy used to connect to the Telegram API
Expand Down
1 change: 1 addition & 0 deletions docs/advanced-settings.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
| `MULTIUSER` | Enable multi-user feature or not? | `0` | `1` |
| `CRON_SECOND` | Run the feed monitoring task at the n-th second of each minute? (0-59) | `30` | `0` |
| `IMG_RELAY_SERVER` | Media relay server URL | `https://images.weserv.nl/?url=` | `https://rsstt-img-relay.rongrong.workers.dev/` |
| `IMAGES_WESERV_NL` | images.weserv.nl URL | `https://images.weserv.nl/` | `https://images.weserv.nl/` |
| `DATABASE_URL` | Database URL [^5] | `postgres://user:pass@example.com:5432/table` | `sqlite://config/db.sqlite3?journal_mode=OFF` |
| `DEBUG` | Enable debug logging or not? | `1` | `0` |

Expand Down
11 changes: 10 additions & 1 deletion src/env.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,9 +157,18 @@ def __list_parser(var: Optional[str]) -> list[str]:

# ----- img relay server config -----
_img_relay_server = os.environ.get('IMG_RELAY_SERVER') or 'https://rsstt-img-relay.rongrong.workers.dev/'
IMG_RELAY_SERVER: Final = _img_relay_server + ('' if _img_relay_server.endswith(('/', '=')) else '/')
IMG_RELAY_SERVER: Final = ('https://' if not _img_relay_server.startswith('http') else '') \
+ _img_relay_server \
+ ('' if _img_relay_server.endswith(('/', '=')) else '/')
del _img_relay_server

# ----- images.weserv.nl config -----
# Base URL used to build images.weserv.nl requests. Taken from the IMAGES_WESERV_NL
# environment variable; an unset or empty value falls back to the public instance.
_weserv_base = os.environ.get('IMAGES_WESERV_NL') or 'https://images.weserv.nl/'
if not _weserv_base.startswith('http'):
    # no scheme supplied: default to HTTPS
    _weserv_base = 'https://' + _weserv_base
if not _weserv_base.endswith('/'):
    # always keep a trailing slash so a query string can be appended directly
    _weserv_base += '/'
IMAGES_WESERV_NL: Final = _weserv_base
del _weserv_base  # keep the temporary out of the module namespace

# ----- db config -----
_database_url = os.environ.get('DATABASE_URL') or 'sqlite://config/db.sqlite3?journal_mode=OFF'
DATABASE_URL: Final = (_database_url.replace('postgresql', 'postgres', 1) if _database_url.startswith('postgresql')
Expand Down
19 changes: 15 additions & 4 deletions src/parsing/html_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from attr import define

from src import web
from .medium import Video, Image, Media, Animation, Audio
from .medium import Video, Image, Media, Animation, Audio, construct_images_weserv_nl_url
from .html_node import *
from .utils import stripNewline, stripLineEnd, is_absolute_link, emojify

Expand Down Expand Up @@ -156,16 +156,27 @@ async def _parse_item(self, soup: Union[PageElement, BeautifulSoup, Tag, Navigab
_multi_src.append(src) if src else None
multi_src = []
is_gif = False
is_webp = False
for _src in _multi_src:
if not isinstance(_src, str):
continue
if not is_absolute_link(_src) and self.feed_link:
_src = urljoin(self.feed_link, _src)
if urlparse(_src).path.endswith(('.gif', '.gifv', '.webm', '.mp4', '.m4v', '.webp')):
path = urlparse(_src).path
if path.endswith(('.gif', '.gifv', '.webm', '.mp4', '.m4v')):
is_gif = True
if path.endswith('.webp'):
is_webp = True
multi_src.append(_src)
if multi_src:
self.media.add(Image(multi_src) if not is_gif else Animation(multi_src))
if is_webp:
media = Image(multi_src)
media.urls = [construct_images_weserv_nl_url(multi_src[0])]
self.media.add(media)
elif is_gif:
self.media.add(Animation(multi_src))
else:
self.media.add(Image(multi_src))
return None

if tag == 'video':
Expand Down Expand Up @@ -223,7 +234,7 @@ async def _parse_item(self, soup: Union[PageElement, BeautifulSoup, Tag, Navigab
# noinspection PyBroadException
try:
page = await web.get(src, timeout=3, decode=True, semaphore=False)
if page.status != 200:
if page.status != 200 or not page.content:
raise ValueError
text = BeautifulSoup(page.content, 'lxml').title.text
except Exception:
Expand Down
40 changes: 33 additions & 7 deletions src/parsing/medium.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
MessageMediaPhoto, MessageMediaDocument
from telethon.errors import FloodWaitError, SlowModeWaitError, ServerError
from asyncstdlib.functools import lru_cache
from urllib.parse import urlencode

from src import env, log, web, locks
from src.parsing.html_node import Link, Br, Text, HtmlTree
Expand Down Expand Up @@ -226,17 +227,23 @@ async def validate(self, flush: bool = False) -> bool:
async with self.validating_lock:
while self.urls:
url = self.urls.pop(0)
if url.startswith(env.IMAGES_WESERV_NL):
self.valid = True
self.chosen_url = url
self._server_change_count = 0
return True
medium_info = await get_medium_info(url)
if medium_info is None:
continue
self.size, self.width, self.height, self.content_type = medium_info

if self.type == IMAGE:
# drop SVG
if self.content_type and self.content_type.lower().startswith('image/svg'):
self.valid = False
self.drop_silently = True
return False
# force convert WEBP/SVG to PNG
if self.content_type and self.content_type.find('webp') != -1 \
or self.content_type.startswith('application') or self.content_type.find('svg') != -1:
self.valid = True
url = construct_images_weserv_nl_url(self.original_urls[0])
self.urls = [url]
# always invalid
elif self.width + self.height > 10000 or self.size > self.maxSize:
self.valid = False
Expand Down Expand Up @@ -268,8 +275,6 @@ async def validate(self, flush: bool = False) -> bool:
await self.change_server()
return True

# TODO: reduce non-weibo pic size

self.valid = False
return await self.type_fallback()

Expand Down Expand Up @@ -377,9 +382,12 @@ class Image(Medium):
def __init__(self, url: Union[str, list[str]]):
super().__init__(url)
new_urls = []
already_images_weserv_nl = False
for url in self.urls:
sinaimg_match = sinaimg_size_parser(url)
pixiv_match = pixiv_size_parser(url)
if url.startswith(env.IMAGES_WESERV_NL):
already_images_weserv_nl = True
if not any([sinaimg_match, pixiv_match]):
new_urls.append(url)
continue
Expand All @@ -400,6 +408,8 @@ def __init__(self, url: Union[str, list[str]]):
if url not in new_urls:
new_urls.append(url)
self.urls = new_urls
if not already_images_weserv_nl:
self.urls.append(construct_images_weserv_nl_url(self.urls[0])) # use for final fallback

async def change_server(self):
sinaimg_server_match = sinaimg_server_parser(self.chosen_url)
Expand Down Expand Up @@ -637,6 +647,21 @@ def hash(self):
return '|'.join(medium.hash for medium in self._media)


def construct_images_weserv_nl_url(url: str,
                                   width: int = 1280,
                                   height: int = 1280,
                                   fit: str = 'inside',
                                   output_format: str = 'png') -> str:
    """
    Build an images.weserv.nl request URL for the given image.

    The result has the form ``<env.IMAGES_WESERV_NL>?url=...&w=...&h=...&fit=...&output=...``
    with every value percent-encoded by ``urlencode``.

    :param url: URL of the source image, sent as the ``url`` query parameter
    :param width: value of the ``w`` query parameter (default: 1280)
    :param height: value of the ``h`` query parameter (default: 1280)
    :param fit: value of the ``fit`` query parameter (default: ``'inside'``)
    :param output_format: value of the ``output`` query parameter (default: ``'png'``)
    :return: the full request URL
    """
    # Ordered pairs keep the query parameters in a fixed, predictable order.
    query_pairs = (
        ('url', url),
        ('w', width),
        ('h', height),
        ('fit', fit),
        ('output', output_format),
    )
    return f'{env.IMAGES_WESERV_NL}?{urlencode(query_pairs)}'


@lru_cache(maxsize=1024)
async def get_medium_info(url: str) -> Optional[tuple[int, int, int, Optional[str]]]:
if url.startswith('data:'):
Expand All @@ -651,6 +676,7 @@ async def get_medium_info(url: str) -> Optional[tuple[int, int, int, Optional[st

size = int(r.headers.get('Content-Length') or -1)
content_type = r.headers.get('Content-Type')
content_type = content_type.lower() if content_type else None
is_image = content_type and content_type.startswith('image/')

width = height = -1
Expand Down
5 changes: 4 additions & 1 deletion src/parsing/post_formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from .splitter import get_plain_text_length
from .html_parser import parse
from .html_node import *
from .medium import Media, Image, Video, Audio, File
from .medium import Media, Image, Video, Audio, File, construct_images_weserv_nl_url

AUTO: Final = 0
DISABLE: Final = -1
Expand Down Expand Up @@ -438,6 +438,9 @@ async def parse_html(self):
continue
elif not enclosure.type:
medium = File(enclosure.url)
elif enclosure.type.find('webp') != -1 or enclosure.type.find('svg') != -1:
medium = Image(enclosure.url)
medium.url = construct_images_weserv_nl_url(enclosure.url)
elif enclosure.type.startswith('image/gif'):
medium = Audio(enclosure.url)
elif enclosure.type.startswith('audio'):
Expand Down

0 comments on commit 19d2685

Please sign in to comment.