feat(parsing/medium): SVG/WebP/non-weibo pic fallback

Signed-off-by: Rongrong <15956627+Rongronggg9@users.noreply.github.com> #45
xlmm · Mar 4, 2022 · 19d2685
1 parent c728fdf
commit 19d2685
Show file tree

Hide file tree

Showing 6 changed files with 64 additions and 13 deletions.
diff --git a/docker-compose.yml.sample b/docker-compose.yml.sample
@@ -33,6 +33,7 @@ services:
       #- API_ID=1025907  # get it from https://core.telegram.org/api/obtaining_api_id
       #- API_HASH=452b0359b988148995f22ff0f4229750  # get it from https://core.telegram.org/api/obtaining_api_id
       #- IMG_RELAY_SERVER=https://images.weserv.nl/?url=  # default: https://rsstt-img-relay.rongrong.workers.dev/
+      #- IMAGES_WESERV_NL=https://images.weserv.nl/  # default: https://images.weserv.nl/
       #- USER_AGENT=Mozilla/5.0 (Android 12; Mobile; rv:68.0) Gecko/68.0 Firefox/96.0  # default: RSStT/2.0 RSS Reader
       #- IPV6_PRIOR=1  # default: 0
       #- T_PROXY=socks5://172.17.0.1:1080  # Proxy used to connect to the Telegram API

diff --git a/docs/advanced-settings.md b/docs/advanced-settings.md
@@ -43,6 +43,7 @@
 | `MULTIUSER`        | Enable multi-user feature or not?                                      | `0`                                           | `1`                                             |
 | `CRON_SECOND`      | Run the feed monitoring task at the n-th second of each minute? (0-59) | `30`                                          | `0`                                             |
 | `IMG_RELAY_SERVER` | Media relay server URL                                                 | `https://images.weserv.nl/?url=`              | `https://rsstt-img-relay.rongrong.workers.dev/` |
+| `IMAGES_WESERV_NL` | images.weserv.nl URL, used to convert WebP/SVG images to PNG           | `https://images.weserv.nl/`                   | `https://images.weserv.nl/`                     |
 | `DATABASE_URL`     | Database URL [^5]                                                      | `postgres://user:pass@example.com:5432/table` | `sqlite://config/db.sqlite3?journal_mode=OFF`   |
 | `DEBUG`            | Enable debug logging or not?                                           | `1`                                           | `0`                                             |
 

diff --git a/src/env.py b/src/env.py
@@ -157,9 +157,18 @@ def __list_parser(var: Optional[str]) -> list[str]:
 
 # ----- img relay server config -----
 _img_relay_server = os.environ.get('IMG_RELAY_SERVER') or 'https://rsstt-img-relay.rongrong.workers.dev/'
-IMG_RELAY_SERVER: Final = _img_relay_server + ('' if _img_relay_server.endswith(('/', '=')) else '/')
+IMG_RELAY_SERVER: Final = ('https://' if not _img_relay_server.startswith('http') else '') \
+                          + _img_relay_server \
+                          + ('' if _img_relay_server.endswith(('/', '=')) else '/')
 del _img_relay_server
 
+# ----- images.weserv.nl config -----
+_images_weserv_nl = os.environ.get('IMAGES_WESERV_NL') or 'https://images.weserv.nl/'
+IMAGES_WESERV_NL: Final = ('https://' if not _images_weserv_nl.startswith('http') else '') \
+                          + _images_weserv_nl \
+                          + ('' if _images_weserv_nl.endswith('/') else '/')
+del _images_weserv_nl
+
 # ----- db config -----
 _database_url = os.environ.get('DATABASE_URL') or 'sqlite://config/db.sqlite3?journal_mode=OFF'
 DATABASE_URL: Final = (_database_url.replace('postgresql', 'postgres', 1) if _database_url.startswith('postgresql')

diff --git a/src/parsing/html_parser.py b/src/parsing/html_parser.py
@@ -10,7 +10,7 @@
 from attr import define
 
 from src import web
-from .medium import Video, Image, Media, Animation, Audio
+from .medium import Video, Image, Media, Animation, Audio, construct_images_weserv_nl_url
 from .html_node import *
 from .utils import stripNewline, stripLineEnd, is_absolute_link, emojify
 
@@ -156,16 +156,27 @@ async def _parse_item(self, soup: Union[PageElement, BeautifulSoup, Tag, Navigab
                 _multi_src.append(src) if src else None
             multi_src = []
             is_gif = False
+            is_webp = False
             for _src in _multi_src:
                 if not isinstance(_src, str):
                     continue
                 if not is_absolute_link(_src) and self.feed_link:
                     _src = urljoin(self.feed_link, _src)
-                if urlparse(_src).path.endswith(('.gif', '.gifv', '.webm', '.mp4', '.m4v', '.webp')):
+                path = urlparse(_src).path
+                if path.endswith(('.gif', '.gifv', '.webm', '.mp4', '.m4v')):
                     is_gif = True
+                if path.endswith('.webp'):
+                    is_webp = True
                 multi_src.append(_src)
             if multi_src:
-                self.media.add(Image(multi_src) if not is_gif else Animation(multi_src))
+                if is_webp:
+                    media = Image(multi_src)
+                    media.urls = [construct_images_weserv_nl_url(multi_src[0])]
+                    self.media.add(media)
+                elif is_gif:
+                    self.media.add(Animation(multi_src))
+                else:
+                    self.media.add(Image(multi_src))
             return None
 
         if tag == 'video':
@@ -223,7 +234,7 @@ async def _parse_item(self, soup: Union[PageElement, BeautifulSoup, Tag, Navigab
                 # noinspection PyBroadException
                 try:
                     page = await web.get(src, timeout=3, decode=True, semaphore=False)
-                    if page.status != 200:
+                    if page.status != 200 or not page.content:
                         raise ValueError
                     text = BeautifulSoup(page.content, 'lxml').title.text
                 except Exception:

diff --git a/src/parsing/medium.py b/src/parsing/medium.py
@@ -13,6 +13,7 @@
     MessageMediaPhoto, MessageMediaDocument
 from telethon.errors import FloodWaitError, SlowModeWaitError, ServerError
 from asyncstdlib.functools import lru_cache
+from urllib.parse import urlencode
 
 from src import env, log, web, locks
 from src.parsing.html_node import Link, Br, Text, HtmlTree
@@ -226,17 +227,23 @@ async def validate(self, flush: bool = False) -> bool:
         async with self.validating_lock:
             while self.urls:
                 url = self.urls.pop(0)
+                if url.startswith(env.IMAGES_WESERV_NL):
+                    self.valid = True
+                    self.chosen_url = url
+                    self._server_change_count = 0
+                    return True
                 medium_info = await get_medium_info(url)
                 if medium_info is None:
                     continue
                 self.size, self.width, self.height, self.content_type = medium_info
 
                 if self.type == IMAGE:
-                    # drop SVG
-                    if self.content_type and self.content_type.lower().startswith('image/svg'):
-                        self.valid = False
-                        self.drop_silently = True
-                        return False
+                    # force convert WEBP/SVG to PNG
+                    if self.content_type and self.content_type.find('webp') != -1 \
+                            or self.content_type.startswith('application') or self.content_type.find('svg') != -1:
+                        self.valid = True
+                        url = construct_images_weserv_nl_url(self.original_urls[0])
+                        self.urls = [url]
                     # always invalid
                     elif self.width + self.height > 10000 or self.size > self.maxSize:
                         self.valid = False
@@ -268,8 +275,6 @@ async def validate(self, flush: bool = False) -> bool:
                         await self.change_server()
                     return True
 
-                # TODO: reduce non-weibo pic size
-
             self.valid = False
             return await self.type_fallback()
 
@@ -377,9 +382,12 @@ class Image(Medium):
     def __init__(self, url: Union[str, list[str]]):
         super().__init__(url)
         new_urls = []
+        already_images_weserv_nl = False
         for url in self.urls:
             sinaimg_match = sinaimg_size_parser(url)
             pixiv_match = pixiv_size_parser(url)
+            if url.startswith(env.IMAGES_WESERV_NL):
+                already_images_weserv_nl = True
             if not any([sinaimg_match, pixiv_match]):
                 new_urls.append(url)
                 continue
@@ -400,6 +408,8 @@ def __init__(self, url: Union[str, list[str]]):
             if url not in new_urls:
                 new_urls.append(url)
         self.urls = new_urls
+        if not already_images_weserv_nl:
+            self.urls.append(construct_images_weserv_nl_url(self.urls[0]))  # use for final fallback
 
     async def change_server(self):
         sinaimg_server_match = sinaimg_server_parser(self.chosen_url)
@@ -637,6 +647,21 @@ def hash(self):
         return '|'.join(medium.hash for medium in self._media)
 
 
+def construct_images_weserv_nl_url(url: str,
+                                   width: int = 1280,
+                                   height: int = 1280,
+                                   fit: str = 'inside',
+                                   output_format: str = 'png') -> str:
+    query_string = urlencode({
+        'url': url,
+        'w': width,
+        'h': height,
+        'fit': fit,
+        'output': output_format,
+    })
+    return env.IMAGES_WESERV_NL + '?' + query_string
+
+
 @lru_cache(maxsize=1024)
 async def get_medium_info(url: str) -> Optional[tuple[int, int, int, Optional[str]]]:
     if url.startswith('data:'):
@@ -651,6 +676,7 @@ async def get_medium_info(url: str) -> Optional[tuple[int, int, int, Optional[st
 
     size = int(r.headers.get('Content-Length') or -1)
     content_type = r.headers.get('Content-Type')
+    content_type = content_type.lower() if content_type else None
     is_image = content_type and content_type.startswith('image/')
 
     width = height = -1

diff --git a/src/parsing/post_formatter.py b/src/parsing/post_formatter.py
@@ -19,7 +19,7 @@
 from .splitter import get_plain_text_length
 from .html_parser import parse
 from .html_node import *
-from .medium import Media, Image, Video, Audio, File
+from .medium import Media, Image, Video, Audio, File, construct_images_weserv_nl_url
 
 AUTO: Final = 0
 DISABLE: Final = -1
@@ -438,6 +438,9 @@ async def parse_html(self):
                     continue
                 elif not enclosure.type:
                     medium = File(enclosure.url)
+                elif enclosure.type.find('webp') != -1 or enclosure.type.find('svg') != -1:
+                    medium = Image(enclosure.url)
+                    medium.url = construct_images_weserv_nl_url(enclosure.url)
                 elif enclosure.type.startswith('image/gif'):
                     medium = Audio(enclosure.url)
                 elif enclosure.type.startswith('audio'):