From 1498a743c6fc615d32b62cd4c54fa7773826e81e Mon Sep 17 00:00:00 2001 From: Rongrong <15956627+Rongronggg9@users.noreply.github.com> Date: Tue, 22 Mar 2022 01:59:31 +0800 Subject: [PATCH] feat(*): env var `TABLE_TO_IMAGE` to determine should HTML tables be converted to image or just be dropped Signed-off-by: Rongrong <15956627+Rongronggg9@users.noreply.github.com> --- .env.sample | 1 + docker-compose.yml.sample | 1 + docs/advanced-settings.md | 1 + src/env.py | 3 +- src/parsing/html_parser.py | 5 +-- src/parsing/table_drawer.py | 61 +++++++++++++++++++++---------------- 6 files changed, 43 insertions(+), 29 deletions(-) diff --git a/.env.sample b/.env.sample index db2f564925..dbe72f57b8 100644 --- a/.env.sample +++ b/.env.sample @@ -30,5 +30,6 @@ TELEGRAPH_TOKEN=" #R_PROXY=socks5://172.17.0.1:1080 # Proxy used to fetch feeds #PROXY_BYPASS_PRIVATE=1 # default: 0 #PROXY_BYPASS_DOMAINS=example.com;example.net +#TABLE_TO_IMAGE=1 # default: 0 #DEBUG=1 # debug logging, default: 0 # ↑------ Advanced settings ------↑ # diff --git a/docker-compose.yml.sample b/docker-compose.yml.sample index 7347274f68..16327aa806 100644 --- a/docker-compose.yml.sample +++ b/docker-compose.yml.sample @@ -40,5 +40,6 @@ services: #- R_PROXY=socks5://172.17.0.1:1080 # Proxy used to fetch feeds #- PROXY_BYPASS_PRIVATE=1 # default: 0 #- PROXY_BYPASS_DOMAINS=example.com;example.net + #- TABLE_TO_IMAGE=1 # default: 0 #- DEBUG=1 # debug logging, default: 0 # ↑------ Advanced settings ------↑ # \ No newline at end of file diff --git a/docs/advanced-settings.md b/docs/advanced-settings.md index d6fd5298e4..d15d712320 100644 --- a/docs/advanced-settings.md +++ b/docs/advanced-settings.md @@ -45,6 +45,7 @@ | `IMG_RELAY_SERVER` | Media relay server URL | `https://images.weserv.nl/?url=` | `https://rsstt-img-relay.rongrong.workers.dev/` | | `IMAGES_WESERV_NL` | images.weserv.nl URL | `https://t0.nl/` | `https://images.weserv.nl/` | | `DATABASE_URL` | Database URL [^5] | `postgres://user:pass@example.com:5432/table` | `sqlite://config/db.sqlite3?journal_mode=OFF` | +| `TABLE_TO_IMAGE` | Convert tables to image (causing high CPU usage) or just drop them? | `1` | `0` | | `DEBUG` | Enable debug logging or not? | `1` | `0` | ## Manager options diff --git a/src/env.py b/src/env.py index 707bccc778..049f4449c9 100644 --- a/src/env.py +++ b/src/env.py @@ -175,8 +175,9 @@ def __list_parser(var: Optional[str]) -> list[str]: else _database_url) del _database_url -# ----- debug config ----- +# ----- misc config ----- DEBUG: Final = __bool_parser(os.environ.get('DEBUG')) +TABLE_TO_IMAGE: Final = __bool_parser(os.environ.get('TABLE_TO_IMAGE')) # ----- environment config ----- RAILWAY_STATIC_URL: Final = os.environ.get('RAILWAY_STATIC_URL') diff --git a/src/parsing/html_parser.py b/src/parsing/html_parser.py index 463a2ea3f1..ea7f241bda 100644 --- a/src/parsing/html_parser.py +++ b/src/parsing/html_parser.py @@ -8,7 +8,7 @@ from urllib.parse import urlparse from attr import define -from src import web +from src import web, env from .medium import Video, Image, Media, Animation, Audio, UploadedImage from .html_node import * from .utils import stripNewline, stripLineEnd, isAbsoluteHttpLink, resolve_relative_link, emojify, is_emoticon @@ -88,7 +88,8 @@ async def _parse_item(self, soup: Union[PageElement, BeautifulSoup, Tag, Navigab for row in rows: columns = row.findAll(('td', 'th')) if len(columns) != 1: - self.media.add(UploadedImage(convert_table_to_png(str(soup)))) + if env.TABLE_TO_IMAGE: + self.media.add(UploadedImage(convert_table_to_png(str(soup)))) return None row_content = await self._parse_item(columns[0]) if row_content: diff --git a/src/parsing/table_drawer.py b/src/parsing/table_drawer.py index 39ae2000a2..23b0666943 100644 --- a/src/parsing/table_drawer.py +++ b/src/parsing/table_drawer.py @@ -33,7 +33,7 @@ plt.rcParams['axes.unicode_minus'] = False filterwarnings('error', 'constrained_layout not applied', UserWarning) - +filterwarnings('ignore', "coroutine 'convert_table_to_png' was never awaited", RuntimeWarning) def _convert_table_to_png(table_html: str) -> Optional[BytesIO]: soup = BeautifulSoup(table_html, 'lxml') @@ -87,40 +87,49 @@ def _convert_table_to_png(table_html: str) -> Optional[BytesIO]: auto_set_column_width_flag = True for tries in range(2): - # draw table - fig, ax = plt.subplots(figsize=(8, 8)) - table = ax.table(cellText=cell_texts, - rowLabels=row_labels or None, - colLabels=column_labels or None, - loc='center', - cellLoc='center', - rowLoc='center') - row_heights = defaultdict(lambda: 0) - if auto_set_column_width_flag: - table.auto_set_column_width(tuple(range(max_columns))) - # set row height - for xy, cell in table.get_celld().items(): - text = cell.get_text().get_text() - text = fill(text.strip(), wrap_length) - cell.get_text().set_text(text) - row_heights[xy[0]] = max( - cell.get_height() * (text.count('\n') + 1) * 0.75 + cell.get_height() * 0.25, - row_heights[xy[0]] - ) - for xy, cell in table.get_celld().items(): - cell.set_height(row_heights[xy[0]]) - fig.set_constrained_layout(True) - ax.axis('off') - buffer = BytesIO() try: + # draw table + fig, ax = plt.subplots(figsize=(8, 8)) + table = ax.table(cellText=cell_texts, + rowLabels=row_labels or None, + colLabels=column_labels or None, + loc='center', + cellLoc='center', + rowLoc='center') + row_heights = defaultdict(lambda: 0) + if auto_set_column_width_flag: + table.auto_set_column_width(tuple(range(max_columns))) + # set row height + for xy, cell in table.get_celld().items(): + text = cell.get_text().get_text() + text = fill(text.strip(), wrap_length) + cell.get_text().set_text(text) + row_heights[xy[0]] = max( + cell.get_height() * (text.count('\n') + 1) * 0.75 + cell.get_height() * 0.25, + row_heights[xy[0]] + ) + for xy, cell in table.get_celld().items(): + cell.set_height(row_heights[xy[0]]) + fig.set_constrained_layout(True) + ax.axis('off') + buffer = BytesIO() fig.savefig(buffer, format='png', dpi=200) except UserWarning: # if auto_set_column_width_flag: # auto_set_column_width_flag = False # oops, overflowed! # continue # once a figure is exported, some stuff may be frozen, so we need to re-create the table return None + except Exception as e: + raise e + finally: + # noinspection PyBroadException + try: + plt.close() + except Exception: + pass # crop + # noinspection PyUnboundLocalVariable image = Image.open(buffer) ori_width, ori_height = image.size upper = left = float('inf')