feat(utils): add utils for WeChat MP (DIYgod#9487)

Motivation: Multiple routes need to fetch articles from WeChat MP. However, letting each route fetch articles on its own could lead to cache key collisions. Even when no cache key collision occurs, un-normalized URLs could lead to duplicated requests. What's more, articles from WeChat MP have odd formatting that needs to be fixed. A universal function for this work makes things easier for new route contributors. Note: To keep this PR as atomic as possible, I did not touch the broken routes. Once this PR is merged, I will try to fix them. Signed-off-by: Rongrong <15956627+Rongronggg9@users.noreply.github.com>
Naiqus · Apr 7, 2022 · a79cc20 · a79cc20
1 parent e0b7ca6
commit a79cc20
Show file tree

Hide file tree

Showing 12 changed files with 305 additions and 158 deletions.
diff --git a/docs/new-media.md b/docs/new-media.md
@@ -3344,7 +3344,7 @@ column 为 third 时可选的 category:
 
 ### 公众号（CareerEngine 来源）
 
-<Route author="HenryQW" example="/wechat/ce/595a5b14d7164e53908f1606" path="/wechat/ce/:id" :paramsDesc="['公众号 id, 在 [CareerEngine](https://search.careerengine.us/) 搜索公众号，通过 URL 中找到对应的公众号 id']"/>
+<Route author="HenryQW" example="/wechat/ce/595a5b14d7164e53908f1606" path="/wechat/ce/:id" :paramsDesc="['公众号 id, 在 [CareerEngine](https://search.careerengine.us/) 搜索公众号，通过 URL 中找到对应的公众号 id']" anticrawler="1"/>
 
 ### 公众号（Telegram 频道来源）
 
@@ -3380,7 +3380,7 @@ column 为 third 时可选的 category:
 
 ### 公众号 (wxnmh.com 来源)
 
-<Route author="laampui" example="/wechat/wxnmh/51798" path="/wechat/wxnmh/:id" :paramsDesc="['公众号 id, 打开 wxnmh.com, 在 URL 中找到 id']"/>
+<Route author="laampui" example="/wechat/wxnmh/51798" path="/wechat/wxnmh/:id" :paramsDesc="['公众号 id, 打开 wxnmh.com, 在 URL 中找到 id']" anticrawler="1"/>
 
 ### 公众号 (wechat-feeds 来源)
 

diff --git a/lib/routes/tencent/wechat/_README b/lib/routes/tencent/wechat/_README
@@ -0,0 +1 @@
+Make sure you read lib/utils/wechat-mp.js before adding a new route.
diff --git a/lib/routes/tencent/wechat/feeds.js b/lib/routes/tencent/wechat/feeds.js
@@ -1,42 +1,18 @@
 const parser = require('@/utils/rss-parser');
-const got = require('@/utils/got');
-const cheerio = require('cheerio');
+const { finishArticleItem } = require('@/utils/wechat-mp');
 
 module.exports = async (ctx) => {
     const { id } = ctx.params;
     const link = `https://github.com/hellodword/wechat-feeds/raw/feeds/${id}.xml`;
     const feed = await parser.parseURL(link);
 
-    const items = await Promise.all(
-        feed.items.map(async (item) => {
-            const cache = await ctx.cache.get(item.link);
-            if (cache) {
-                return Promise.resolve(JSON.parse(cache));
-            }
-
-            const response = await got.get(item.link);
-
-            const $ = cheerio.load(response.data);
-            const post = $('#js_content');
-
-            post.find('img').each((_, img) => {
-                const dataSrc = $(img).attr('data-src');
-                if (dataSrc) {
-                    $(img).attr('src', dataSrc);
-                }
-            });
-
-            const single = {
-                title: item.title,
-                description: post.html(),
-                pubDate: new Date(item.pubDate),
-                link: item.link,
-            };
-
-            ctx.cache.set(item.link, JSON.stringify(single));
-            return Promise.resolve(single);
-        })
-    );
+    const items = feed.items.map((item) => ({
+        title: item.title,
+        pubDate: new Date(item.pubDate),
+        link: item.link,
+        guid: item.link,
+    }));
+    await Promise.all(items.map(async (item) => await finishArticleItem(ctx, item)));
 
     ctx.state.data = {
         title: feed.title,

diff --git a/lib/routes/tencent/wechat/mp.js b/lib/routes/tencent/wechat/mp.js
@@ -1,6 +1,8 @@
 const got = require('@/utils/got');
 const cheerio = require('cheerio');
 const dayjs = require('dayjs');
+const { finishArticleItem } = require('@/utils/wechat-mp');
+
 module.exports = async (ctx) => {
     const { biz, hid, cid } = ctx.params;
     let cidurl = '';
@@ -26,32 +28,11 @@ module.exports = async (ctx) => {
     const mptitle = $('div.articles_header').find('a').text() + `|` + $('div.articles_header > h2.rich_media_title').text();
     const articledata = await Promise.all(
         list.map(async (item) => {
-            const link = item.link.replace('http://', 'https://');
-            const cache = await ctx.cache.get(link);
-            if (cache) {
-                return Promise.resolve(JSON.parse(cache));
-            }
-            const response2 = await got({
-                method: 'get',
-                url: link,
-            });
-            const articleHtml = response2.data;
-            const $2 = cheerio.load(articleHtml);
-            $2('img').removeAttr('src');
-            $2('div#js_profile_qrcode').remove();
-
-            const content = $2('div#js_content.rich_media_content')
-                .html()
-                .replace('iframe/preview.html?width=500&amp;height=375&amp;', 'txp/iframe/player.html?')
-                .replace('<iframe ', '<iframe width="640" height="360"')
-                .replace(/data-src/g, 'src');
-            const author = $2('div#meta_content:not(:last-child)').text();
             const single = {
-                content,
-                author,
+                link: item.link,
+                guid: item.link,
             };
-            ctx.cache.set(link, JSON.stringify(single));
-            return Promise.resolve(single);
+            return await finishArticleItem(ctx, single);
         })
     );
     ctx.state.data = {
@@ -67,9 +48,10 @@ module.exports = async (ctx) => {
                     src="${item.cover}"
                 ><br>
                 <br>
-                ${articledata[index].content}
+                ${articledata[index].description}
             `,
-            link: item.link,
+            link: articledata[index].link,
+            guid: articledata[index].guid,
             author: articledata[index].author,
             pubDate: dayjs.unix(item.sendtime).format(),
         })),

diff --git a/lib/routes/tencent/wechat/msgalbum.js b/lib/routes/tencent/wechat/msgalbum.js
@@ -1,6 +1,8 @@
 const got = require('@/utils/got');
 const cheerio = require('cheerio');
 const dayjs = require('dayjs');
+const { finishArticleItem } = require('@/utils/wechat-mp');
+
 module.exports = async (ctx) => {
     const { biz, aid } = ctx.params;
     const aidurl = `&album_id=${aid}`;
@@ -16,42 +18,22 @@ module.exports = async (ctx) => {
         list.map(async (item) => {
             const link = $(item).attr('data-link').replace('http://', 'https://');
             const title = $(item).attr('data-title');
-            const cache = await ctx.cache.get(link);
-            if (cache) {
-                return Promise.resolve(JSON.parse(cache));
-            }
-            const response2 = await got({
-                method: 'get',
-                url: link,
-            });
-            const articleHtml = response2.data;
-            const $2 = cheerio.load(articleHtml);
-            $2('img').removeAttr('src');
-            $2('div#js_profile_qrcode').remove();
-
-            const content = $2('div#js_content.rich_media_content')
-                .html()
-                .replace('iframe/preview.html?width=500&amp;height=375&amp;', 'txp/iframe/player.html?')
-                .replace('<iframe ', '<iframe width="640" height="360"')
-                .replace(/data-src/g, 'src');
-            const author = $2('div#meta_content:not(:last-child)').text();
             const single = {
-                content,
-                author,
-                link,
                 title,
+                link,
+                guid: link,
             };
-            ctx.cache.set(link, JSON.stringify(single));
-            return Promise.resolve(single);
+            return await finishArticleItem(ctx, single);
         })
     );
     ctx.state.data = {
         title: mptitle,
         link: `https://mp.weixin.qq.com/mp/appmsgalbum?__biz=${biz}&action=getalbum${aidurl}`,
         item: list.map((item, index) => ({
             title: articledata[index].title,
-            description: $(item).find('.album__item-img').html() + `<br><br>${articledata[index].content}`,
+            description: $(item).find('.album__item-img').html() + `<br><br>${articledata[index].description}`,
             link: articledata[index].link,
+            guid: articledata[index].guid,
             author: articledata[index].author,
             pubDate: dayjs.unix($(item).find('.js_article_create_time').text()).format(),
         })),

diff --git a/lib/routes/tencent/wechat/tgchannel.js b/lib/routes/tencent/wechat/tgchannel.js
@@ -1,5 +1,6 @@
 const got = require('@/utils/got');
 const cheerio = require('cheerio');
+const { finishArticleItem } = require('@/utils/wechat-mp');
 
 module.exports = async (ctx) => {
     const id = ctx.params.id;
@@ -120,37 +121,16 @@ module.exports = async (ctx) => {
                     title,
                     pubDate,
                     link,
-                    author,
+                    guid: link,
                 };
 
                 if (link !== undefined) {
-                    const value = await ctx.cache.get(link);
-                    if (value) {
-                        single.description = value;
-                    } else {
-                        try {
-                            const reponse = await got.get(link);
-                            const $ = cheerio.load(reponse.data);
-
-                            single.description = $('.rich_media_content')
-                                .html()
-                                .replace(/data-src/g, 'src');
-                            ctx.cache.set(link, single.description, 12 * 60 * 60);
-                        } catch (err) {
-                            single.description = item.find('.tgme_widget_message_text').html();
-                        }
+                    try {
+                        return await finishArticleItem(ctx, single);
+                    } catch (err) {
+                        single.description = item.find('.tgme_widget_message_text').html();
                     }
                 }
-
-                // 修复文字格式错误
-                single.description = single.description
-                    .replace(/(<strong.*?>)(.*?)(<\/strong>)/g, '$1<span style="font-size: 16px; line-height: 16px;">$2</span>$3')
-                    .replace(/<section(.*?)>(.*?)<\/section>/g, '<p $1>$2</p>')
-                    .replace(/(<p.*?>)(.*?)(<\/p>)/g, '$1<span style="font-size: 16px; line-height: 16px;">$2</span>$3')
-                    .replace(/<p.*?data-encc.*?>.*?<\/p>/g, '')
-                    .replace(/<h\d(.*?)>(.*?)<\/h\d>/g, '<p $1>$2</p>')
-                    .replace(/<br.*?>/g, '');
-
                 return single;
             })
             .get()
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Make sure you read lib/utils/wechat-mp.js before adding a new route.