Skip to content

Commit

Permalink
feat(utils): add utils for WeChat MP (DIYgod#9487)
Browse files Browse the repository at this point in the history
Motivation:
There are multiple routes that need to fetch articles from WeChat MP.
However, letting them fetch articles by themselves could potentially
lead to cache key collisions. Even if cache key collisions do not occur,
un-normalized URLs could potentially lead to duplicate requests.
What's more, articles from WeChat MP have weird formatting that needs to
be fixed. Creating a universal function to do this work makes things
easier for new route contributors.

Note:
In order to keep this PR as atomic as possible, I did not touch
those broken routes. Once this PR is merged, I will try to fix them.

Signed-off-by: Rongrong <15956627+Rongronggg9@users.noreply.github.com>
  • Loading branch information
Rongronggg9 authored Apr 7, 2022
1 parent e0b7ca6 commit a79cc20
Show file tree
Hide file tree
Showing 12 changed files with 305 additions and 158 deletions.
4 changes: 2 additions & 2 deletions docs/new-media.md
Original file line number Diff line number Diff line change
Expand Up @@ -3344,7 +3344,7 @@ column 为 third 时可选的 category:

### 公众号(CareerEngine 来源)

<Route author="HenryQW" example="/wechat/ce/595a5b14d7164e53908f1606" path="/wechat/ce/:id" :paramsDesc="['公众号 id, 在 [CareerEngine](https://search.careerengine.us/) 搜索公众号,通过 URL 中找到对应的公众号 id']"/>
<Route author="HenryQW" example="/wechat/ce/595a5b14d7164e53908f1606" path="/wechat/ce/:id" :paramsDesc="['公众号 id, 在 [CareerEngine](https://search.careerengine.us/) 搜索公众号,通过 URL 中找到对应的公众号 id']" anticrawler="1"/>

### 公众号(Telegram 频道来源)

Expand Down Expand Up @@ -3380,7 +3380,7 @@ column 为 third 时可选的 category:

### 公众号 (wxnmh.com 来源)

<Route author="laampui" example="/wechat/wxnmh/51798" path="/wechat/wxnmh/:id" :paramsDesc="['公众号 id, 打开 wxnmh.com, 在 URL 中找到 id']"/>
<Route author="laampui" example="/wechat/wxnmh/51798" path="/wechat/wxnmh/:id" :paramsDesc="['公众号 id, 打开 wxnmh.com, 在 URL 中找到 id']" anticrawler="1"/>

### 公众号 (wechat-feeds 来源)

Expand Down
1 change: 1 addition & 0 deletions lib/routes/tencent/wechat/_README
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Make sure you read lib/utils/wechat-mp.js before adding a new route.
40 changes: 8 additions & 32 deletions lib/routes/tencent/wechat/feeds.js
Original file line number Diff line number Diff line change
@@ -1,42 +1,18 @@
const parser = require('@/utils/rss-parser');
const got = require('@/utils/got');
const cheerio = require('cheerio');
const { finishArticleItem } = require('@/utils/wechat-mp');

module.exports = async (ctx) => {
const { id } = ctx.params;
const link = `https://github.com/hellodword/wechat-feeds/raw/feeds/${id}.xml`;
const feed = await parser.parseURL(link);

const items = await Promise.all(
feed.items.map(async (item) => {
const cache = await ctx.cache.get(item.link);
if (cache) {
return Promise.resolve(JSON.parse(cache));
}

const response = await got.get(item.link);

const $ = cheerio.load(response.data);
const post = $('#js_content');

post.find('img').each((_, img) => {
const dataSrc = $(img).attr('data-src');
if (dataSrc) {
$(img).attr('src', dataSrc);
}
});

const single = {
title: item.title,
description: post.html(),
pubDate: new Date(item.pubDate),
link: item.link,
};

ctx.cache.set(item.link, JSON.stringify(single));
return Promise.resolve(single);
})
);
const items = feed.items.map((item) => ({
title: item.title,
pubDate: new Date(item.pubDate),
link: item.link,
guid: item.link,
}));
await Promise.all(items.map(async (item) => await finishArticleItem(ctx, item)));

ctx.state.data = {
title: feed.title,
Expand Down
34 changes: 8 additions & 26 deletions lib/routes/tencent/wechat/mp.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
const got = require('@/utils/got');
const cheerio = require('cheerio');
const dayjs = require('dayjs');
const { finishArticleItem } = require('@/utils/wechat-mp');

module.exports = async (ctx) => {
const { biz, hid, cid } = ctx.params;
let cidurl = '';
Expand All @@ -26,32 +28,11 @@ module.exports = async (ctx) => {
const mptitle = $('div.articles_header').find('a').text() + `|` + $('div.articles_header > h2.rich_media_title').text();
const articledata = await Promise.all(
list.map(async (item) => {
const link = item.link.replace('http://', 'https://');
const cache = await ctx.cache.get(link);
if (cache) {
return Promise.resolve(JSON.parse(cache));
}
const response2 = await got({
method: 'get',
url: link,
});
const articleHtml = response2.data;
const $2 = cheerio.load(articleHtml);
$2('img').removeAttr('src');
$2('div#js_profile_qrcode').remove();

const content = $2('div#js_content.rich_media_content')
.html()
.replace('iframe/preview.html?width=500&amp;height=375&amp;', 'txp/iframe/player.html?')
.replace('<iframe ', '<iframe width="640" height="360"')
.replace(/data-src/g, 'src');
const author = $2('div#meta_content:not(:last-child)').text();
const single = {
content,
author,
link: item.link,
guid: item.link,
};
ctx.cache.set(link, JSON.stringify(single));
return Promise.resolve(single);
return await finishArticleItem(ctx, single);
})
);
ctx.state.data = {
Expand All @@ -67,9 +48,10 @@ module.exports = async (ctx) => {
src="${item.cover}"
><br>
<br>
${articledata[index].content}
${articledata[index].description}
`,
link: item.link,
link: articledata[index].link,
guid: articledata[index].guid,
author: articledata[index].author,
pubDate: dayjs.unix(item.sendtime).format(),
})),
Expand Down
32 changes: 7 additions & 25 deletions lib/routes/tencent/wechat/msgalbum.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
const got = require('@/utils/got');
const cheerio = require('cheerio');
const dayjs = require('dayjs');
const { finishArticleItem } = require('@/utils/wechat-mp');

module.exports = async (ctx) => {
const { biz, aid } = ctx.params;
const aidurl = `&album_id=${aid}`;
Expand All @@ -16,42 +18,22 @@ module.exports = async (ctx) => {
list.map(async (item) => {
const link = $(item).attr('data-link').replace('http://', 'https://');
const title = $(item).attr('data-title');
const cache = await ctx.cache.get(link);
if (cache) {
return Promise.resolve(JSON.parse(cache));
}
const response2 = await got({
method: 'get',
url: link,
});
const articleHtml = response2.data;
const $2 = cheerio.load(articleHtml);
$2('img').removeAttr('src');
$2('div#js_profile_qrcode').remove();

const content = $2('div#js_content.rich_media_content')
.html()
.replace('iframe/preview.html?width=500&amp;height=375&amp;', 'txp/iframe/player.html?')
.replace('<iframe ', '<iframe width="640" height="360"')
.replace(/data-src/g, 'src');
const author = $2('div#meta_content:not(:last-child)').text();
const single = {
content,
author,
link,
title,
link,
guid: link,
};
ctx.cache.set(link, JSON.stringify(single));
return Promise.resolve(single);
return await finishArticleItem(ctx, single);
})
);
ctx.state.data = {
title: mptitle,
link: `https://mp.weixin.qq.com/mp/appmsgalbum?__biz=${biz}&action=getalbum${aidurl}`,
item: list.map((item, index) => ({
title: articledata[index].title,
description: $(item).find('.album__item-img').html() + `<br><br>${articledata[index].content}`,
description: $(item).find('.album__item-img').html() + `<br><br>${articledata[index].description}`,
link: articledata[index].link,
guid: articledata[index].guid,
author: articledata[index].author,
pubDate: dayjs.unix($(item).find('.js_article_create_time').text()).format(),
})),
Expand Down
32 changes: 6 additions & 26 deletions lib/routes/tencent/wechat/tgchannel.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
const got = require('@/utils/got');
const cheerio = require('cheerio');
const { finishArticleItem } = require('@/utils/wechat-mp');

module.exports = async (ctx) => {
const id = ctx.params.id;
Expand Down Expand Up @@ -120,37 +121,16 @@ module.exports = async (ctx) => {
title,
pubDate,
link,
author,
guid: link,
};

if (link !== undefined) {
const value = await ctx.cache.get(link);
if (value) {
single.description = value;
} else {
try {
const reponse = await got.get(link);
const $ = cheerio.load(reponse.data);

single.description = $('.rich_media_content')
.html()
.replace(/data-src/g, 'src');
ctx.cache.set(link, single.description, 12 * 60 * 60);
} catch (err) {
single.description = item.find('.tgme_widget_message_text').html();
}
try {
return await finishArticleItem(ctx, single);
} catch (err) {
single.description = item.find('.tgme_widget_message_text').html();
}
}

// 修复文字格式错误
single.description = single.description
.replace(/(<strong.*?>)(.*?)(<\/strong>)/g, '$1<span style="font-size: 16px; line-height: 16px;">$2</span>$3')
.replace(/<section(.*?)>(.*?)<\/section>/g, '<p $1>$2</p>')
.replace(/(<p.*?>)(.*?)(<\/p>)/g, '$1<span style="font-size: 16px; line-height: 16px;">$2</span>$3')
.replace(/<p.*?data-encc.*?>.*?<\/p>/g, '')
.replace(/<h\d(.*?)>(.*?)<\/h\d>/g, '<p $1>$2</p>')
.replace(/<br.*?>/g, '');

return single;
})
.get()
Expand Down
Loading

0 comments on commit a79cc20

Please sign in to comment.