Skip to content

Commit

Permalink
fix(route)(wechat/ce): anti-crawler and pubDate (#9495)
Browse files Browse the repository at this point in the history
Signed-off-by: Rongrong <15956627+Rongronggg9@users.noreply.github.com>
  • Loading branch information
Rongronggg9 authored Apr 8, 2022
1 parent ef94bcd commit 2dab2ad
Showing 1 changed file with 45 additions and 16 deletions.
61 changes: 45 additions & 16 deletions lib/routes/tencent/wechat/ce.js
Original file line number Diff line number Diff line change
@@ -1,33 +1,62 @@
const parser = require('@/utils/rss-parser');
const got = require('@/utils/got');
const cheerio = require('cheerio');
const { fixArticleContent } = require('@/utils/wechat-mp');
const { parseDate } = require('@/utils/parse-date');

// User-Agent string sent as the `User-Agent` request header by this route.
// - any UA containing "RSS" can pass the upstream check
// - the "(X11; Linux x86_64)" token marks the UA as a desktop UA
const UA = 'Mozilla/5.0 (X11; Linux x86_64) RSS Reader';

module.exports = async (ctx) => {
const { id } = ctx.params;

const feed = await parser.parseURL(`https://posts.careerengine.us/author/${id}/rss`);
const feed = await parser.parseString(
await got
.get(`https://posts.careerengine.us/author/${id}/rss`, {
headers: {
'User-Agent': UA,
},
})
.then((_) => _.data)
);

const items = await Promise.all(
feed.items.splice(0, 10).map(async (item) => {
const response = await got.get(item.link);
// generally speaking, changing `item.link` of an existing route could potentially break `item.guid`
// but since the route has been down for at least 8 months, it's probably safe
item.link = item.link.replace(/^http:\/\//, 'https://');
return await ctx.cache.tryGet(item.link, async () => {
const response = await got.get(item.link, {
headers: {
'User-Agent': UA,
},
});

const $ = cheerio.load(response.data);

const $ = cheerio.load(response.data);
const post = $('.post');
const description = fixArticleContent($('.post'));

post.find('img').each((_, img) => {
const dataSrc = $(img).attr('data-src');
if (dataSrc) {
$(img).attr('src', dataSrc);
let pubDate = item.pubDate;
if (!pubDate || pubDate === 'Invalid Date') {
// sometimes the pubDate is not available in the official feed
const postDate = $('.post-date')
.text()
.replace(/\s+|/g, '');
// the date format is "发表 YYYY年MM月DD日 "
// following the official feed behavior: imprecise date is in UTC
// `<pubDate>Mon, 04 Apr 2022 00:00:00 GMT</pubDate>`
pubDate = parseDate(postDate, 'YYYY年MM月DD日');
pubDate = new Date(pubDate.getTime() - pubDate.getTimezoneOffset() * 60 * 1000);
}
});

const single = {
title: item.title,
description: post.html(),
pubDate: item.pubDate,
link: item.link,
};
return Promise.resolve(single);
return {
title: item.title,
description,
pubDate,
link: item.link,
};
});
})
);

Expand Down

0 comments on commit 2dab2ad

Please sign in to comment.