Skip to content

Commit

Permalink
refactor(route): wsj news (DIYgod#12087)
Browse files Browse the repository at this point in the history
* feat(route): wsj news

* chore: docs update

* chore: docs update

* chore: use map to rewrite the function

* chore: update utils

* Update docs/en/traditional-media.md
---------
  • Loading branch information
EthanWng97 authored Mar 13, 2023
1 parent 6f04aff commit 633aec3
Show file tree
Hide file tree
Showing 7 changed files with 177 additions and 146 deletions.
12 changes: 11 additions & 1 deletion docs/en/traditional-media.md
Original file line number Diff line number Diff line change
Expand Up @@ -678,7 +678,17 @@ Provides all of the articles by the specified New York Times author.

### News

<RouteEn author="oppilate" example="/wsj/en-us/opinion" path="/wsj/:lang/:category?" :paramsDesc="['Language, `en-us`, `zh-cn`, `zh-tw` are supported', 'Category, only supported in `en-us`. Supports `opinion`, `world_news`, `us_bussiness`, `market_news`, `technology`, `lifestyle`.']">
<RouteEn author="oppilate NavePnow" example="/wsj/en-us/opinion" path="/wsj/:lang/:category?" :paramsDesc="['Language, `en-us`, `zh-cn`, `zh-tw`', 'Category. See below']">

en-us
| World | U.S. | Politics | Economy | Business | Tech | Markets | Opinion | Books & Arts | Real Estate | Life & Work | Style | Sports |
| ------ | ------- | -------- | -------- | ----- | --------- | --------- | --------- | --------- | --------- |--------- | --------- | --------- |
| world | us | politics | economy | business | technology | markets | opinion | books-arts | realestate | life-work | style-entertainment | sports |

zh-cn / zh-tw
| 国际 | 中国 | 金融市场 | 经济 | 商业 | 科技 | 派 | 专栏与观点 |
| ------ | ------- | -------- | -------- | ----- | --------- | --------- | --------- |
| world | china | markets | economy | business | technology | life-arts | opinion |

Provide full article RSS for WSJ topics.

Expand Down
14 changes: 13 additions & 1 deletion docs/traditional-media.md
Original file line number Diff line number Diff line change
Expand Up @@ -1182,7 +1182,19 @@ IT・科学 tech_science

### 新闻

<Route author="oppilate" example="/wsj/en-us/opinion" path="/wsj/:lang/:category?" :paramsDesc="['语言,支持 `en-us`、`zh-cn`、`zh-tw`', '分类,仅 `en-us` 支持分类订阅。支持 `opinion`, `world_news`, `us_bussiness`, `market_news`, `technology`, `lifestyle`。']">
<Route author="oppilate NavePnow" example="/wsj/en-us/opinion" path="/wsj/:lang/:category?" :paramsDesc="['语言,支持 `en-us`、`zh-cn`、`zh-tw`', '分类,见下表']">

en-us

| World | U.S. | Politics | Economy | Business | Tech | Markets | Opinion | Books & Arts | Real Estate | Life & Work | Style | Sports |
| ----- | ---- | -------- | ------- | -------- | ---------- | ------- | ------- | ------------ | ----------- | ----------- | ------------------- | ------ |
| world | us | politics | economy | business | technology | markets | opinion | books-arts | realestate | life-work | style-entertainment | sports |

zh-cn / zh-tw

| 国际 | 中国 | 金融市场 | 经济 | 商业 | 科技 | 派 | 专栏与观点 |
| ----- | ----- | ------- | ------- | -------- | ---------- | --------- | ------- |
| world | china | markets | economy | business | technology | life-arts | opinion |

通过提取文章全文,以提供比官方源更佳的阅读体验。

Expand Down
141 changes: 0 additions & 141 deletions lib/v2/wsj/index.js

This file was deleted.

51 changes: 51 additions & 0 deletions lib/v2/wsj/news.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
const got = require('@/utils/got');
const cheerio = require('cheerio');
const { asyncPoolAll, parseArticle } = require('./utils');
const hostMap = {
'en-us': 'https://www.wsj.com',
'zh-cn': 'https://cn.wsj.com/zh-hans',
'zh-tw': 'https://cn.wsj.com/zh-hant',
};
module.exports = async (ctx) => {
const lang = ctx.params.lang;
const category = ctx.params.category || '';
const host = hostMap[lang];
let subTitle = ` - ${lang.toUpperCase()}`;
let url = host;
if (category.length > 0) {
url = `${host}/news/${category}`;
subTitle = `${subTitle} - ${category}`;
}
const response = await got({
method: 'get',
url,
});

const $ = cheerio.load(response.data);
const contents = $('script:contains("window.__STATE__")').text();
const data = JSON.parse(contents.match(/\{.*\}/)[0]).data;
const filteredKeys = Object.entries(data)
.filter(([key, value]) => {
if (!key.startsWith('article')) {
return false;
}
const link = value.data.data.url;
return link.includes('wsj.com/articles/');
})
.map(([key]) => key);
const list = filteredKeys.map((key) => {
const item = {};
item.title = data[key].data.data.headline;
item.link = data[key].data.data.url;
item.test = key;
return item;
});
const items = await asyncPoolAll(1, list, (item) => parseArticle(item, ctx));

ctx.state.data = {
title: `WSJ${subTitle}`,
link: url,
description: `WSJ${subTitle}`,
item: items,
};
};
4 changes: 2 additions & 2 deletions lib/v2/wsj/radar.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,15 @@ module.exports = {
{
title: '新闻',
docs: 'https://docs.rsshub.app/traditional-media.html#hua-er-jie-ri-bao-the-wall-street-journal-wsj',
souce: '/',
source: '/',
target: '/wsj/zh-cn',
},
],
www: [
{
title: '新闻',
docs: 'https://docs.rsshub.app/traditional-media.html#hua-er-jie-ri-bao-the-wall-street-journal-wsj',
souce: '/',
source: '/',
target: '/wsj/en-us',
},
],
Expand Down
2 changes: 1 addition & 1 deletion lib/v2/wsj/router.js
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
// Registers the WSJ route pattern `/:lang/:category?` on the router passed in by the caller.
module.exports = (router) => {
// NOTE(review): this listing reports lib/v2/wsj/index.js as deleted, so the './index'
// registration below looks like the pre-change side of the diff — confirm that only
// the './news' registration exists in the real file.
router.get('/:lang/:category?', require('./index'));
router.get('/:lang/:category?', require('./news'));
};
99 changes: 99 additions & 0 deletions lib/v2/wsj/utils.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
const asyncPool = require('tiny-async-pool');
const cheerio = require('cheerio');
const got = require('@/utils/got');
const { parseDate } = require('@/utils/parse-date');
const UA = require('@/utils/rand-user-agent')({ browser: 'chrome', os: 'android', device: 'mobile' });

// const chromeMobileUserAgent = 'Mozilla/5.0 (Linux; Android 7.0; SM-G892A Build/NRD90M; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/67.0.3396.87 Mobile Safari/537.36';
/**
 * Swaps an AMP element for a plain HTML element carrying the same attributes.
 * Attributes the AMP element does not have are skipped instead of being written
 * out as the literal string "undefined"; values are set through cheerio so they
 * are escaped on serialisation.
 *
 * @param {Function} $ - cheerio instance the element belongs to.
 * @param {object} ampElement - The AMP DOM node to replace.
 * @param {string} tagName - Tag name of the replacement element.
 * @param {string[]} attributeNames - Attributes to copy over, in output order.
 */
const replaceAmpElement = ($, ampElement, tagName, attributeNames) => {
    const replacement = $(`<${tagName}>`);
    attributeNames.forEach((name) => {
        const value = ampElement.attribs[name];
        if (value !== undefined) {
            replacement.attr(name, value);
        }
    });
    $(ampElement).replaceWith(replacement);
};

/**
 * Fetches the AMP version of an article and turns it into a feed item.
 * The result is memoised through `ctx.cache.tryGet`, keyed by the article link.
 *
 * @param {{title: string, link: string}} item - Headline and canonical article URL.
 * @param {object} ctx - Request context providing `ctx.cache.tryGet`.
 * @returns {Promise<object>} Feed item with title, dates, author, link, summary,
 *     description (article body HTML), category, icon and logo.
 */
const parseArticle = (item, ctx) =>
    ctx.cache.tryGet(item.link, async () => {
        // Fetch the AMP version: insert `/amp` right after the `*.wsj.com` origin
        const url = item.link.replace(/(?<=^https:\/\/\w+\.wsj\.com)/, '/amp');
        const response = await got({
            url,
            method: 'get',
            headers: {
                'User-Agent': UA,
            },
        });
        const html = response.data;
        const $ = cheerio.load(html);
        const content = $('.articleBody > section');

        // Cover: move the lead image to the top of the article body
        const cover = $('.articleLead > div.is-lead-inset > div.header > .img-header > div.image-container > amp-img > img');
        if (cover.length > 0) {
            // NOTE(review): the image URL is read from the `content` attribute, as before — confirm against the AMP markup
            const coverSrc = cover.attr('content');
            if (coverSrc !== undefined && content.length > 0) {
                content.first().prepend($('<img>').attr('src', coverSrc));
            }
            cover.remove();
        }

        // Summary
        const summary = $('head > meta[name="description"]').attr('content');

        // Metadata (categories & updatedAt)
        const updatedAt = $('meta[itemprop="dateModified"]').attr('content');
        const publishedAt = $('meta[itemprop="datePublished"]').attr('content');
        const author = $('.author > a[rel="author"]').text();

        // A missing keywords tag yields no categories instead of a TypeError
        const keywords = $('meta[name="keywords"]').attr('content') || '';
        const categories = keywords
            .split(',')
            .map((c) => c.trim())
            .filter(Boolean);

        // Images: the caption follows the element in the markup, so only the tag itself is converted
        content.find('amp-img').each((i, e) => {
            replaceAmpElement($, e, 'img', ['width', 'height', 'src', 'alt']);
        });

        // iframes (youtube videos and interactive elements)
        content.find('amp-iframe').each((i, e) => {
            replaceAmpElement($, e, 'iframe', ['width', 'height', 'src']);
        });

        // Remove unwanted DOMs
        content.find('amp-ad, .wsj-ad').remove();

        // Paywall: keep the wrapped nodes, drop the `.paywall` wrapper itself
        content.find('.paywall').each((i, e) => {
            $(e.childNodes).insertBefore(e);
            $(e).remove();
        });

        return {
            title: item.title,
            pubDate: parseDate(publishedAt),
            updated: parseDate(updatedAt),
            author,
            link: item.link,
            summary,
            description: content.html(),
            category: categories,
            icon: 'https://s.wsj.net/media/wsj_launcher-icon-4x.png',
            logo: 'https://vir.wsj.net/fp/assets/webpack4/img/wsj-logo-big-black.165e51cc.svg',
        };
    });

/**
 * Runs `asyncPool` with the given arguments and gathers everything it yields.
 *
 * @param {...*} args - Forwarded unchanged to `asyncPool`.
 * @returns {Promise<Array>} All yielded values, in the order they were produced.
 */
const asyncPoolAll = async (...args) => {
    const iterator = asyncPool(...args)[Symbol.asyncIterator]();
    const collected = [];
    let step = await iterator.next();
    while (!step.done) {
        collected.push(step.value);
        step = await iterator.next();
    }
    return collected;
};
// Helpers shared by the WSJ routes: `asyncPoolAll` collects the results of an
// asyncPool run into an array, `parseArticle` builds a feed item from an article link.
module.exports = {
asyncPoolAll,
parseArticle,
};

0 comments on commit 633aec3

Please sign in to comment.