forked from DIYgod/RSSHub
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
refactor(route): wsj news (DIYgod#12087)
* feat(route): wsj news * chore: docs update * chore: docs update * chore: use map to rewrite the function * chore: update utils * Update docs/en/traditional-media.md ---------
- Loading branch information
1 parent
6f04aff
commit 633aec3
Showing
7 changed files
with
177 additions
and
146 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
const got = require('@/utils/got'); | ||
const cheerio = require('cheerio'); | ||
const { asyncPoolAll, parseArticle } = require('./utils'); | ||
const hostMap = { | ||
'en-us': 'https://www.wsj.com', | ||
'zh-cn': 'https://cn.wsj.com/zh-hans', | ||
'zh-tw': 'https://cn.wsj.com/zh-hant', | ||
}; | ||
module.exports = async (ctx) => { | ||
const lang = ctx.params.lang; | ||
const category = ctx.params.category || ''; | ||
const host = hostMap[lang]; | ||
let subTitle = ` - ${lang.toUpperCase()}`; | ||
let url = host; | ||
if (category.length > 0) { | ||
url = `${host}/news/${category}`; | ||
subTitle = `${subTitle} - ${category}`; | ||
} | ||
const response = await got({ | ||
method: 'get', | ||
url, | ||
}); | ||
|
||
const $ = cheerio.load(response.data); | ||
const contents = $('script:contains("window.__STATE__")').text(); | ||
const data = JSON.parse(contents.match(/\{.*\}/)[0]).data; | ||
const filteredKeys = Object.entries(data) | ||
.filter(([key, value]) => { | ||
if (!key.startsWith('article')) { | ||
return false; | ||
} | ||
const link = value.data.data.url; | ||
return link.includes('wsj.com/articles/'); | ||
}) | ||
.map(([key]) => key); | ||
const list = filteredKeys.map((key) => { | ||
const item = {}; | ||
item.title = data[key].data.data.headline; | ||
item.link = data[key].data.data.url; | ||
item.test = key; | ||
return item; | ||
}); | ||
const items = await asyncPoolAll(1, list, (item) => parseArticle(item, ctx)); | ||
|
||
ctx.state.data = { | ||
title: `WSJ${subTitle}`, | ||
link: url, | ||
description: `WSJ${subTitle}`, | ||
item: items, | ||
}; | ||
}; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,3 @@ | ||
module.exports = (router) => { | ||
router.get('/:lang/:category?', require('./index')); | ||
router.get('/:lang/:category?', require('./news')); | ||
}; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
const asyncPool = require('tiny-async-pool'); | ||
const cheerio = require('cheerio'); | ||
const got = require('@/utils/got'); | ||
const { parseDate } = require('@/utils/parse-date'); | ||
const UA = require('@/utils/rand-user-agent')({ browser: 'chrome', os: 'android', device: 'mobile' }); | ||
|
||
// const chromeMobileUserAgent = 'Mozilla/5.0 (Linux; Android 7.0; SM-G892A Build/NRD90M; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/67.0.3396.87 Mobile Safari/537.36'; | ||
/**
 * Pick the listed attributes that are actually present on a parsed element.
 * Absent attributes are skipped so they never end up as the string "undefined".
 *
 * @param {Object<string, string>} attribs - Attribute map of a parsed element.
 * @param {string[]} names - Attribute names to copy, in output order.
 * @returns {Object<string, string>} The subset of `attribs` that is defined.
 */
const pickAttributes = (attribs, names) =>
    names.reduce((picked, name) => {
        if (attribs[name] !== undefined) {
            picked[name] = attribs[name];
        }
        return picked;
    }, {});

/**
 * Fetch the AMP version of one article and turn it into a feed item.
 * The result is cached through `ctx.cache.tryGet`, keyed by `item.link`.
 *
 * @param {{title: string, link: string}} item - List entry to resolve.
 * @param {object} ctx - Route context providing `cache.tryGet`.
 * @returns {Promise<object>} Feed item with title, pubDate, updated, author,
 *   link, summary, description, category, icon and logo.
 */
const parseArticle = (item, ctx) =>
    ctx.cache.tryGet(item.link, async () => {
        // Fetch the AMP version: "/amp" is inserted right after the wsj.com host.
        const url = item.link.replace(/(?<=^https:\/\/\w+\.wsj\.com)/, '/amp');
        const response = await got({
            url,
            method: 'get',
            headers: {
                'User-Agent': UA,
            },
        });
        const $ = cheerio.load(response.data);
        const content = $('.articleBody > section');

        // Cover: move the lead image to the top of the article body.
        const cover = $('.articleLead > div.is-lead-inset > div.header > .img-header > div.image-container > amp-img > img');
        if (cover.length > 0) {
            const coverAttribs = cover[0].attribs;
            // Prefer `content` as before; fall back to `src` when it is absent.
            const coverSrc = coverAttribs.content || coverAttribs.src;
            if (coverSrc) {
                // Setting the attribute through the API avoids unquoted-HTML
                // interpolation, and prepend() is a no-op on an empty body.
                content.first().prepend($('<img>').attr('src', coverSrc));
            }
            cover.remove();
        }

        // Summary
        const summary = $('head > meta[name="description"]').attr('content');

        // Metadata (dates, author, categories)
        const updatedAt = $('meta[itemprop="dateModified"]').attr('content');
        const publishedAt = $('meta[itemprop="datePublished"]').attr('content');

        // Join multiple bylines with ", " instead of running the names together.
        const author = $('.author > a[rel="author"]')
            .map((i, e) => $(e).text().trim())
            .get()
            .filter(Boolean)
            .join(', ');

        // A page without a keywords meta tag yields an empty category list.
        const keywords = $('meta[name="keywords"]').attr('content') || '';
        const categories = keywords
            .split(',')
            .map((c) => c.trim())
            .filter(Boolean);

        // Images: swap each <amp-img> for a plain <img>.
        // The caption follows the image in the markup, so it needs no handling.
        content.find('amp-img').each((i, e) => {
            $(e).replaceWith($('<img>').attr(pickAttributes(e.attribs, ['width', 'height', 'src', 'alt'])));
        });

        // iframes (youtube videos and interactive elements)
        content.find('amp-iframe').each((i, e) => {
            $(e).replaceWith($('<iframe>').attr(pickAttributes(e.attribs, ['width', 'height', 'src'])));
        });

        // Remove unwanted DOMs
        const unwantedSelectors = ['amp-ad', '.wsj-ad'];
        content.find(unwantedSelectors.join(', ')).remove();

        // Paywall: unwrap each .paywall container, keeping its children in place.
        content.find('.paywall').each((i, e) => {
            $(e.childNodes).insertBefore(e);
            $(e).remove();
        });

        return {
            title: item.title,
            pubDate: parseDate(publishedAt),
            updated: parseDate(updatedAt),
            author,
            link: item.link,
            summary,
            description: content.html(),
            category: categories,
            icon: 'https://s.wsj.net/media/wsj_launcher-icon-4x.png',
            logo: 'https://vir.wsj.net/fp/assets/webpack4/img/wsj-logo-big-black.165e51cc.svg',
        };
    });
|
||
/**
 * Drain `asyncPool` into an array.
 *
 * Every argument is forwarded untouched to `asyncPool`; each value it yields
 * is appended to the returned array in the order it is yielded.
 *
 * @param {...*} poolArgs - Arguments passed straight through to `asyncPool`.
 * @returns {Promise<Array>} All yielded values.
 */
const asyncPoolAll = async (...poolArgs) => {
    const collected = [];
    for await (const value of asyncPool(...poolArgs)) {
        collected[collected.length] = value;
    }
    return collected;
};

module.exports = { asyncPoolAll, parseArticle };