Skip to content

Commit

Permalink
Improved the Clean HTML function
Browse files Browse the repository at this point in the history
  • Loading branch information
enricoros committed Jan 12, 2025
1 parent 50ede8b commit 4aa0d4a
Showing 1 changed file with 151 additions and 23 deletions.
174 changes: 151 additions & 23 deletions src/modules/browse/browse.router.ts
Original file line number Diff line number Diff line change
Expand Up @@ -286,33 +286,161 @@ async function workerPuppeteer(
}


function cleanHtml(html: string) {
const $ = cheerioLoad(html);
function cleanHtml(html: string): string {
try {
const _C = cheerioLoad(html);

// 1. --unwanted elements
const unwantedSelectors = [
// core unwanted
'script', 'style', 'link', 'noscript', 'iframe', 'svg', 'canvas',

// navigation and structural elements
'nav:not(main nav)', 'aside', 'footer:not(article footer)',

// common web clutter
'.ad, .ads, .advertisement, .banner, .popup, .modal, .overlay',
'.cookie-banner, .newsletter-signup, .social-share, .comments',
'.sidebar, .widget, .carousel, .slider',

// hidden elements
'[aria-hidden="true"]',
'[hidden]',
'[style*="display: none"]',
'[style*="visibility: hidden"]',

// tracking and analytics
'[data-analytics]',
'[data-tracking]',
'[data-gtm]',

// meta elements except essential ones
'meta:not([charset], [name="viewport"], [name="description"])',
].join(', ');
_C(unwantedSelectors).remove();

// 2. --unwanted attributes tag-specific
const tagSpecificAttrs: Record<string, string[]> = {
a: ['href', 'title', 'rel'],
img: ['src', 'alt', 'title', 'width', 'height'],
video: ['src', 'controls', 'width', 'height'],
audio: ['src', 'controls'],
source: ['src', 'type'],
meta: ['charset', 'name', 'content', 'viewport'],
time: ['datetime'],
input: ['type', 'name', 'value', 'checked', 'disabled'],
button: ['type', 'disabled'],
th: ['scope', 'colspan', 'rowspan'],
td: ['colspan', 'rowspan'],
table: ['summary'],
figure: ['role'],
figcaption: [],
};
const commonAttrs = ['id', 'lang'];
_C('*').each(function() {
const el = _C(this);
if (!('tagName' in this)) return;
const tagName = this.tagName?.toLowerCase() || '';

// Get allowed attributes for this tag
const allowedAttrs = new Set([
...(tagSpecificAttrs[tagName] || []),
...commonAttrs,
]);

// -all non-allowed attributes
const attribs = Object.keys(this.attribs || {});
attribs.forEach(attr => {
if (!allowedAttrs.has(attr.toLowerCase()))
el.removeAttr(attr);
});

// Remove standard unwanted elements
$('script, style, nav, aside, noscript, iframe, svg, canvas, .ads, .comments, link[rel="stylesheet"]').remove();
// cleanup href attributes on anchors
if (tagName === 'a') {
const href = el.attr('href');
if (href) {
// -javascript: links
if (href.toLowerCase().startsWith('javascript:'))
el.removeAttr('href');
// -tracking parameters
else if (href.includes('?')) {
try {
const url = new URL(href);
const cleanParams = new URLSearchParams();
url.searchParams.forEach((value, key) => {
// keep only essential query parameters
if (!key.match(/^(utm_|fbclid|gclid|msclkid)/i))
cleanParams.append(key, value);
});
const cleanHref = `${url.origin}${url.pathname}${
cleanParams.toString() ? '?' + cleanParams.toString() : ''
}${url.hash}`;
el.attr('href', cleanHref);
} catch (e) {
// If URL parsing fails, keep original href
}
}
}
}
});

// Remove elements that might be specific to proxy services or injected by them
$('[id^="brightdata-"], [class^="brightdata-"]').remove();
// 3. --comments
_C('*').contents().filter(function() {
return this.type === 'comment';
}).remove();

// 4. --empty element
const preserveTags = new Set([
'img', 'br', 'hr', 'input', 'source', 'meta', 'link',
'area', 'base', 'col', 'embed', 'param', 'track', 'wbr',
]);
_C('*').each(function() {
const $el = _C(this);
if (!('tagName' in this)) return;
const tagName = this.tagName?.toLowerCase() || '';
const hasContent =
$el.text().trim() ||
$el.find('img, video, audio, iframe, canvas, svg').length ||
preserveTags.has(tagName) ||
(tagName === 'a' && $el.attr('href'));

if (!hasContent && !$el.children().length)
$el.remove();
});

// Remove comments
$('*').contents().filter(function() {
return this.type === 'comment';
}).remove();
// 5. simplify nested structure
_C('div > div:only-child, section > section:only-child').each(function() {
const $parent = _C(this).parent();
if ($parent.children().length === 1)
$parent.replaceWith(_C(this));
});

// Remove empty elements
$('p, div, span').each(function() {
if ($(this).text().trim() === '' && $(this).children().length === 0) {
$(this).remove();
}
});
// 6. div to paragraph conversion
_C('div').each(function() {
const $div = _C(this);
const hasBlockElements = $div.children('div, p, section, article, aside, header, footer, nav').length > 0;
if (!hasBlockElements && $div.text().trim())
$div.replaceWith(`<p>${$div.html()}</p>`);
});

// Merge consecutive paragraphs
$('p + p').each(function() {
$(this).prev().append(' ' + $(this).text());
$(this).remove();
});
// 7. clean up whitespace
_C('*').each(function() {
if (this.type === 'text') {
const text = _C(this).text().trim().replace(/\s+/g, ' ');
if (text) _C(this).text(text);
}
});

// Return the cleaned HTML
return $.html();
// 8. format final output
return _C.html()
.replace(/>\s+</g, '>\n<')
.replace(/\n\s+/g, '\n')
.replace(/^\s+|\s+$/gm, '')
.replace(/\n{3,}/g, '\n\n')
.trim();

} catch (error) {
console.error('HTML cleaning error:', error);
return html; // Return original if cleaning fails
}
}

0 comments on commit 4aa0d4a

Please sign in to comment.