# -*- coding: utf-8 -*- """ Output formatting to text via lxml xpath nodes abstracted in this file. """ __title__ = 'newspaper' __author__ = 'Lucas Ou-Yang' __license__ = 'MIT' __copyright__ = 'Copyright 2014, Lucas Ou-Yang' from html import unescape import logging from .text import innerTrim log = logging.getLogger(__name__) class OutputFormatter(object): def __init__(self, config): self.top_node = None self.config = config self.parser = self.config.get_parser() self.language = config.language self.stopwords_class = config.stopwords_class def update_language(self, meta_lang): '''Required to be called before the extraction process in some cases because the stopwords_class has to set incase the lang is not latin based ''' if meta_lang: self.language = meta_lang self.stopwords_class = \ self.config.get_stopwords_class(meta_lang) def get_top_node(self): return self.top_node def get_formatted(self, top_node): """Returns the body text of an article, and also the body article html if specified. Returns in (text, html) form """ self.top_node = top_node html, text = '', '' self.remove_negativescores_nodes() if self.config.keep_article_html: html = self.convert_to_html() self.links_to_text() self.add_newline_to_br() self.add_newline_to_li() self.replace_with_text() self.remove_empty_tags() self.remove_trailing_media_div() text = self.convert_to_text() # print(self.parser.nodeToString(self.get_top_node())) return (text, html) def convert_to_text(self): txts = [] for node in list(self.get_top_node()): try: txt = self.parser.getText(node) except ValueError as err: # lxml error log.info('%s ignoring lxml node error: %s', __title__, err) txt = None if txt: txt = unescape(txt) txt_lis = innerTrim(txt).split(r'\n') txt_lis = [n.strip(' ') for n in txt_lis] txts.extend(txt_lis) return '\n\n'.join(txts) def convert_to_html(self): cleaned_node = self.parser.clean_article_html(self.get_top_node()) return self.parser.nodeToString(cleaned_node) def add_newline_to_br(self): for e in self.parser.getElementsByTag(self.top_node, tag='br'): e.text = r'\n' def add_newline_to_li(self): for e in self.parser.getElementsByTag(self.top_node, tag='ul'): li_list = self.parser.getElementsByTag(e, tag='li') for li in li_list[:-1]: li.text = self.parser.getText(li) + r'\n' for c in self.parser.getChildren(li): self.parser.remove(c) def links_to_text(self): """Cleans up and converts any nodes that should be considered text into text. """ self.parser.stripTags(self.get_top_node(), 'a') def remove_negativescores_nodes(self): """If there are elements inside our top node that have a negative gravity score, let's give em the boot. """ gravity_items = self.parser.css_select( self.top_node, "*[gravityScore]") for item in gravity_items: score = self.parser.getAttribute(item, 'gravityScore') score = float(score) if score else 0 if score < 1: item.getparent().remove(item) def replace_with_text(self): """ Replace common tags with just text so we don't have any crazy formatting issues so replace
, , , etc.... With whatever text is inside them. code : http://lxml.de/api/lxml.etree-module.html#strip_tags """ self.parser.stripTags( self.get_top_node(), 'b', 'strong', 'i', 'br', 'sup') def remove_empty_tags(self): """It's common in top_node to exit tags that are filled with data within properties but not within the tags themselves, delete them """ all_nodes = self.parser.getElementsByTags( self.get_top_node(), ['*']) all_nodes.reverse() for el in all_nodes: tag = self.parser.getTag(el) text = self.parser.getText(el) if (tag != 'br' or text != '\\r') \ and not text \ and len(self.parser.getElementsByTag( el, tag='object')) == 0 \ and len(self.parser.getElementsByTag( el, tag='embed')) == 0: self.parser.remove(el) def remove_trailing_media_div(self): """Punish the *last top level* node in the top_node if it's DOM depth is too deep. Many media non-content links are eliminated: "related", "loading gallery", etc. It skips removal if last top level node's class is one of NON_MEDIA_CLASSES. """ NON_MEDIA_CLASSES = ('zn-body__read-all', ) def get_depth(node, depth=1): """Computes depth of an lxml element via BFS, this would be in parser if it were used anywhere else besides this method """ children = self.parser.getChildren(node) if not children: return depth max_depth = 0 for c in children: e_depth = get_depth(c, depth + 1) if e_depth > max_depth: max_depth = e_depth return max_depth top_level_nodes = self.parser.getChildren(self.get_top_node()) if len(top_level_nodes) < 3: return last_node = top_level_nodes[-1] last_node_class = self.parser.getAttribute(last_node, 'class') if last_node_class in NON_MEDIA_CLASSES: return if get_depth(last_node) >= 2: self.parser.remove(last_node)