forked from codelucas/newspaper
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathoutputformatters.py
175 lines (146 loc) · 5.95 KB
/
outputformatters.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
# -*- coding: utf-8 -*-
"""
Output formatting to text via lxml xpath nodes abstracted in this file.
"""
__title__ = 'newspaper'
__author__ = 'Lucas Ou-Yang'
__license__ = 'MIT'
__copyright__ = 'Copyright 2014, Lucas Ou-Yang'
from html import unescape
import logging
from .text import innerTrim
log = logging.getLogger(__name__)
class OutputFormatter(object):
    """Turns an article's cleaned lxml "top node" into plain body text
    and, when configured, into article HTML.

    All DOM access goes through ``self.parser`` — a project-supplied
    lxml wrapper obtained from the config — so nothing here touches
    lxml directly.

    Implementation convention: ``add_newline_to_br`` and
    ``add_newline_to_li`` insert the LITERAL two-character sequence
    backslash + ``n`` (written ``r'\n'``) as a line-break sentinel,
    and ``convert_to_text`` later splits on that same literal. It is
    not a real newline character.
    """

    def __init__(self, config):
        # ``config`` is the project's Configuration object; it supplies
        # the parser wrapper plus language / stopword settings.
        self.top_node = None  # set per-article by get_formatted()
        self.config = config
        self.parser = self.config.get_parser()
        self.language = config.language
        self.stopwords_class = config.stopwords_class

    def update_language(self, meta_lang):
        '''Required to be called before the extraction process in some
        cases because the stopwords_class has to be reset when the
        detected language is not latin based.
        '''
        if meta_lang:
            self.language = meta_lang
            self.stopwords_class = \
                self.config.get_stopwords_class(meta_lang)

    def get_top_node(self):
        # Accessor for the node most recently passed to get_formatted().
        return self.top_node

    def get_formatted(self, top_node):
        """Returns the body text of an article, and also the body article
        html if specified. Returns in (text, html) form.

        The passes below mutate ``top_node`` in place and their order
        matters: the HTML snapshot is taken right after negative-score
        pruning but BEFORE the destructive text-flattening steps.
        """
        self.top_node = top_node
        html, text = '', ''
        self.remove_negativescores_nodes()
        if self.config.keep_article_html:
            html = self.convert_to_html()
        self.links_to_text()
        self.add_newline_to_br()
        self.add_newline_to_li()
        self.replace_with_text()
        self.remove_empty_tags()
        self.remove_trailing_media_div()
        text = self.convert_to_text()
        # print(self.parser.nodeToString(self.get_top_node()))
        return (text, html)

    def convert_to_text(self):
        """Collect the text of each direct child of the top node and
        join the resulting pieces with blank lines.
        """
        txts = []
        # Iterating the node yields its direct children (lxml protocol).
        for node in list(self.get_top_node()):
            try:
                txt = self.parser.getText(node)
            except ValueError as err:  # lxml error
                log.info('%s ignoring lxml node error: %s', __title__, err)
                txt = None
            if txt:
                txt = unescape(txt)
                # Split on the literal backslash-n sentinel inserted by
                # add_newline_to_br() / add_newline_to_li() — NOT a real
                # newline character.
                txt_lis = innerTrim(txt).split(r'\n')
                txt_lis = [n.strip(' ') for n in txt_lis]
                txts.extend(txt_lis)
        return '\n\n'.join(txts)

    def convert_to_html(self):
        # Serialize a cleaned version of the top node back to markup.
        cleaned_node = self.parser.clean_article_html(self.get_top_node())
        return self.parser.nodeToString(cleaned_node)

    def add_newline_to_br(self):
        # Replace each <br>'s text with the literal r'\n' sentinel that
        # convert_to_text() later splits on.
        for e in self.parser.getElementsByTag(self.top_node, tag='br'):
            e.text = r'\n'

    def add_newline_to_li(self):
        # Flatten every non-final <li> of each <ul> to its own text plus
        # the r'\n' sentinel, discarding the <li>'s child elements.
        # NOTE(review): the last <li> of each list is deliberately left
        # untouched (no trailing sentinel) — confirm against upstream.
        for e in self.parser.getElementsByTag(self.top_node, tag='ul'):
            li_list = self.parser.getElementsByTag(e, tag='li')
            for li in li_list[:-1]:
                li.text = self.parser.getText(li) + r'\n'
                for c in self.parser.getChildren(li):
                    self.parser.remove(c)

    def links_to_text(self):
        """Cleans up and converts any nodes that should be considered
        text into text (strips <a> tags, keeping their text content).
        """
        self.parser.stripTags(self.get_top_node(), 'a')

    def remove_negativescores_nodes(self):
        """If there are elements inside our top node that have a
        negative gravity score, let's give em the boot.
        """
        gravity_items = self.parser.css_select(
            self.top_node, "*[gravityScore]")
        for item in gravity_items:
            score = self.parser.getAttribute(item, 'gravityScore')
            # An empty attribute value is treated as score 0, which also
            # falls below the threshold and is removed.
            score = float(score) if score else 0
            if score < 1:
                item.getparent().remove(item)

    def replace_with_text(self):
        """
        Replace common tags with just text so we don't have any crazy
        formatting issues so replace <br>, <i>, <strong>, etc....
        With whatever text is inside them.
        code : http://lxml.de/api/lxml.etree-module.html#strip_tags
        """
        self.parser.stripTags(
            self.get_top_node(), 'b', 'strong', 'i', 'br', 'sup')

    def remove_empty_tags(self):
        """It's common in top_node to exit tags that are filled with data
        within properties but not within the tags themselves, delete them
        """
        all_nodes = self.parser.getElementsByTags(
            self.get_top_node(), ['*'])
        # Reverse so descendants are visited (and removed) before their
        # ancestors.
        all_nodes.reverse()
        for el in all_nodes:
            tag = self.parser.getTag(el)
            text = self.parser.getText(el)
            # NOTE(review): '\\r' is the literal two characters
            # backslash + r. Because the branch also requires
            # ``not text``, the first clause is always true here —
            # this looks like a dead guard (perhaps meant to spare the
            # r'\n' <br> sentinels); confirm intent against upstream.
            if (tag != 'br' or text != '\\r') \
                    and not text \
                    and len(self.parser.getElementsByTag(
                        el, tag='object')) == 0 \
                    and len(self.parser.getElementsByTag(
                        el, tag='embed')) == 0:
                self.parser.remove(el)

    def remove_trailing_media_div(self):
        """Punish the *last top level* node in the top_node if it's
        DOM depth is too deep. Many media non-content links are
        eliminated: "related", "loading gallery", etc. It skips removal if
        last top level node's class is one of NON_MEDIA_CLASSES.
        """
        NON_MEDIA_CLASSES = ('zn-body__read-all', )

        def get_depth(node, depth=1):
            """Computes the depth of an lxml element via recursive
            descent (a DFS, despite what earlier comments claimed);
            this would live in the parser if it were used anywhere
            else besides this method.
            """
            children = self.parser.getChildren(node)
            if not children:
                return depth
            max_depth = 0
            for c in children:
                e_depth = get_depth(c, depth + 1)
                if e_depth > max_depth:
                    max_depth = e_depth
            return max_depth

        top_level_nodes = self.parser.getChildren(self.get_top_node())
        # Too few children to safely judge the last one as trailing
        # media — leave everything in place.
        if len(top_level_nodes) < 3:
            return
        last_node = top_level_nodes[-1]
        last_node_class = self.parser.getAttribute(last_node, 'class')
        if last_node_class in NON_MEDIA_CLASSES:
            return
        # Depth >= 2 means the last node has any child elements at all.
        if get_depth(last_node) >= 2:
            self.parser.remove(last_node)