"""Functions for manipulating wiki-text."""
#
# (C) Pywikibot team, 2008-2024
#
# Distributed under the terms of the MIT license.
#
from __future__ import annotations
import itertools
import re
from collections import OrderedDict
from collections.abc import Sequence
from contextlib import suppress
from html.parser import HTMLParser
from typing import NamedTuple
import pywikibot
from pywikibot.backports import Callable, Container, Iterable, Match
from pywikibot.backports import OrderedDict as OrderedDictType
from pywikibot.backports import Pattern
from pywikibot.backports import Sequence as SequenceType
from pywikibot.backports import pairwise
from pywikibot.exceptions import InvalidTitleError, SiteDefinitionError
from pywikibot.family import Family
from pywikibot.time import TZoneFixedOffset
from pywikibot.tools import (
ModuleDeprecationWrapper,
deprecated,
deprecated_args,
first_lower,
first_upper,
)
from pywikibot.userinterfaces.transliteration import NON_LATIN_DIGITS
try:
import wikitextparser
except ImportError:
import mwparserfromhell as wikitextparser
# cache for replaceExcept to avoid recompiling regexes on each call
_regex_cache: dict[str, Pattern[str]] = {}
# The regex below collects nested templates, providing simpler
# identification of templates used at the top-level of wikitext.
# It doesn't match {{{1|...}}}, and it also does not match templates
# with a numerical name, e.g. {{1|..}}. It will correctly match {{{x}}
# as being {{x}} with the leading '{' left in the wikitext.
# Prefix msg: is not included in the 'name' group, but all others are
# included for backwards compatibility with TEMP_REGEX.
# Only parser functions using # are excluded.
# When more than two levels of templates are found, this regex will
# capture from the beginning of the first {{ to the end of the last }},
# with wikitext between templates as part of the parameters of the first
# template in the wikitext.
# This ensures it falls back to a safe mode for replaceExcept, as it
# ensures that any replacement will not occur within template text.
NESTED_TEMPLATE_REGEX = re.compile(r"""
{{\s*(?:msg:\s*)?
(?P<name>[^{\|#0-9][^{\|#]*?)\s*
(?:\|(?P<params> [^{]*?
(({{{[^{}]+?}}}
|{{[^{}]+?}}
|{[^{}]*?}
) [^{]*?
)*?
)?
)?
}}
|
(?P<unhandled_depth>{{\s*[^{\|#0-9][^{\|#]*?\s* [^{]* {{ .* }})
""", re.VERBOSE | re.DOTALL)
# The following regex supports wikilinks anywhere after the first pipe
# and correctly matches the end of the file link if the wikilink contains
# [[ or ]].
# The namespace names must be substituted into this regex.
# e.g. FILE_LINK_REGEX % 'File'
# or FILE_LINK_REGEX % '|'.join(site.namespaces[6])
FILE_LINK_REGEX = r"""
\[\[\s*
(?:%s) # namespace aliases
\s*:
(?=(?P<filename>
[^]|]*
))(?P=filename)
(
\|
(
(
(?=(?P<inner_link>
\[\[.*?\]\]
))(?P=inner_link)
)?
(?=(?P<other_chars>
[^\[\]]*
))(?P=other_chars)
|
(?=(?P<not_wikilink>
\[[^]]*\]
))(?P=not_wikilink)
)*?
)??
\]\]
"""
# Used in TimeStripper. If a timestamp-like line has gaps longer than
# this between year, month, etc., the line will not be considered to
# contain a timestamp.
TIMESTAMP_GAP_LIMIT = 10
def to_local_digits(phrase: str | int, lang: str) -> str:
"""
Change Latin digits to a localized version based on the language.
Be aware that this function only works for a limited set of languages,
and that it returns an unchanged string if an unsupported language is
given.
.. versionchanged:: 7.5
always return a string even if `phrase` is an int.
:param phrase: The phrase whose digits should be localized
:param lang: language code
:return: The localized version
"""
digits = NON_LATIN_DIGITS.get(lang)
phrase = str(phrase)
if digits:
trans = str.maketrans('0123456789', digits)
phrase = phrase.translate(trans)
return phrase
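# Example sketch, assuming 'fa' maps to the Persian digit set in
# NON_LATIN_DIGITS:
#   >>> to_local_digits(123, 'fa')
#   '۱۲۳'
#   >>> to_local_digits('42', 'xx')  # unsupported code: unchanged
#   '42'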
def to_latin_digits(phrase: str,
langs: SequenceType[str] | str | None = None) -> str:
"""Change non-latin digits to latin digits.
.. versionadded:: 7.0
:param phrase: The phrase to convert to latin numerical.
:param langs: Language codes. If langs parameter is None, use all
known languages to convert.
:return: The string with latin digits
"""
if langs is None:
langs = NON_LATIN_DIGITS.keys()
elif isinstance(langs, str):
langs = [langs]
digits = [NON_LATIN_DIGITS[key] for key in langs
if key in NON_LATIN_DIGITS]
if digits:
trans = str.maketrans(''.join(digits), '0123456789' * len(digits))
phrase = phrase.translate(trans)
return phrase
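# Example sketch, assuming Persian digits are registered for 'fa':
#   >>> to_latin_digits('۱۲۳', 'fa')
#   '123'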
def case_escape(case: str, string: str, *, underscore: bool = False) -> str:
"""Return an escaped regex pattern which depends on 'first-letter' case.
.. versionadded:: 7.0
.. versionchanged:: 8.4
Added the optional *underscore* parameter.
:param case: if `case` is 'first-letter', the regex contains an
inline re.IGNORECASE flag for the first letter
:param underscore: if True, expand the regex to detect spaces and
underscores which are interchangeable and collapsible
"""
if case == 'first-letter':
pattern = f'(?i:{string[:1]}){re.escape(string[1:])}'
else:
pattern = re.escape(string)
if underscore:
pattern = re.sub(r'_|\\ ', '[_ ]+', pattern)
return pattern
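# Example sketch (re.escape() escapes the space, which the underscore
# substitution above relies on):
#   >>> case_escape('first-letter', 'Foo bar', underscore=True)
#   '(?i:F)oo[_ ]+bar'
#   >>> case_escape('case-sensitive', 'Foo')
#   'Foo'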
class MultiTemplateMatchBuilder:
"""Build template matcher."""
def __init__(self, site) -> None:
"""Initializer."""
self.site = site
def pattern(self, template, flags=re.DOTALL):
"""Return a compiled regex to match template."""
# TODO: add ability to also match contents within the template
# TODO: add option for template to be None to match any template
# TODO: merge regex with NESTED_TEMPLATE_REGEX
namespace = self.site.namespaces[10]
if isinstance(template, pywikibot.Page):
if template.namespace() == 10:
old = template.title(with_ns=False)
else:
raise ValueError(
f'{template} is not a template Page object')
elif isinstance(template, str):
old = template
else:
raise ValueError(
f'{template!r} is not a valid template')
pattern = case_escape(namespace.case, old)
# namespaces may be in any mixed case
namespaces = [ignore_case(ns) for ns in namespace]
namespaces.append(ignore_case('msg'))
pattern = re.sub(r'_|\\ ', r'[_ ]', pattern)
templateRegexP = (
r'{{\s*(%(namespace)s:)?%(pattern)s'
r'(?P<parameters>\s*\|[^{]+?'
r'((({{{[^{}]+?}}}|{{[^{}]+?}}|{[^{}]*?})[^{]*?)*?)?'
r'|)\s*}}'
) % {'namespace': ':|'.join(namespaces), 'pattern': pattern}
templateRegex = re.compile(templateRegexP, flags)
return templateRegex
def search_any_predicate(self, templates):
"""Return a predicate that matches any template."""
predicates = [self.pattern(template).search for template in templates]
return lambda text: any(predicate(text) for predicate in predicates)
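# Usage sketch (assumes a working site connection; the template names
# are hypothetical):
#   site = pywikibot.Site('en', 'wikipedia')
#   builder = MultiTemplateMatchBuilder(site)
#   found = builder.search_any_predicate(['Stub', 'Cleanup'])
#   found(text)  # True if text transcludes either template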
def ignore_case(string: str) -> str:
"""Return a case-insensitive pattern for the string.
.. versionchanged:: 7.2
`_ignore_case` becomes a public method
"""
return ''.join(
f'[{c}{s}]' if c != s else c
for s, c in zip(string, string.swapcase()))
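# Example (derivable from the code above):
#   >>> ignore_case('msg')
#   '[Mm][Ss][Gg]'
#   >>> ignore_case('p1')  # characters without case are kept as-is
#   '[Pp]1'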
def _tag_pattern(tag_name: str) -> str:
"""Return a tag pattern for the given tag name."""
return (
r'<{0}(?:>|\s+[^>]*(?<!/)>)' # start tag
r'[\s\S]*?' # contents
r'</{0}\s*>' # end tag
.format(ignore_case(tag_name)))
def _tag_regex(tag_name: str):
"""Return a compiled tag regex for the given tag name."""
return re.compile(_tag_pattern(tag_name))
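# Example sketch:
#   >>> bool(_tag_regex('nowiki').search('<nowiki>[[x]]</nowiki>'))
#   True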
def _create_default_regexes() -> None:
"""Fill (and possibly overwrite) ``_regex_cache`` with default regexes.
The following keys are provided: ``category``, ``comment``, ``file``,
``header``, ``hyperlink``, ``interwiki``, ``invoke``, ``link``,
``pagelist``, ``property``, ``startcolon``, ``startspace``, ``table``,
``template``.
:meta public:
"""
_regex_cache.update({
# categories
'category': (r'\[\[ *(?:%s)\s*:.*?\]\]',
lambda site: '|'.join(site.namespaces[14])),
'comment': re.compile(r'<!--[\s\S]*?-->'),
# files
'file': (FILE_LINK_REGEX, lambda site: '|'.join(site.namespaces[6])),
# section headers
'header': re.compile(
r'(?:(?<=\n)|\A)(?:<!--[\s\S]*?-->)*'
r'(=(?:[^\n]|<!--[\s\S]*?-->)+=)'
r' *(?:<!--[\s\S]*?--> *)*(?=\n|\Z)'),
# external links
'hyperlink': compileLinkR(),
# also finds links to foreign sites with a leading ":"
'interwiki': (
r'\[\[:?(%s)\s?:[^\]]*\]\]\s*',
lambda site: '|'.join(
ignore_case(i) for i in site.validLanguageLinks()
+ list(site.family.obsolete.keys()))),
# Module invocations (currently only Lua)
'invoke': (
r'\{\{\s*\#(?:%s):[\s\S]*?\}\}',
lambda site: '|'.join(
ignore_case(mw) for mw in site.getmagicwords('invoke'))),
# this matches internal wikilinks, but also interwiki, categories, and
# images.
'link': re.compile(r'\[\[[^\]|]*(\|[^\]]*)?\]\]'),
# pagelist tag (used in Proofread extension).
'pagelist': re.compile(r'<{}[\s\S]*?/>'
.format(ignore_case('pagelist'))),
# Wikibase property inclusions
'property': (
r'\{\{\s*\#(?:%s):\s*[Pp]\d+.*?\}\}',
lambda site: '|'.join(
ignore_case(mw) for mw in site.getmagicwords('property'))),
# lines that start with a colon or more will be indented
'startcolon': re.compile(r'(?:(?<=\n)|\A):(.*?)(?=\n|\Z)'),
# lines that start with a space are shown in a monospace font and
# have whitespace preserved.
'startspace': re.compile(r'(?:(?<=\n)|\A) (.*?)(?=\n|\Z)'),
# tables often have whitespace that is used to improve wiki
# source code readability.
# TODO: handle nested tables.
'table': re.compile(
r'(?:(?<=\n)|\A){\|[\S\s]*?\n\|}|%s' % _tag_pattern('table')),
'template': NESTED_TEMPLATE_REGEX,
})
def get_regexes(
keys: str | Iterable[str],
site: pywikibot.site.BaseSite | None = None
) -> list[Pattern[str]]:
"""Fetch compiled regexes.
.. versionchanged:: 8.2
``_get_regexes`` becomes a public function.
*keys* may be a single string; *site* is optional.
:param keys: a single key or an iterable of keys whose regex pattern
should be given
:param site: a BaseSite object needed for ``category``, ``file``,
``interwiki``, ``invoke`` and ``property`` keys
:raises ValueError: site cannot be None.
"""
if not _regex_cache:
_create_default_regexes()
if isinstance(keys, str):
keys = [keys]
result = []
for exc in keys:
if not isinstance(exc, str):
# assume it's a regular expression
result.append(exc)
continue
# assume the string is a reference to a standard regex above,
# which may not yet have a site-specific regex compiled.
if exc not in _regex_cache:
# nowiki, noinclude, includeonly, timeline, math and other
# extensions
_regex_cache[exc] = _tag_regex(exc)
result.append(_regex_cache[exc])
elif not isinstance(_regex_cache[exc], tuple):
result.append(_regex_cache[exc])
else:
if not site and exc in ('interwiki', 'property', 'invoke',
'category', 'file'):
raise ValueError(f'site cannot be None for the {exc!r} regex')
if (exc, site) not in _regex_cache:
re_text, re_var = _regex_cache[exc]
_regex_cache[(exc, site)] = re.compile(
re_text % re_var(site), re.VERBOSE)
result.append(_regex_cache[(exc, site)])
# handle aliases
if exc == 'source':
result.append(_tag_regex('syntaxhighlight'))
elif exc == 'syntaxhighlight':
result.append(_tag_regex('source'))
elif exc == 'chem':
result.append(_tag_regex('ce'))
elif exc == 'math':
result.append(_tag_regex('chem'))
result.append(_tag_regex('ce'))
return result
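# Usage sketch with a site-independent key:
#   >>> comment = get_regexes('comment')[0]
#   >>> comment.sub('', 'a<!-- b -->c')
#   'ac'
# Site-dependent keys need a site, e.g. get_regexes(['category'], site).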
def replaceExcept(text: str,
old: str | Pattern[str],
new: str | Callable[[Match[str]], str],
exceptions: SequenceType[str | Pattern[str]],
caseInsensitive: bool = False,
allowoverlap: bool = False,
marker: str = '',
site: pywikibot.site.BaseSite | None = None,
count: int = 0) -> str:
"""
Return text with *old* replaced by *new*, ignoring specified types of text.
Skip occurrences of *old* within *exceptions*; e.g. within nowiki
tags or HTML comments. If *caseInsensitive* is true, then use
case-insensitive regex matching. If *allowoverlap* is true, overlapping
occurrences are all replaced.
.. caution:: Watch out when using *allowoverlap*, it might lead to
infinite loops!
:param text: text to be modified
:param old: a compiled or uncompiled regular expression
:param new: a string (which can contain regular expression
references), or a function which takes a match object as
parameter. See parameter *repl* of ``re.sub()``.
:param exceptions: a list of strings or already compiled regex
objects which signal what to leave out. List of strings might be
like ``['math', 'table', 'template']`` for example.
:param marker: a string that will be added to the last replacement;
if nothing is changed, it is added at the end
:param count: how many replacements to do at most. See parameter
*count* of ``re.sub()``.
"""
# if we got a string, compile it as a regular expression
if isinstance(old, str):
old = re.compile(old, flags=re.IGNORECASE if caseInsensitive else 0)
# early termination if not relevant
if not old.search(text):
return text + marker
dontTouchRegexes = get_regexes(exceptions, site)
index = 0
replaced = 0
markerpos = len(text)
while not count or replaced < count:
if index > len(text):
break
match = old.search(text, index)
if not match:
# nothing left to replace
break
# check which exception will occur next.
nextExceptionMatch = None
for dontTouchR in dontTouchRegexes:
excMatch = dontTouchR.search(text, index)
if excMatch and (
nextExceptionMatch is None
or excMatch.start() < nextExceptionMatch.start()):
nextExceptionMatch = excMatch
if nextExceptionMatch is not None \
and nextExceptionMatch.start() <= match.start():
# an HTML comment or text in nowiki tags stands before the next
# valid match. Skip.
index = nextExceptionMatch.end()
continue
# We found a valid match. Replace it.
if callable(new):
# the parameter new can be a function which takes the match
# as a parameter.
replacement = new(match)
else:
# It is not a function but a string. This is a small hack to
# make \n work. It would be better to fix it earlier, but it is
# better than nothing.
new = new.replace('\\n', '\n')
# We cannot just insert the new string, as it may contain regex
# group references such as \2 or \g<name>. Expanding them on the
# match object does not work either, because expansion can't handle
# lookahead or lookbehind (see bug T123185).
# So we have to process the group references manually.
replacement = ''
group_regex = re.compile(r'\\(\d+)|\\g<(.+?)>')
last = 0
for group_match in group_regex.finditer(new):
group_id = group_match[1] or group_match[2]
with suppress(ValueError):
group_id = int(group_id)
try:
replacement += new[last:group_match.start()]
replacement += match[group_id] or ''
except IndexError:
raise IndexError(f'Invalid group reference: {group_id}\n'
f'Groups found: {match.groups()}')
last = group_match.end()
replacement += new[last:]
text = text[:match.start()] + replacement + text[match.end():]
# continue the search on the remaining text
if allowoverlap:
index = match.start() + 1
else:
index = match.start() + len(replacement)
if not match.group():
# When the regex allows to match nothing, shift by one char
index += 1
markerpos = match.start() + len(replacement)
replaced += 1
return text[:markerpos] + marker + text[markerpos:]
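# Example sketch: the occurrence inside the HTML comment is skipped.
#   >>> replaceExcept('A <!--A--> A', 'A', 'B', ['comment'])
#   'B <!--A--> B'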
def removeDisabledParts(text: str,
tags: Iterable | None = None,
include: Container | None = None,
site: pywikibot.site.BaseSite | None = None
) -> str:
"""
Return text without portions where wiki markup is disabled.
Parts that will be removed by default are:
* HTML comments
* nowiki tags
* pre tags
* includeonly tags
* source and syntaxhighlight tags
.. versionchanged:: 7.0
the order of removals will correspond to the tags argument
if provided as an ordered collection (list, tuple)
:param tags: The exact set of parts which should be removed using
keywords from :func:`get_regexes`.
:param include: Alternatively, default parts that shall not
be removed.
:param site: Site to be used for site-dependent regexes. Default
disabled parts listed above do not need it.
:return: text stripped of disabled parts.
"""
if not tags:
tags = ['comment', 'includeonly', 'nowiki', 'pre', 'syntaxhighlight']
# avoid set(tags) because sets are internally ordered using the hash
# which for strings is salted per Python process => the output of
# this function would likely be different per script run because
# the replacements would be done in different order and the disabled
# parts may overlap and suppress each other
# see https://docs.python.org/3/reference/datamodel.html#object.__hash__
# ("Note" at the end of the section)
if include:
tags = [tag for tag in tags if tag not in include]
regexes = get_regexes(tags, site)
for regex in regexes:
text = regex.sub('', text)
return text
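# Example sketch:
#   >>> removeDisabledParts('foo <!-- hidden --> bar <nowiki>[[x]]</nowiki>')
#   'foo  bar '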
def removeHTMLParts(text: str, keeptags: list[str] | None = None) -> str:
"""
Return text without portions where HTML markup is disabled.
Parts that will be removed are:
* HTML and all wiki tags
The exact set of tags which should NOT be removed can be passed as the
'keeptags' parameter, which defaults to ['tt', 'nowiki', 'small', 'sup'].
"""
# try to merge with 'removeDisabledParts()' above into one generic function
# thanks to:
# https://www.hellboundhackers.org/articles/read-article.php?article_id=841
parser = _GetDataHTML()
if keeptags is None:
keeptags = ['tt', 'nowiki', 'small', 'sup']
with parser:
parser.keeptags = keeptags
parser.feed(text)
return parser.textdata
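# Example sketch: 'tt' is kept by default, 'b' is not.
#   >>> removeHTMLParts('<b>bold</b> and <tt>mono</tt>')
#   'bold and <tt>mono</tt>'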
class _GetDataHTML(HTMLParser):
"""HTML parser which removes html tags except they are listed in keeptags.
This class is also a context manager which closes itself at exit time.
.. seealso:: :pylib:`html.parser`
"""
textdata = ''
keeptags: list[str] = []
def __enter__(self) -> None:
pass
def __exit__(self, *exc_info) -> None:
self.close()
def handle_data(self, data) -> None:
"""Add data to text."""
self.textdata += data
def handle_starttag(self, tag, attrs) -> None:
"""Add start tag to text if tag should be kept."""
if tag in self.keeptags:
self.textdata += f'<{tag}>'
def handle_endtag(self, tag) -> None:
"""Add end tag to text if tag should be kept."""
if tag in self.keeptags:
self.textdata += f'</{tag}>'
def isDisabled(text: str, index: int, tags=None) -> bool:
"""
Return True if text[index] is disabled, e.g. by a comment or nowiki tags.
For the tags parameter, see :py:obj:`removeDisabledParts`.
"""
# Find a marker that is not already in the text.
marker = findmarker(text)
text = text[:index] + marker + text[index:]
text = removeDisabledParts(text, tags)
return marker not in text
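# Example sketch: index 8 falls inside the comment, index 0 does not.
#   >>> isDisabled('ok <!-- hidden --> ok', 8)
#   True
#   >>> isDisabled('ok <!-- hidden --> ok', 0)
#   False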
def findmarker(text: str, startwith: str = '@@',
append: str | None = None) -> str:
"""Find a string which is not part of text."""
if not append:
append = '@'
mymarker = startwith
while mymarker in text:
mymarker += append
return mymarker
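# Example: '@@' already occurs, so the marker is extended with '@'.
#   >>> findmarker('text with @@ inside')
#   '@@@'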
def expandmarker(text: str, marker: str = '', separator: str = '') -> str:
"""
Return a marker expanded by whitespace and the separator.
It searches for the first occurrence of the marker and gets the combination
of the separator and whitespace directly before it.
:param text: the text which will be searched.
:param marker: the marker to be searched.
:param separator: the separator string allowed before the marker. If
empty, whitespace won't be included either.
:return: the marker with the separator and whitespace from the text in
front of it. It'll be just the marker if the separator is empty.
"""
# Remove any number of separator occurrences plus arbitrary
# whitespace before, after, and between them by including them
# in the marker.
if separator:
firstinmarker = text.find(marker)
firstinseparator = firstinmarker
lenseparator = len(separator)
striploopcontinue = True
while firstinseparator > 0 and striploopcontinue:
striploopcontinue = False
if (firstinseparator >= lenseparator
and separator == text[firstinseparator
- lenseparator:firstinseparator]):
firstinseparator -= lenseparator
striploopcontinue = True
elif text[firstinseparator - 1] < ' ':
firstinseparator -= 1
striploopcontinue = True
marker = text[firstinseparator:firstinmarker] + marker
return marker
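# Example sketch: the separator and the control whitespace before it
# are absorbed into the marker (plain spaces are not).
#   >>> expandmarker('text.\n|MARK', 'MARK', '|')
#   '\n|MARK'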
def replace_links(text: str, replace, site: pywikibot.site.BaseSite) -> str:
"""Replace wikilinks selectively.
The text is searched for links, and each link is replaced depending on
the result of *replace* for that link. If the result is None the link
is skipped. If it is False the link is unlinked and just its label is
inserted. If it is a Link instance, the target, section and label from
that Link instance are used. If it is a Page instance, only the target
is taken from the replacement, while the section and label are kept
from the original link.
If it is a string and the replacement was a sequence, it is converted
into a Page instance. If the replacement is done via a callable that
returns a string, the link is directly replaced with that text, as
with unlinking.
If either the section or the label should be kept, the replacement can
be a function which returns a Link instance and copies the value which
should remain.
.. versionchanged:: 7.0
`site` parameter is mandatory
:param text: the text in which to replace links
:param replace: either a callable which reacts like described above.
The callable must accept four parameters link, text, groups, rng and
allows for user interaction. The groups are a dict containing 'title',
'section', 'label' and 'linktrail' and the rng are the start and end
position of the link. The 'label' in groups contains everything after
the first pipe which might contain additional data which is used in
File namespace for example.
Alternatively it can be a sequence containing two items where the first
must be a Link or Page and the second has almost the same meaning as
the result by the callable. It'll convert that into a callable where
the first item (the Link or Page) has to be equal to the found link and
in that case it will apply the second value from the sequence.
:type replace: sequence of pywikibot.Page/pywikibot.Link/str or
callable
:param site: a Site object to use. It should match the origin or
target site of the text
:raises TypeError: missing positional argument 'site'
:raises ValueError: Wrong site type
:raises ValueError: Wrong replacement number
:raises ValueError: Wrong replacement types
"""
def to_link(source):
"""Return the link from source when it's a Page otherwise itself."""
if isinstance(source, pywikibot.Page):
return source._link
if isinstance(source, str):
return pywikibot.Link(source, site)
return source
def replace_callable(link, text, groups, rng):
if replace_list[0] == link:
return replace_list[1]
return None
def check_classes(replacement):
"""Normalize the replacement into a list."""
if not isinstance(replacement, (pywikibot.Page, pywikibot.Link)):
raise ValueError('The replacement must be None, False, '
'a sequence, a Link or a str but '
'is "{}"'.format(type(replacement)))
def title_section(link) -> str:
title = link.title
if link.section:
title += '#' + link.section
return title
if not isinstance(site, pywikibot.site.BaseSite):
raise ValueError('The "site" argument must be a BaseSite not {}.'
.format(type(site).__name__))
if isinstance(replace, Sequence):
if len(replace) != 2:
raise ValueError('When used as a sequence, the "replace" '
'argument must contain exactly 2 items.')
replace_list = [to_link(replace[0]), replace[1]]
if not isinstance(replace_list[0], pywikibot.Link):
raise ValueError(
'The original value must be either str, Link or Page '
'but is "{}"'.format(type(replace_list[0])))
if replace_list[1] is not False and replace_list[1] is not None:
if isinstance(replace_list[1], str):
replace_list[1] = pywikibot.Page(site, replace_list[1])
check_classes(replace_list[0])
replace = replace_callable
linktrail = site.linktrail()
link_pattern = re.compile(
r'\[\[(?P<title>.*?)(#(?P<section>.*?))?(\|(?P<label>.*?))?\]\]'
r'(?P<linktrail>{})'.format(linktrail))
extended_label_pattern = re.compile(fr'(.*?\]\])({linktrail})')
linktrail = re.compile(linktrail)
curpos = 0
# This loop will run until we have finished the current page
while True:
m = link_pattern.search(text, pos=curpos)
if not m:
break
m_title = m['title'].strip()
# Ignore links to sections of the same page
if not m_title:
curpos = m.end()
continue
# Ignore interwiki links
if site.isInterwikiLink(m_title) and not m_title.startswith(':'):
curpos = m.end()
continue
groups = m.groupdict()
if groups['label'] and '[[' in groups['label']:
# TODO: Work on the link within the label too
# A link within a link, extend the label to the ]] after it
extended_match = extended_label_pattern.search(text, pos=m.end())
if not extended_match:
# TODO: Unclosed link label, what happens there?
curpos = m.end()
continue
groups['label'] += groups['linktrail'] + extended_match[1]
groups['linktrail'] = extended_match[2]
end = extended_match.end()
else:
end = m.end()
start = m.start()
# From this point on, the m variable shouldn't be used, as it may
# not contain all contents
del m
try:
link = pywikibot.Link.create_separated(
groups['title'], site, section=groups['section'],
label=groups['label'])
except (SiteDefinitionError, InvalidTitleError):
# unrecognized iw prefix or invalid title
curpos = end
continue
# Check whether the link found should be replaced.
# Either None, False or tuple(Link, bool)
new_link = replace(link, text, groups.copy(), (start, end))
if new_link is None:
curpos = end
continue
# The link looks like this:
# [[page_title|new_label]]new_linktrail
page_title = groups['title']
new_label = groups['label']
if not new_label:
# or like this: [[page_title]]new_linktrail
new_label = page_title
# remove leading ":" from the link text
if new_label[0] == ':':
new_label = new_label[1:]
new_linktrail = groups['linktrail']
if new_linktrail:
new_label += new_linktrail
if new_link is False:
# unlink - we remove the section if there's any
assert isinstance(new_label, str), 'link text must be str.'
new_link = new_label
if isinstance(new_link, str):
text = text[:start] + new_link + text[end:]
# Make sure that next time around we will not find this same hit.
curpos = start + len(new_link)
continue
if isinstance(new_link, bytes):
raise ValueError('The result must be str and not bytes.')
# Verify that it's either Link, Page or str
check_classes(new_link)
# Use section and label if it's a Link and not otherwise
if isinstance(new_link, pywikibot.Link):
is_link = True
else:
new_link = new_link._link
is_link = False
new_title = new_link.canonical_title()
# Make correct langlink if needed
if new_link.site != site:
new_title = ':' + new_link.site.code + ':' + new_title
if is_link:
# Use link's label
new_label = new_link.anchor
must_piped = new_label is not None
new_section = new_link.section
else:
must_piped = True
new_section = groups['section']
if new_section:
new_title += '#' + new_section
if new_label is None:
new_label = new_title
# Parse the link text and check if it points to the same page
parsed_new_label = pywikibot.Link(new_label, new_link.site)
try:
parsed_new_label.parse()
except InvalidTitleError:
pass
else:
parsed_link_title = title_section(parsed_new_label)
new_link_title = title_section(new_link)
# compare title, but only with parts if linktrail works
if not linktrail.sub('',
parsed_link_title[len(new_link_title):]):
# TODO: This must also compare everything that was used as a
# prefix (case-insensitively)
must_piped = (
not parsed_link_title.startswith(new_link_title)
or parsed_new_label.namespace != new_link.namespace)
if must_piped:
new_text = f'[[{new_title}|{new_label}]]'
else:
new_text = (f'[[{new_label[:len(new_title)]}]]'
f'{new_label[len(new_title):]}')
text = text[:start] + new_text + text[end:]
# Make sure that next time around we will not find this same hit.
curpos = start + len(new_text)
return text
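# Usage sketch (assumes a working site connection; 'Foo' is a
# hypothetical page): unlink every wikilink pointing to 'Foo' while
# keeping its label.
#   site = pywikibot.Site('en', 'wikipedia')
#   text = replace_links(text, (pywikibot.Page(site, 'Foo'), False), site)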
def add_text(text: str, add: str, *, site=None) -> str:
"""Add text to a page content above categories and interwiki.
.. versionadded:: 6.4
:param text: The page content to add text to.
:param add: Text to add.
:param site: The site that the text is coming from. Required for
reordering of categories and interlanguage links. The default site
is used otherwise.
:type site: pywikibot.Site
"""
# Translating the \\n (e.g. from the command line) into a real \n
add = add.replace('\\n', '\n')
# Getting the categories
categories_inside = getCategoryLinks(text, site)
# Deleting the categories
text = removeCategoryLinks(text, site)
# Getting the interwiki
interwiki_inside = getLanguageLinks(text, site)
# Removing the interwiki
text = removeLanguageLinks(text, site)
# Adding the text
text += '\n' + add
# Putting the categories back
text = replaceCategoryLinks(text, categories_inside, site, add_only=True)
# Adding the interwiki
return replaceLanguageLinks(text, interwiki_inside, site)
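# Usage sketch (assumes a working site connection; '{{stub}}' is a
# hypothetical template):
#   site = pywikibot.Site('en', 'wikipedia')
#   new_text = add_text(old_text, '{{stub}}', site=site)
# The added text lands above the category and interwiki links.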
# -------------------------------
# Functions dealing with sections
# -------------------------------
#: Head pattern
HEAD_PATTERN = re.compile(r'(={1,6}).+\1', re.DOTALL)
TITLE_PATTERN = re.compile("'{3}([^']+)'{3}")
class _Heading(NamedTuple):
text: str
start: int
end: int
class Section(NamedTuple):
"""A namedtuple as part of :class:`Content` describing a page section.
.. versionchanged:: 8.2
``_Section`` becomes a public class.
"""
title: str #: section title including equal signs
content: str #: section content
@property
def level(self) -> int:
"""Return the section level.
.. versionadded:: 8.2
"""
m = HEAD_PATTERN.match(self.title)
return len(m[1])
@property
def heading(self) -> str:
"""Return the section title without equal signs.
.. versionadded:: 8.2
"""
level = self.level
return self.title[level:-level].strip()
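# Example (derivable from the properties above):
#   >>> sec = Section('=== My heading ===', 'section text\n')
#   >>> sec.level
#   3
#   >>> sec.heading
#   'My heading'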
class Content(NamedTuple):
"""A namedtuple as result of :func:`extract_sections` holding page content.
.. versionchanged:: 8.2
``_Content`` becomes a public class.
"""
header: str #: the page header
sections: list[Section] #: the page sections
footer: str #: the page footer
@property
def title(self) -> str:
"""Return the first main title found on the page.
The first main title is anything enclosed within triple quotes.
.. versionadded:: 8.2
"""
m = TITLE_PATTERN.search(self.header)
return m[1].strip() if m else ''
def _extract_headings(text: str) -> list[_Heading]:
"""Return _Heading objects."""