-
-
Notifications
You must be signed in to change notification settings - Fork 459
/
Copy pathspellcheck.py
116 lines (88 loc) · 3.06 KB
/
spellcheck.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import codecs
import collections
import os
from . import resources
__copyright__ = """
2012 Peter Norvig (http://norvig.com/spell-correct.html)
2013-2018 David Aguilar <davvid@gmail.com>
"""
# Lowercase ASCII letters that edits1() substitutes and inserts when
# generating candidate spellings.
ALPHABET = 'abcdefghijklmnopqrstuvwxyz'
def train(features, model):
    """Tally every item of ``features`` into ``model`` and return ``model``.

    ``model`` is mutated in place: each item increments ``model[item]`` by
    one.  The mapping must therefore already hold the key or supply a value
    for missing keys; the spell checker in this module passes a
    ``collections.defaultdict``.
    """
    for token in features:
        model[token] += 1
    return model
def edits1(word):
    """Return the set of strings that are one edit away from ``word``.

    An edit is a single-character deletion, a swap of two adjacent
    characters, a replacement of one character by a letter of ALPHABET, or
    an insertion of a letter of ALPHABET at any position.  Replacing a
    character with itself is included, so a non-empty ``word`` is a member
    of its own result.
    """
    variants = set()
    # Walk every split point, including the one past the last character,
    # so that insertions can also append to the end of the word.
    for cut in range(len(word) + 1):
        head = word[:cut]
        tail = word[cut:]
        if tail:
            rest = tail[1:]
            # Deletion: drop the first character of the tail.
            variants.add(head + rest)
            # Replacement: swap that character for each letter.
            for letter in ALPHABET:
                variants.add(head + letter + rest)
        if len(tail) > 1:
            # Transposition: exchange the first two characters of the tail.
            variants.add(head + tail[1] + tail[0] + tail[2:])
        # Insertion: place each letter between head and tail.
        for letter in ALPHABET:
            variants.add(head + letter + tail)
    return variants
def known_edits2(word, words):
    """Return the strings two edits away from ``word`` that are in ``words``.

    Every one-edit variant of ``word`` is expanded by one more edit, and
    only results that pass the ``in words`` membership test are kept.
    """
    matches = set()
    for first_pass in edits1(word):
        matches.update(
            second_pass
            for second_pass in edits1(first_pass)
            if second_pass in words
        )
    return matches
def known(word, words):
    """Return the set of candidates that are present in ``words``.

    Despite its singular name, ``word`` is iterated: it is a collection of
    candidate strings, each of which is kept only if ``candidate in words``.
    """
    recognized = set()
    for candidate in word:
        if candidate in words:
            recognized.add(candidate)
    return recognized
def suggest(word, words):
    """Return spelling candidates for ``word`` drawn from ``words``.

    The first non-empty result of the following tiers is returned:

    1. ``word`` itself, when it is in ``words`` (a one-element set);
    2. the known strings one edit away;
    3. the known strings two edits away;
    4. ``[word]`` -- a list holding the unchanged input -- when nothing
       matched.
    """
    exact = known([word], words)
    if exact:
        return exact

    one_edit = known(edits1(word), words)
    if one_edit:
        return one_edit

    two_edits = known_edits2(word, words)
    if two_edits:
        return two_edits

    # Nothing recognised: hand the input back as the only candidate.
    return [word]
def correct(word, words):
    """Return the candidate for ``word`` that ranks highest in ``words``.

    Candidates come from suggest() and are ranked by ``words.get``, so
    ``words`` must be a mapping from word to a comparable score.
    """
    score = words.get
    return max(suggest(word, words), key=score)
class NorvigSpellCheck:
    """Spell checker backed by word lists that are loaded on first use.

    The word lists are not read by the constructor; the first call to
    suggest() or check() triggers init(), which trains ``self.words`` from
    the dictionary files and from any words registered with add_word().
    """

    def __init__(
        self,
        words='dict/words',
        propernames='dict/propernames',
    ):
        """Locate the dictionary files without reading them.

        ``words`` and ``propernames`` are looked up with
        ``resources.find_first()`` against ``resources.xdg_data_dirs()``.
        """
        data_dirs = resources.xdg_data_dirs()
        self.dictwords = resources.find_first(words, data_dirs)
        self.propernames = resources.find_first(propernames, data_dirs)
        # Missing keys read as 1, so a word trained once is stored as 2.
        self.words = collections.defaultdict(lambda: 1)
        # Words registered through add_word().
        self.extra_words = set()
        # Optional path of an additional dictionary file; see read().
        self.dictionary = None
        self.initialized = False

    def set_dictionary(self, dictionary):
        """Set the path of an additional dictionary file for read().

        Within this class read() is only consumed by the first init(), so
        a path set after the checker has been initialized is not loaded.
        """
        self.dictionary = dictionary

    def init(self):
        """Train the word model once; later calls return immediately."""
        if self.initialized:
            return
        # Mark as initialized before training so the work is never redone.
        self.initialized = True
        train(self.read(), self.words)
        train(self.extra_words, self.words)

    def add_word(self, word):
        """Register ``word`` as correctly spelled.

        Words added before init() are trained together with the dictionary
        files.  Words added afterwards are counted immediately; previously
        they were stored but never reached ``self.words``, so check() kept
        rejecting them.  Adding the same word again is a no-op.
        """
        if word in self.extra_words:
            return
        self.extra_words.add(word)
        if self.initialized:
            # Same single increment that train() applies during init().
            self.words[word] += 1

    def suggest(self, word):
        """Return spelling candidates for ``word`` from the trained model."""
        self.init()
        return suggest(word, self.words)

    def check(self, word):
        """Return True when ``word``, with every '.' removed, is known."""
        self.init()
        return word.replace('.', '') in self.words

    def read(self):
        """Read dictionary words

        Yields one word per non-blank line of each dictionary file that
        exists.  Words from the main word list are additionally yielded in
        title case.  Trailing whitespace is stripped and blank lines are
        skipped so that the empty string never becomes a known word.
        Undecodable bytes are dropped, and a file that fails with OSError
        is abandoned without raising.
        """
        # (path, also yield the title-cased form)
        sources = (
            (self.dictwords, True),
            (self.propernames, False),
            (self.dictionary, False),
        )
        for path, title in sources:
            if not (path and os.path.exists(path)):
                continue
            try:
                with open(
                    path, encoding='utf-8', errors='ignore'
                ) as words_file:
                    for line in words_file:
                        word = line.rstrip()
                        if not word:
                            continue
                        yield word
                        if title:
                            yield word.title()
            except OSError:
                # Best effort: an unreadable dictionary must not break
                # spell checking, so move on to the next file.
                continue