generated from BloomTech-Labs/template-ds
-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathretrieve_definition.py
157 lines (115 loc) · 4.37 KB
/
retrieve_definition.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#!/usr/bin/python3
"""
Pull the first 190 characters from the Wikipedia article for a given term
"""
import requests
def get_API_params(term):
    """
    Build the request parameters for fetching a Wikipedia extract.

    Args:
        term: Page title to look up; sent as the ``titles`` parameter.

    Returns:
        A dict of query parameters asking for a 190-character, plain-text
        extract of the page, returned as JSON.
    """
    return {
        "action": "query",
        "prop": "extracts",
        # Length of the requested extract, in characters.
        "exchars": "190",
        "titles": term,
        "format": "json",
        # Strip Wikipedia's special formatting from the extract.
        "explaintext": 1,
        # Only return one extract.
        "exlimit": 1,
    }
def get_opensearch_params(term):
    """
    Build the request parameters for a Wikipedia opensearch lookup.

    Args:
        term: Text to search for; sent as the ``search`` parameter.

    Returns:
        A dict of query parameters that runs opensearch on ``term`` and
        returns the results as JSON.
    """
    return {
        "action": "opensearch",
        "search": term,
        # "resolve" returns redirects as the page they point to.
        "redirects": "resolve",
        "format": "json",
    }
def get_json_extract(term):
    """
    Query the Wikipedia API for the extract of ``term``.

    Args:
        term: Page title to look up.

    Returns:
        The decoded JSON body of the API response.

    Raises:
        requests.RequestException: If the HTTP request fails.
        ValueError: If the response body is not valid JSON.
    """
    URL = "https://en.wikipedia.org/w/api.php"
    params = get_API_params(term)
    print("Searching API for: ", term)
    # Use the session as a context manager so it is closed even when the
    # request raises; a manual close() would be skipped in that case.
    with requests.Session() as session:
        response = session.get(url=URL, params=params)
    # The body was fully read by get(), so it can be decoded after the
    # session is closed.
    return response.json()
def get_json_opensearch(term):
    """
    Run a Wikipedia opensearch for ``term`` and return the suggestions.

    Args:
        term: Text to search for.

    Returns:
        Element 1 of the decoded opensearch JSON reply, which the caller
        treats as a sequence of suggested article titles.

    Raises:
        requests.RequestException: If the HTTP request fails.
        ValueError: If the response body is not valid JSON.
    """
    URL = "https://en.wikipedia.org/w/api.php"
    params = get_opensearch_params(term)
    # Use the session as a context manager so it is closed even when the
    # request or the JSON decoding raises.
    with requests.Session() as session:
        response = session.get(url=URL, params=params)
        payload = response.json()
    return payload[1]
def retrieve_definition(term, term_wrangled=False):
    """
    Given a term, returns the first 190 characters of the matching Wikipedia
    page. If no page is found, retries once with a cleaned-up version of the
    term, and otherwise returns a "Did you mean...?" prompt with three terms
    that do have matching pages.

    Args:
        term: The text to look up.
        term_wrangled: True when ``term`` has already been through
            ``text_wrangle``; prevents wrangling (and recursing) again.

    Returns:
        The extract text, a suggestion prompt from ``open_search``, or a
        "too long" message when ``term`` exceeds 255 characters.
    """
    if len(term) > 255:
        # Return right away. Previously this message was assigned and then
        # always overwritten, because the lookup below still ran.
        return 'Sorry, that text is too long to search!'

    data = get_json_extract(term)
    pageid = list(data['query']['pages'].keys())[0]
    try:
        print("Pulling extract")
        # Two lookups are needed because one of the dictionary keys is the
        # page ID for that term.
        extract = data['query']['pages'][pageid]['extract']
        # An extract of length 3 indicates '...', which is what the API
        # usually returns if it doesn't find a page.
        if len(extract) > 3:
            text = extract
        elif len(extract) == 3 and term_wrangled is False:
            # Retry once with the normalized term before giving up.
            wrangled_term = text_wrangle(term)
            print("Wrangled_term: ", wrangled_term)
            wrangled_extract = retrieve_definition(wrangled_term,
                                                   term_wrangled=True)
            print(len(wrangled_extract))
            if len(wrangled_extract) > 3:
                text = wrangled_extract
            else:
                text = open_search(term)
        else:
            text = open_search(term)
    except KeyError:
        # Sometimes, instead of an empty extract, the API returns a
        # "missing" key in the JSON; fall back to suggestions in that case.
        text = open_search(term)
    return text
def open_search(term):
    """
    Suggest related Wikipedia articles for a term via opensearch.

    opensearch is a Wikimedia API feature which returns similarly-titled
    articles within the wiki.

    Args:
        term: The text to find suggestions for.

    Returns:
        A "Did you mean ...?" prompt built from the first three suggestions,
        or an apology message when fewer than three are available.
    """
    candidates = get_json_opensearch(term)
    try:
        first, second, third = candidates[0], candidates[1], candidates[2]
    except IndexError:
        # This covers cases where input doesn't have a close Wiki entry
        return "We can't find anything close to that :("
    return f"Did you mean {first}, {second}, {third}?"
def text_wrangle(term):
    """
    Normalize a search term before retrying a lookup.

    Lowercases the term, drops a leading 'the ' or 'a ', and replaces the
    term with its singular form when the inflect engine reports one. Each
    step that changes the term prints the intermediate result.

    Args:
        term: The text to normalize.

    Returns:
        The normalized term.
    """
    import inflect

    # Engine used for the singularization step below
    engine = inflect.engine()

    # Lowercasing first means 'The ' and 'the ' are both stripped below
    term = term.lower()
    print("Lowercase search: ", term)

    if term.startswith('the '):
        term = term[len('the '):]
        print("Search without 'the': ", term)

    if term.startswith('a '):
        term = term[len('a '):]
        print("Search without 'a': ", term)

    # singular_noun gives a falsy result when it has no singular form to
    # offer, in which case the term is left as it is
    singular = engine.singular_noun(term)
    if singular:
        term = singular
        print("Search as singular: ", term)

    return term