-
Notifications
You must be signed in to change notification settings - Fork 1
/
turkish_stemmer.rb
472 lines (393 loc) · 14.4 KB
/
turkish_stemmer.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
# coding: utf-8
require "turkish_stemmer/version"
require "yaml"
require "active_support/core_ext/hash"
# Please note that we use only lowercase letters for all methods. One should
# normalize input streams before using the `stem` method.
module TurkishStemmer
extend self
# Character classes of the lowercase Turkish alphabet. The strings are frozen
# because they are shared lookup tables that must never be mutated.
VOWELS = "üiıueöao".freeze
CONSONANTS = "bcçdfgğhjklmnprsştvyz".freeze
ROUNDED_VOWELS = "oöuü".freeze
UNROUNDED_VOWELS = "iıea".freeze
# Vowels accepted after a rounded vowel by the roundness harmony check
FOLLOWING_ROUNDED_VOWELS = "aeuü".freeze
FRONT_VOWELS = "eiöü".freeze
BACK_VOWELS = "ıuao".freeze
# Heuristic size for average Turkish stemmed word size
AVG_STEMMED_SIZE = 4
# Regular expression that checks if the word contains only turkish characters.
# It is anchored with \A and \z rather than ^ and $: the latter match at line
# boundaries, so a multi-line string such as "ev\n123" would wrongly pass.
ALPHABET = /\A[abcçdefgğhıijklmnoöprsştuüvyz]+\z/.freeze
# Stems a Turkish word.
#
# The work is split in three phases:
#
# 1. Pre-process: words rejected by `proceed_to_stem?` are returned untouched.
# 2. Process: the nominal verb suffix machine runs on the word, the noun
#    suffix machine then runs on every candidate collected so far, and the
#    derivational suffix machine finally runs on all candidates.
# 3. Post-process: `stem_post_process` selects the stem to return.
#
# When the only candidate after the first two machines is the word itself and
# the word ends in i, ı, u or ü, that last vowel is swapped with its
# counterpart (i <-> ı, u <-> ü) and stemming is retried once on the altered
# copy. The caller's string is never modified.
#
# NOTE(review): the retry returns whatever the altered word stems to, so a
# word that still cannot be stemmed appears to come back with its last vowel
# swapped - confirm this is intended.
#
# @param original_word [String] the word to stem
# @param depth [Integer] the call stack depth; the vowel-swap retry is only
#   attempted while depth is below 1
# @return [String] the stemmed word
def stem(original_word, depth = 0)
  # Preprocess
  return original_word unless proceed_to_stem?(original_word)

  word = original_word.dup

  # Process: nominal verb suffixes first, then noun suffixes on every candidate
  candidates = [nominal_verbs_suffix_machine { word }, original_word].flatten.uniq
  noun_stems = candidates.map { |candidate| noun_suffix_machine { candidate } }
  candidates = [candidates, noun_stems, original_word].flatten.uniq

  if candidates.include?(word) && candidates.size < 2 && depth < 1
    # Nothing was stripped: retry once with the last vowel swapped
    swapped_vowel = { 'ü' => 'u', 'ı' => 'i', 'i' => 'ı', 'u' => 'ü' }[word[-1]]
    if swapped_vowel
      word[-1] = swapped_vowel
      return stem(word, depth + 1)
    end
  end

  candidates << candidates.map { |candidate| derivational_suffix_machine { candidate } }

  # Postprocess
  stem_post_process(candidates, original_word)
end
# Reads a YAML file and converts its top-level keys to symbols.
#
# The path is resolved with File.expand_path("../../<file>", __FILE__), i.e.
# relative to the parent of the directory that contains this source file.
#
# @param file [String] relative path to the yaml file
# @return [Hash] the hash with symbols as keys
# @raise [RuntimeError] when the file cannot be read, parsed or symbolized;
#   the message carries the underlying error
def load_states_or_suffixes(file)
  begin
    absolute_path = File.expand_path("../../#{file}", __FILE__)
    parsed = YAML.load_file(absolute_path)
    parsed.symbolize_keys
  rescue => error
    raise "An error occured loading #{file}, #{error}"
  end
end
# Reads one entry of the settings file config/stemmer.yml.
#
# The file is located with File.expand_path("../../config/stemmer.yml",
# __FILE__) and is parsed again on every call.
#
# @param key [String] the key to look up in the settings
# @return [Object] the value stored under the key
# @raise [RuntimeError] when the settings file cannot be read or parsed
def load_settings(key)
  settings_file = File.expand_path("../../config/stemmer.yml", __FILE__)
  settings = YAML.load_file(settings_file)
  settings[key]
rescue => error
  raise "Please provide a valid config/stemmer.yml file, #{error}"
end
# State and suffix tables for the three suffix machines. Each file is read
# when this module is loaded and its top-level keys are symbolized by
# `load_states_or_suffixes`.
NOMINAL_VERB_STATES = load_states_or_suffixes("config/nominal_verb_states.yml")
NOMINAL_VERB_SUFFIXES = load_states_or_suffixes("config/nominal_verb_suffixes.yml")
NOUN_STATES = load_states_or_suffixes("config/noun_states.yml")
NOUN_SUFFIXES = load_states_or_suffixes("config/noun_suffixes.yml")
DERIVATIONAL_STATES = load_states_or_suffixes("config/derivational_states.yml")
DERIVATIONAL_SUFFIXES = load_states_or_suffixes("config/derivational_suffixes.yml")
##
# Load settings
#
# Every constant below is read from config/stemmer.yml by `load_settings`.
#
# Protected words: never stemmed (see `proceed_to_stem?`)
PROTECTED_WORDS = load_settings("protected_words")
# Last consonant exceptions: words that `last_consonant!` leaves unchanged
LAST_CONSONANT_EXCEPTIONS = load_settings("last_consonant_exceptions")
# Vowel harmony exceptions: words accepted by `mark_stem` even though they
# fail the vowel harmony check
VOWEL_HARMONY_EXCEPTIONS = load_settings("vowel_harmony_exceptions")
# Selection list exceptions: stems that `stem_post_process` prefers over the
# size-based choice
SELECTION_LIST_EXCEPTIONS = load_settings("selection_list_exceptions")
# Counts the syllables of a Turkish word. In Turkish the number of syllables
# equals the number of vowels, so this is the size of `vowels(word)`.
#
# @param word [String] the word to count its syllables
# @return [Integer] the number of syllables
def count_syllables(word)
  word_vowels = vowels(word)
  word_vowels.length
end
# Gets the vowels of a word.
#
# The vowels are obtained by deleting every character listed in CONSONANTS,
# so any other character of the input is kept and returned as well.
#
# @param word [String] the word to get its vowels
# @return [Array] array of vowels, in order of appearance
def vowels(word)
  without_consonants = word.delete(CONSONANTS)
  without_consonants.chars
end
# Checks vowel harmony of a word according to Turkish vowel harmony.
#
# Only the last two vowels of the word are compared. When the word has fewer
# than two vowels the missing ones are passed on as nil.
#
# @param word [String] the word to be checked against Turkish vowel harmony
# @return [Boolean]
# @see https://en.wikipedia.org/wiki/Vowel_harmony#Turkish
def has_vowel_harmony?(word)
  previous_vowel, last_vowel = vowels(word).values_at(-2, -1)
  vowel_harmony?(previous_vowel, last_vowel)
end
# Checks vowel harmony between two vowels: both the roundness and the
# frontness rules must hold. Frontness is only checked when roundness holds.
#
# @param vowel [String] the first vowel
# @param candidate [String] the second vowel
# @return [Boolean]
# @see https://en.wikipedia.org/wiki/Vowel_harmony#Turkish
def vowel_harmony?(vowel, candidate)
  roundness = has_roundness?(vowel, candidate)
  roundness && has_frontness?(vowel, candidate)
end
# Checks roundness vowel harmony of two vowels according to Turkish vowel
# harmony.
#
# The pair is accepted when both vowels are unrounded, or when the first one
# is rounded and the second one is listed in FOLLOWING_ROUNDED_VOWELS. A nil
# or empty vowel on either side is always accepted.
#
# @param vowel [String] the first vowel
# @param candidate [String] the second vowel
# @return [Boolean]
# @see https://en.wikipedia.org/wiki/Vowel_harmony#Turkish
def has_roundness?(vowel, candidate)
  # Nothing to compare against: treat as harmonic
  return true if [vowel, candidate].any? { |letter| letter.nil? || letter.empty? }

  both_unrounded = UNROUNDED_VOWELS.include?(vowel) && UNROUNDED_VOWELS.include?(candidate)
  return true if both_unrounded

  ROUNDED_VOWELS.include?(vowel) && FOLLOWING_ROUNDED_VOWELS.include?(candidate)
end
# Checks frontness vowel harmony of two vowels according to Turkish vowel
# harmony.
#
# The pair is accepted when both vowels are front vowels or both are back
# vowels. A nil or empty vowel on either side is always accepted.
#
# @param vowel [String] the first vowel
# @param candidate [String] the second vowel
# @return [Boolean]
# @see https://en.wikipedia.org/wiki/Vowel_harmony#Turkish
def has_frontness?(vowel, candidate)
  # Nothing to compare against: treat as harmonic
  return true if [vowel, candidate].any? { |letter| letter.nil? || letter.empty? }

  both_front = FRONT_VOWELS.include?(vowel) && FRONT_VOWELS.include?(candidate)
  return true if both_front

  BACK_VOWELS.include?(vowel) && BACK_VOWELS.include?(candidate)
end
# Checks whether a word can be stemmed or not. A word is rejected when it is
# nil, is not accepted by `turkish?`, is listed in PROTECTED_WORDS, or has at
# most one syllable. The checks run in that order and stop at the first
# rejection.
#
# @param word [String] the candidate word for stemming
# @return [Boolean] whether should proceed to stem or not
def proceed_to_stem?(word)
  return false if word.nil?
  return false unless turkish?(word)
  return false if PROTECTED_WORDS.include?(word)

  count_syllables(word) > 1
end
# Post stemming process: picks the final stem among the candidates.
#
# The candidates are flattened and de-duplicated; the original word and any
# candidate without a syllable are dropped; `last_consonant!` is applied to
# the rest. The survivors are ordered by how close their size is to
# AVG_STEMMED_SIZE, shorter first on a tie. A candidate listed in
# SELECTION_LIST_EXCEPTIONS wins over that ordering.
#
# When the DEBUG environment variable is set, the raw candidates are printed.
#
# @param stems [Array] array of candidate stems (may be nested)
# @param original_word [String] the original word
# @return [String] the stemmed or the original word
def stem_post_process(stems, original_word)
  puts "post process for #{original_word}: #{stems}" if ENV['DEBUG']

  candidates = stems.flatten.uniq

  # Reject original word and all non-syllable words
  candidates.reject! do |candidate|
    candidate == original_word || count_syllables(candidate) == 0
  end

  # Transform last consonant
  candidates.map! { |candidate| last_consonant!(candidate) }

  # Sort stems by distance from the average stem size, then by size
  distance = lambda { |candidate| (candidate.size - AVG_STEMMED_SIZE).abs }
  candidates.sort! do |left, right|
    [distance.call(left), left.size] <=> [distance.call(right), right.size]
  end

  # Check selection list exceptions
  preferred = candidates & SELECTION_LIST_EXCEPTIONS
  return preferred.first unless preferred.empty?

  # Keep first or original word
  candidates.empty? ? original_word : candidates.first
end
# Given a state key and a word, scans the transitions of that state and
# generates one pending transition for every suffix whose regular expression
# matches the end of the word.
#
# @param key [String] the key for states hash
# @param word [String] the word to check
# @param states [Hash] the states hash
# @param suffixes [Hash] the suffixes hash
# @param options [Hash] options for pendings
# @option options [Boolean] :mark Whether this pending is marked for deletion
# @return [Array] array of pendings; each pending is a Hash with the keys
#   :suffix, :to_state, :from_state, :word and :mark
# @raise [ArgumentError] when the states hash has no entry for the key
def generate_pendings(key, word, states, suffixes, options = {})
  state = states[key]
  raise ArgumentError, "State #{key} does not exist" if state.nil?

  marked = options[:mark] || false

  state["transitions"].each_with_object([]) do |transition, pendings|
    suffix_key = transition["suffix"]
    next unless word.match(/(#{suffixes[suffix_key]["regex"]})$/)

    pendings << {
      suffix: suffix_key,
      to_state: transition["state"],
      from_state: key,
      word: word,
      mark: marked
    }
  end
end
# Given a suffix it stems a word according to Turkish orthographic rules.
#
# The suffix is stripped only when the word is eligible and the suffix regular
# expression matches the end of the word. When the suffix declares an
# "optional_letter", `valid_optional_letter?` decides whether that letter is
# stripped together with the suffix, kept, or whether the whole stripping is
# rejected.
#
# NOTE(review): because && binds tighter than ||, a suffix whose
# "check_harmony" is falsy makes every word eligible, including words listed
# in PROTECTED_WORDS - confirm this is intended.
#
# @param word [String] the word to stem
# @param suffix [Hash] a suffix record; the keys "regex", "check_harmony" and
#   "optional_letter" are read
# @return [Hash] a stem answer record with the keys :stem (whether the suffix
#   was stripped), :word (the resulting word, or the input word when nothing
#   was stripped) and :suffix_applied (the removed text, or nil)
def mark_stem(word, suffix)
  harmony_required = suffix["check_harmony"]
  stem = !PROTECTED_WORDS.include?(word) &&
         (harmony_required &&
           (has_vowel_harmony?(word) || VOWEL_HARMONY_EXCEPTIONS.include?(word))) ||
         !harmony_required

  rejected = { stem: false, word: word, suffix_applied: nil }

  match = stem && word.match(/(#{suffix["regex"]})$/)
  return rejected unless match

  removed = match.to_s
  remainder = word.gsub(/(#{removed})$/, '')
  plain_answer = { stem: stem, word: remainder, suffix_applied: removed }

  optional_letter = suffix["optional_letter"]
  return plain_answer unless optional_letter

  valid, letter = valid_optional_letter?(remainder, optional_letter)
  # The optional letter is present but not allowed here: undo the stripping
  return rejected unless valid
  # The optional letter belongs to the suffix: strip it as well
  return { stem: stem, word: remainder.chop, suffix_applied: letter + removed } if letter

  plain_answer
end
# Given a word and a letter it checks if the optional letter can be part of
# the stem or not.
#
# When the word does not end with the letter the answer is true and no
# character is reported. Otherwise the character before the matched one
# decides: a matched vowel must follow a consonant and a matched consonant
# must follow a vowel.
#
# @param word [String] the examined word
# @param letter [String] a single letter or a string armed with a regular
#   expression
# @return [Array] the answer is returned as an array. First element is the
#   answer (true, false, or nil when there is no previous character) and
#   second element is the matched character (nil when nothing matched).
# @example
#   self.valid_optional_letter?("suy", "y")
#   # => [true, "y"]
#   self.valid_optional_letter?("test", "t")
#   # => [false, "t"]
def valid_optional_letter?(word, letter)
  found = word.match(/(#{letter})$/)
  return [true, nil] unless found

  tail = found.to_s
  before = word[-2]
  # A vowel has to be preceded by a consonant and vice versa
  required_class = VOWELS.include?(tail) ? CONSONANTS : VOWELS

  [before && required_class.include?(before), tail]
end
# Transforms a word taking into account the last consonant rule: a final b,
# c, d or ğ is replaced by p, ç, t or k respectively. Words listed in
# LAST_CONSONANT_EXCEPTIONS are left untouched.
#
# The word is modified in place and the same object is returned.
#
# @param word [String] the word to check for last consonant change
# @return [String] the changed word
def last_consonant!(word)
  return word if LAST_CONSONANT_EXCEPTIONS.include?(word)

  replacement = { 'b' => 'p', 'c' => 'ç', 'd' => 't', 'ğ' => 'k' }[word[-1]]
  word[-1] = replacement if replacement
  word
end
# Shortcut that runs `affix_morphological_stripper` with the nominal verb
# states and suffixes on the word returned by the given block.
#
# @yieldreturn [String] the word to strip
# @return [Array] the result of `affix_morphological_stripper`
def nominal_verbs_suffix_machine
  word = yield
  machine = { states: self::NOMINAL_VERB_STATES,
              suffixes: self::NOMINAL_VERB_SUFFIXES }
  affix_morphological_stripper(word, machine)
end
# Shortcut that runs `affix_morphological_stripper` with the noun states and
# suffixes on the word returned by the given block.
#
# @yieldreturn [String] the word to strip
# @return [Array] the result of `affix_morphological_stripper`
def noun_suffix_machine
  word = yield
  machine = { states: self::NOUN_STATES,
              suffixes: self::NOUN_SUFFIXES }
  affix_morphological_stripper(word, machine)
end
# Shortcut that runs `affix_morphological_stripper` with the derivational
# states and suffixes on the word returned by the given block.
#
# @yieldreturn [String] the word to strip
# @return [Array] the result of `affix_morphological_stripper`
def derivational_suffix_machine
  word = yield
  machine = { states: self::DERIVATIONAL_STATES,
              suffixes: self::DERIVATIONAL_SUFFIXES }
  affix_morphological_stripper(word, machine)
end
# A simple algorithm to strip suffixes from a word based on states and
# transitions.
#
# Starting from state :a, pending transitions are taken from the front of a
# queue one at a time. A transition whose suffix can be stripped (see
# `mark_stem`) either
#
# * reaches a final state: the stripped word is recorded as a stem, pendings
#   with the same from/to state and all marked pendings are dropped, and the
#   transitions of the new state are queued in front; or
# * reaches a non-final state: similar pendings are marked and the
#   transitions of the new state are queued in front, marked as well.
#
# When the DEBUG environment variable is set every successful step is printed.
#
# @param word [String] the word to strip affixes from
# @param options [Hash] options for the algorithm
# @option options [Hash] :states The states and valid transitions
# @option options [Hash] :suffixes The suffixes with their rules
# @return [Array] all possible stem versions; `[word]` (the word as given)
#   when the states or suffixes are empty or when no stem was found
def affix_morphological_stripper(word, options = {})
  states = options[:states] || {}
  suffixes = options[:suffixes] || {}

  return [word] if states.empty? || suffixes.empty?

  stems = []

  # Init first state pending transitions
  pendings = generate_pendings(:a, word, states, suffixes)

  until pendings.empty?
    transition = pendings.shift
    # Kept separate from `word` so that the fallback below returns the word
    # that was passed in, never a partially stripped one left over from a
    # path that did not reach a final state.
    current_word = transition[:word]
    suffix = suffixes[transition[:suffix]]
    to_state = states[transition[:to_state]]
    answer = mark_stem(current_word, suffix)

    next unless answer[:stem] == true

    if ENV['DEBUG']
      puts "Word: #{current_word} \nAnswer: #{answer} \nInfo: #{transition} \nSuffix: #{suffix}"
    end

    if to_state["final_state"] == true
      # We have a valid transition here. It is safe to remove any pendings
      # with the same signature as the current pending
      remove_pendings_like!(transition, pendings)
      remove_mark_pendings!(pendings)

      stems.push answer[:word]

      unless to_state["transitions"].empty?
        pendings.unshift(*generate_pendings(transition[:to_state], answer[:word], states, suffixes))
      end
    else
      mark_pendings!(transition, pendings)
      pendings.unshift(*generate_pendings(transition[:to_state], answer[:word],
                                          states, suffixes, mark: true))
    end
  end

  stems.empty? ? [word] : stems.uniq
end
private
# Removes, in place, every pending that goes from the same state to the same
# state as the given pending.
#
# @param pending [Hash] the reference pending
# @param array [Array] the pendings to clean up
# @return [Array, nil] the array, or nil when nothing was removed
def remove_pendings_like!(pending, array)
  signature = pending.values_at(:to_state, :from_state)
  array.reject! do |candidate|
    candidate.values_at(:to_state, :from_state) == signature
  end
end
# Sets the :mark flag on every pending that `similar_pendings` reports for
# the given pending.
#
# @param pending [Hash] the reference pending
# @param array [Array] the pendings to mark
# @return [Array] the pendings that were marked
def mark_pendings!(pending, array)
  matching = similar_pendings(pending, array)
  matching.each { |candidate| candidate[:mark] = true }
end
# Removes, in place, every pending whose :mark flag is true.
#
# @param array [Array] the pendings to clean up
# @return [Array, nil] the array, or nil when nothing was removed
def remove_mark_pendings!(array)
  array.reject! do |candidate|
    flag = candidate[:mark]
    flag == true
  end
end
# Selects the pendings that go from the same state to the same state as the
# given pending.
#
# @param pending [Hash] the reference pending
# @param array [Array] the pendings to search
# @return [Array] a new array with the matching pendings
def similar_pendings(pending, array)
  signature = pending.values_at(:to_state, :from_state)
  array.select do |candidate|
    candidate.values_at(:to_state, :from_state) == signature
  end
end
# Tells whether the word is accepted by the ALPHABET regular expression.
#
# @param word [String] the word to check
# @return [Boolean]
def turkish?(word)
  word.match(ALPHABET) ? true : false
end
end