Create not_in_dict.py

okdiplodok · May 13, 2015 · c0bc17f · c0bc17f
1 parent 63b66a0
commit c0bc17f
Showing 1 changed file with 65 additions and 0 deletions.
diff --git a/not_in_dict.py b/not_in_dict.py
@@ -0,0 +1,65 @@
+#coding:utf-8
+import codecs
+import re
+
+#на вход - файл и частотный словарь Ляшевской и Шарова 
+#(электронная версия доступна по ссылке http://dict.ruslang.ru/freq.php). На выходе - файл формата
+#"лемма ipm вхождение_из_текста". Поиск слов, который НЕТ в словаре (с вычетом имен собственных и глаголов)
+my_d = {}
+g = codecs.open(u'freqrnc2011.csv', 'r', 'utf-8')
+for line in g:
+    line = line.strip()
+    line = line.split(u'\t')
+    my_d[line[0]] = line[2]
+
+check = set()
+def rare_words(nfile):
+    excl = set([u'ть', u'чь', u'ти', u'ся', u'ие', u'ок', u'ек'])
+    global check
+    global my_d
+    r = re.compile(u'([а-яА-Я]+){([а-я]+)}', flags=re.U)
+    s = codecs.open(nfile, 'r', 'utf-8-sig')
+    for line in s:
+        line = line.split()
+        for element in line:
+            m = r.search(element)
+            if m != None:
+                if m.group(2) not in my_d and m.group(2) not in check and m.group(2)[-2:] not in excl and m.group(1)[0] not in u'ЙЦУКЕНГШЩЗХЪФЫВАПРОЛДЖЭЯЧСМИТЬБЮ' and len(m.group(2)) > 2:
+                    check.add(m.group(2))
+                    #return m.group(1)
+                    return m.group(2) + u'\t' + u'not_in_d'  + u'\t' + m.group(1) + u'\r\n'
+    s.close()
+
+#ф-я rare_write() на каждом шаге цикла читает размеченный файл и вызывает ф-ю rare_words()
+def rare_write():
+    n = 1
+    new = codecs.open(u'haha.csv', 'a', 'utf-8')
+    while n <= 361:
+        filen = u'C:\\Users\\diplodok\\Desktop\\Fipl\\War_and_Peace\\mystem\\a' + str(n) + u'.txt'
+        el = rare_words(filen)
+        if el != None:
+            new.write(el)
+        n += 1
+    new.close()
+
+#ss = rare_write()
+
+#ф-я собирает определения из файла комментариев Соболева. На выходе - файл формата 'lemma  ipm  token  def'
+def def_collect():         
+    y = codecs.open('haha.csv', 'r', 'utf-8')
+    g = codecs.open(u'def.txt', 'r', 'utf-8').readlines()
+    f = codecs.open(u'haha_def.csv', 'a', 'utf-8')
+    for line in y:
+        res = line.strip()
+        word = line.split('\t')[0]
+        #ipm = line.split(';')[1]
+        for definition in g:
+            n = definition.split(' -')[0]
+            if n.startswith(word) and len(word) > 2:
+                line1 = res + u'\t' + definition + u'\r\n'
+                print line1
+                f.write(line1)
+    y.close()
+    f.close()
+
+#sss = def_collect()