diff --git a/macropodus/__init__.py b/macropodus/__init__.py index db158ea..a085302 100644 --- a/macropodus/__init__.py +++ b/macropodus/__init__.py @@ -10,9 +10,9 @@ from macropodus.segment import cut_bidirectional, cut_forward, cut_reverse, cut_search, cut_dag, cut, find from macropodus.segment import load_user_dict, save_delete_words, save_add_words, delete_word, add_word from macropodus.summarize import keyword, textrank, summarization -from macropodus.__init_tf_keras import * # tf.python.keras, custom_objects from macropodus.version import __version__ # 版本 from macropodus.similarity import sim +import os # 机械分词 cut_bidirectional = cut_bidirectional @@ -49,3 +49,6 @@ han2zh = han2zh zh2han = zh2han pinyin = pinyin + +if os.environ.get("macropodus_use_dl", False)=="1": + from macropodus.__init_tf_keras import * # tf.python.keras, custom_objects diff --git a/macropodus/segment/__init__.py b/macropodus/segment/__init__.py index 08f8064..7784ea9 100644 --- a/macropodus/segment/__init__.py +++ b/macropodus/segment/__init__.py @@ -7,9 +7,13 @@ from macropodus.segment.seg_statistics.seg_statistics import SegStatistics from macropodus.segment.word_discovery.word_discovery import WordDiscovery +import os -# 机械分词 -use_cache = True # 使用缓存 + +# 机械分词,默认使用缓存 +use_cache = True +if not os.environ.get("macropodus_use_seg_cache", True): + use_cache = False # 不使用缓存,重新加载 segs = SegStatistics(use_cache) cut_bidirectional = segs.cut_bidirectional cut_forward = segs.cut_forward diff --git a/macropodus/segment/seg_statistics/seg_statistics.py b/macropodus/segment/seg_statistics/seg_statistics.py index 4c20714..597550e 100644 --- a/macropodus/segment/seg_statistics/seg_statistics.py +++ b/macropodus/segment/seg_statistics/seg_statistics.py @@ -8,7 +8,7 @@ from macropodus.preprocess.tools_common import re_continue from macropodus.base.seg_basic import SegBasic from math import log - +import re __all__ = ["cut_dag", "cut_forward", @@ -16,6 +16,9 @@ "cut_bidirectional", "cut_search"] 
+re_han = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._%\-]+)", re.U) +re_skip = re.compile("(\r\n|\s)", re.U) + class SegStatistics(SegBasic): def __init__(self, use_cache): @@ -170,22 +173,42 @@ def cut_search(self, sentence): def cut(self, sentence, type_cut="cut_dag"): """ 切词总函数 + cut_block, 代码来自jieba项目 + code from: https://github.com/fxsjy/jieba :param sentence:str, like '大漠帝国, macropodus, 中国斗鱼' :param type_cut: str, like 'cut_dag', 'cut_forward', 'cut_reverse', 'cut_bidirectional', 'cut_search' - :return: list, like ['大漠帝国', ',', 'macropodus', ',', '中国斗鱼'] + :return: yield, like ['大漠帝国', ',', 'macropodus', ',', '中国斗鱼'] """ if type_cut=="cut_dag": - return list(self.cut_dag(sentence)) + cut_block = self.cut_dag elif type_cut=="cut_forward": - return list(self.cut_dag(sentence)) + cut_block = self.cut_forward elif type_cut=="cut_reverse": - return list(self.cut_dag(sentence)) + cut_block = self.cut_reverse elif type_cut=="cut_bidirectional": - return list(self.cut_dag(sentence)) + cut_block = self.cut_bidirectional elif type_cut=="cut_search": - return list(self.cut_dag(sentence)) + cut_block = self.cut_search else: raise RuntimeError("type_cut must be 'cut_dag', 'cut_forward', 'cut_reverse', 'cut_bidirectional', 'cut_search'") + blocks = re_han.split(sentence) + cut_all = False + for block in blocks: + if not block: + continue + if re_han.match(block): + for word in cut_block(block): + yield word + else: + tmp = re_skip.split(block) + for x in tmp: + if re_skip.match(x): + yield x + elif not cut_all: + for xx in x: + yield xx + else: + yield x if __name__ == '__main__': diff --git a/macropodus/segment/word_discovery/word_discovery.py b/macropodus/segment/word_discovery/word_discovery.py index 4627e77..e8feb22 100644 --- a/macropodus/segment/word_discovery/word_discovery.py +++ b/macropodus/segment/word_discovery/word_discovery.py @@ -25,6 +25,7 @@ def __init__(self): self.total_words = 0 self.freq_min = 3 self.len_max = 7 + self.round = 6 self.eps = 1e-9 
self.empty_words = [sw for sw in stop_words.values() if len(sw)==1] # 虚词 @@ -35,8 +36,10 @@ def count_word(self, text, use_type="text"): :param use_type: str, "text" or "file", file of "utf-8" of "txt" :return: class, word-freq """ + import macropodus self.words_count = Counter() if use_type=="text": # 输入为文本形式 + text = macropodus.han2zh(text) texts = cut_sentence(use_type=self.algorithm, text=text) # 切句子, 如中英文的逗号/句号/感叹号 for text in texts: @@ -50,6 +53,7 @@ def count_word(self, text, use_type="text"): fr8 = open(text, "r", encoding="utf-8") for text in fr8: if text.strip(): + text = macropodus.han2zh(text) texts = cut_sentence(use_type=self.algorithm, text=text) # 切句子, 如中英文的逗号/句号/感叹号 for text in texts: @@ -108,9 +112,9 @@ def calculate_entropy(self, boundary_type="left"): if (k[0] in self.empty_words or k[-1] in self.empty_words): entroy_boundary = entroy_boundary / len(k) if boundary_type == "right": - self.right_entropy[k] = entroy_boundary + self.right_entropy[k] = round(entroy_boundary, self.round) else: - self.left_entropy[k] = entroy_boundary + self.left_entropy[k] = round(entroy_boundary, self.round) def compute_entropys(self): """ @@ -146,8 +150,38 @@ def compute_aggregation(self): probability_chars = reduce(mul,([wf for wf in words_freq])) / (twl_1**(len(word))) pmi = math.log(probability_word / probability_chars, 2) # AMI=PMI/length_word. 
惩罚虚词(避免"的", "得", "了"开头结尾的情况) - self.aggregation[word] = pmi/(len_word**len_word) if (word[0] in self.empty_words or word[-1] in self.empty_words) \ - else pmi/len_word # pmi / len_word / len_word + word_aggregation = pmi/(len_word**len_word) if (word[0] in self.empty_words or word[-1] in self.empty_words) \ + else pmi/len_word # pmi / len_word / len_word + self.aggregation[word] = round(word_aggregation, self.round) + + def compute_score(self, word, value, a, r, l, rl, lambda_0, lambda_3): + """ + 计算最终得分 + :param word: str, word with prepare + :param value: float, word freq + :param a: float, aggregation of word + :param r: float, right entropy of word + :param l: float, left entropy of word + :param rl: float, right_entropy * left_entropy + :param lambda_0: lambda 0 + :param lambda_3: lambda 3 + :return: + """ + self.new_words[word] = {} + # math.log10(self.aggregation[word]) - math.log10(self.total_words) + self.new_words[word]["a"] = a + self.new_words[word]["r"] = r + self.new_words[word]["l"] = l + self.new_words[word]["f"] = value + # word-liberalization + m1 = lambda_0(r) + m2 = lambda_0(l) + m3 = lambda_0(a) + score_ns = lambda_0((lambda_3(m1, m2) + lambda_3(m1, m3) + lambda_3(m2, m3)) / 3) + self.new_words[word]["ns"] = round(score_ns, self.round) + # 乘以词频word-freq, 连乘是为了防止出现较小项 + score_s = value * a * rl * score_ns + self.new_words[word]["s"] = round(score_s, self.round) def find_word(self, text, use_type="text", freq_min=2, len_max=5, entropy_min=2.0, aggregation_min=3.2, use_output=True, use_avg=False, use_filter=False): @@ -175,66 +209,25 @@ def find_word(self, text, use_type="text", freq_min=2, len_max=5, entropy_min=2. 
lambda_0 = lambda x: -self.eps * x + self.eps if x <= 0 else x # 输出 for word, value in self.words_select.items(): + # 过滤通用词 if use_filter and word in self.dict_words_freq: continue + # 过滤停用词 if word in self.stop_words: continue - if use_output: - # {"aggregation":"agg", "right_entropy":"r", "left_entropy":"l", "frequency":"f", "score":"s"} - self.new_words[word] = {} - # math.log10(self.aggregation[word]) - math.log10(self.total_words) - self.new_words[word]["a"] = self.aggregation[word] - self.new_words[word]["r"] = self.right_entropy[word] - self.new_words[word]["l"] = self.left_entropy[word] - self.new_words[word]["f"] = value - # word-liberalization - m1 = lambda_0(self.right_entropy[word]) - m2 = lambda_0(self.left_entropy[word]) - m3 = lambda_0(self.aggregation[word]) - score_3 = lambda_0((lambda_3(m1, m2) + lambda_3(m1, m3) + lambda_3(m2, m3)) / 3) - self.new_words[word]["ns"] = score_3 - # 乘以freq效果没那么好, 连乘是为了防止出现较小项 - # self.new_words[word]["s"] = self.new_words[word]["f"] * self.new_words[word]["a"] * \ - # self.right_entropy[word] * self.left_entropy[word] - self.new_words[word]["s"] = self.new_words[word]["f"] * self.new_words[word]["a"] * \ - self.right_entropy[word] * self.left_entropy[word] * score_3 + # {"aggregation":"a", "right_entropy":"r", "left_entropy":"l", "frequency":"f", + # "word-liberalization":"ns", "score":"s"} + a = self.aggregation[word] + r = self.right_entropy[word] + l = self.left_entropy[word] + rl = (r+l) / 2 if use_avg else r * l + if use_output or (use_avg and a > self.aggregation_min and rl > self.entropy_min) or \ + (not use_avg and a > self.aggregation_min and r > self.entropy_min and l > self.entropy_min): + self.compute_score(word, value, a, r, l, rl, lambda_0, lambda_3) - elif not use_avg and self.aggregation[word] > self.aggregation_min \ - and self.right_entropy[word] > self.entropy_min and self.left_entropy[word] > self.entropy_min: - self.new_words[word] = {} - # {"aggregation":"agg", "right_entropy":"r", 
"left_entropy":"l", "frequency":"f", "score":"s"} - self.new_words[word]["a"] = self.aggregation[word] # math.log10(self.aggregation[word]) - math.log10(self.total_words) - self.new_words[word]["r"] = self.right_entropy[word] - self.new_words[word]["l"] = self.left_entropy[word] - self.new_words[word]["f"] = value - # word-liberalization - m1 = lambda_0(self.right_entropy[word]) - m2 = lambda_0(self.left_entropy[word]) - m3 = lambda_0(self.aggregation[word]) - score_3 = lambda_0((lambda_3(m1, m2) + lambda_3(m1, m3) + lambda_3(m2, m3)) / 3) - self.new_words[word]["ns"] = score_3 - self.new_words[word]["s"] = self.new_words[word]["f"] * self.new_words[word]["a"] * \ - (self.right_entropy[word] + self.left_entropy[word])/2 * score_3 - elif use_avg and self.aggregation[word] > self.aggregation_min \ - and (self.right_entropy[word] + self.left_entropy[word]) > 2 * self.entropy_min: - self.new_words[word] = {} - # {"aggregation":"agg", "right_entropy":"r", "left_entropy":"l", "frequency":"f", "score":"s"} - self.new_words[word]["a"] = self.aggregation[word] - self.new_words[word]["r"] = self.right_entropy[word] - self.new_words[word]["l"] = self.left_entropy[word] - self.new_words[word]["f"] = value - # word-liberalization - m1 = lambda_0(self.right_entropy[word]) - m2 = lambda_0(self.left_entropy[word]) - m3 = lambda_0(self.aggregation[word]) - score_3 = lambda_0((lambda_3(m1, m2) + lambda_3(m1, m3) + lambda_3(m2, m3)) / 3) - self.new_words[word]["ns"] = score_3 - self.new_words[word]["s"] = self.new_words[word]["a"] * (self.right_entropy[word] + self.left_entropy[word])/2 - # mul, 相乘 - self.new_words[word]["s"] *= score_3 # 排序 - new_words = sorted(self.new_words.items(), key=lambda x:x[1]["s"], reverse=True) - self.new_words = OrderedDict(new_words) + self.new_words = sorted(self.new_words.items(), key=lambda x:x[1]["s"], reverse=True) + self.new_words = OrderedDict(self.new_words) return self.new_words diff --git a/macropodus/similarity/__init__.py 
b/macropodus/similarity/__init__.py index 17042a2..32e4e12 100644 --- a/macropodus/similarity/__init__.py +++ b/macropodus/similarity/__init__.py @@ -6,9 +6,12 @@ from macropodus.similarity.similarity_word2vec_char import SimW2vChar +import os - +# 词向量, 默认使用缓存 +use_cache = True +if not os.environ.get("macropodus_use_w2v_cache", True): + use_cache = False # 不使用缓存,重新加载 # 文本相似度 -use_cache = True # 使用缓存 swc = SimW2vChar(use_cache) sim = swc.similarity diff --git a/macropodus/similarity/similarity_word2vec_char.py b/macropodus/similarity/similarity_word2vec_char.py index 8b095e7..d5ce447 100644 --- a/macropodus/similarity/similarity_word2vec_char.py +++ b/macropodus/similarity/similarity_word2vec_char.py @@ -9,7 +9,7 @@ class SimW2vChar(W2v): - def __init__(self, use_cache): + def __init__(self, use_cache=True): super().__init__(use_cache) def encode(self, sent, type_encode="other"): diff --git a/macropodus/summarize/graph_base/textrank.py b/macropodus/summarize/graph_base/textrank.py index 544ccc8..201f98d 100644 --- a/macropodus/summarize/graph_base/textrank.py +++ b/macropodus/summarize/graph_base/textrank.py @@ -8,8 +8,12 @@ from macropodus.summarize.graph_base.textrank_word2vec import TextrankWord2vec from macropodus.summarize.graph_base.textrank_gensim import TextrankGensimSum from macropodus.summarize.graph_base.textrank_sklearn import TextrankSklearn +import os - +# 词向量, 默认使用缓存 +use_cache = True +if not os.environ.get("macropodus_use_w2v_cache", True): + use_cache = False # 不使用缓存,重新加载 # textrank of gensim trgs = TextrankGensimSum() # textrank of word2vec diff --git a/test/survey_report/nlp_platfom_survey.md b/test/survey_report/nlp_platfom_survey.md index 8051bfc..1ca7623 100644 --- a/test/survey_report/nlp_platfom_survey.md +++ b/test/survey_report/nlp_platfom_survey.md @@ -89,3 +89,17 @@ PaddleNLP|c++|3.4k|6/1/!|是|是|是|是|是|是|是|是|是|是|Apache-2.0 * ik-analyzer:[https://github.com/wks/ik-analyzer](https://github.com/wks/ik-analyzer) * 
fnlp:[https://github.com/FudanNLP/fnlp](https://github.com/FudanNLP/fnlp) * NLPIR:[https://github.com/NLPIR-team/NLPIR](https://github.com/NLPIR-team/NLPIR) + +### +新词发现: +1. Matrix67: The Aha Moments的信息熵方法: [互联网时代的社会语言学:基于SNS的文本数据挖掘](http://www.matrix67.com/blog/archives/5044) + 1.词频、左右熵(丰度,字符组合左右邻字的丰富程度, -p*log(p))、 + 2.互信息(凝固度,内部凝聚程度, pmi = log(p(x,y)/(p(x)*p(y))))等构建得分函数 +2. HanLP的长短语构造方法: [基于互信息和左右信息熵的短语提取识别](https://www.hankcs.com/nlp/extraction-and-identification-of-mutual-information-about-the-phrase-based-on-information-entropy.html) + 1.切词(只统计词典),统计词语共现(一阶、二阶、三阶) + 2.左右熵、互信息。合并词典词语,构建短语 +3. SmoothNLP:["新词发现"算法探讨与优化-SmoothNLP](https://zhuanlan.zhihu.com/p/80385615) + 1.左右熵权重: Ew = log((El*e^Er+Er*e^El)/|Er-El|) + 2.平均互信息AMI:(1/n) * log(p(w)/(p(1)p(2)...p(n))) + 3.过滤条件:对在candidate ngram中, 首字或者尾字出现次数特别多的进行筛选, 如"XX的,美丽的,漂亮的"剔出字典 +