fix find(newword)
yongzhuo committed Jun 3, 2020
1 parent 23ae5a7 commit 9de38c0
Showing 4 changed files with 127 additions and 65 deletions.
README.md: 1 addition & 1 deletion
@@ -202,7 +202,7 @@ summary = "PageRank算法简介。" \
"总的来说就是一句话,从全局角度考虑,获取重要的信。 "

# New word discovery (findword, default interface)
sents = macropodus.find(text=summary, freq_min=2, len_max=7, entropy_min=1.2, aggregation_min=0.5, use_avg=True)
sents = macropodus.find(text=summary, use_type="text", use_avg=False, use_filter=False, use_output=True, freq_min=2, len_max=5, entropy_min=2.0, aggregation_min=3.2)
print(sents)

```
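For reference, a minimal sketch of reading the result of the call above. The per-word keys ("a" aggregation/PMI, "r"/"l" right/left entropy, "f" frequency, "ns" word-liberalization score, "s" final score) are taken from `WordDiscovery.find_word` in this commit; that `macropodus.find` returns them unchanged is an assumption here.

```python
import macropodus

summary = "四川发文取缔全部不合规p2p。字节跳动与今日头条。成都日报,成都市,李太白与杜甫。"
sents = macropodus.find(text=summary, use_type="text", use_avg=False, use_filter=False,
                        use_output=True, freq_min=2, len_max=5, entropy_min=2.0,
                        aggregation_min=3.2)
for word, stats in sents.items():
    # stats: {"a": PMI, "r": right entropy, "l": left entropy,
    #         "f": frequency, "ns": word-liberalization score, "s": final score}
    print(word, stats["s"])
```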
macropodus/preprocess/tools_common.py: 2 additions & 2 deletions
@@ -64,15 +64,15 @@ def txt_write(list_line, file_path, type='w', encode_type='utf-8'):
logger.info(str(e))


def save_json(json_lines, json_path, encoding='utf-8'):
def save_json(json_lines, json_path, encoding='utf-8', indent=4):
"""
保存json,
:param json_lines: json
:param path: str
:return: None
"""
with open(json_path, 'w', encoding=encoding) as fj:
fj.write(json.dumps(json_lines, ensure_ascii=False))
fj.write(json.dumps(json_lines, ensure_ascii=False, indent=indent))
fj.close()


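A small usage sketch for the updated `save_json` helper: the new `indent=4` default pretty-prints the file, and passing `indent=None` falls back to the previous compact single-line output of `json.dumps`. The file name and data below are illustrative only.

```python
from macropodus.preprocess.tools_common import save_json

new_words = {"字节跳动": {"f": 3, "s": 1.27}}
save_json(new_words, "new_words.json")               # indented, human-readable output
save_json(new_words, "new_words.json", indent=None)  # compact output, as before this commit
```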
macropodus/segment/word_discovery/word_discovery.py: 121 additions & 62 deletions
@@ -5,19 +5,28 @@
# @function: chinese word discovery


from macropodus.data.words_common.stop_words import stop_words
from macropodus.preprocess.tools_ml import cut_sentence
from macropodus.preprocess.tools_ml import get_ngrams
from collections import Counter
from collections import Counter, OrderedDict
from functools import reduce
from operator import mul
import math
import os


class WordDiscovery:
def __init__(self):
from macropodus.segment import segs
self.dict_words_freq = segs.dict_words_freq
self.algorithm = "new-word-discovery"
self.stop_words = stop_words
self.total_words_len = {}
self.total_words = 0
self.freq_min = 3
self.len_max = 7
self.eps = 1e-9
self.empty_words = [sw for sw in stop_words.values() if len(sw)==1] # function words (single-character stop words)

def count_word(self, text, use_type="text"):
"""
@@ -61,39 +70,51 @@ def calculate_entropy(self, boundary_type="left"):
"""
# get the left-most / right-most boundary character of each candidate word
one_collect = {}
self.total_words_len = {}
for k, v in self.words_count.items():
len_k = len(k)
if len_k >= 3: # word length of at least 3
if len_k >= 2: # word length of at least 2
if boundary_type == "right":
k_boundary = k[:-1]
else:
k_boundary = k[1:]
if k_boundary in self.words_select: # 左右边, 保存为dict
# left/right boundary, stored as a dict (left/right adjacency richness)
if k_boundary in self.words_count:
if k_boundary not in one_collect:
one_collect[k_boundary] = [v]
else:
one_collect[k_boundary] = one_collect[k_boundary] + [v]
# accumulate total frequency per n-gram length
if len_k not in self.total_words_len:
self.total_words_len[len_k] = [v]
else:
self.total_words_len[len_k] += [v]
self.total_words_len = dict([(k, sum(v)) for k,v in self.total_words_len.items()])

# compute mutual information of candidate words
# compute left/right entropy
for k, v in self.words_select.items():
# look up from the dict
boundary_v = one_collect.get(k, None)
# compute the boundary entropy of the candidate word
if boundary_v:
sum_boundary = sum(boundary_v) # sum
# sum of boundary frequencies
sum_boundary = sum(boundary_v)
# information entropy (base 2)
entroy_boundary = sum([-(enum_bo / sum_boundary) * math.log(enum_bo / sum_boundary)
entroy_boundary = sum([-(enum_bo / sum_boundary) * math.log(enum_bo / sum_boundary, 2)
for enum_bo in boundary_v])
else:
entroy_boundary = 0.0
# penalize words that start or end with a function word
if (k[0] in self.empty_words or k[-1] in self.empty_words):
entroy_boundary = entroy_boundary / len(k)
if boundary_type == "right":
self.right_entropy[k] = entroy_boundary
else:
self.left_entropy[k] = entroy_boundary
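As a standalone illustration of the boundary entropy computed above (base-2 Shannon entropy over the frequencies of the characters that appear next to a candidate word), with made-up neighbor counts:

```python
import math

def boundary_entropy(neighbor_freqs):
    # neighbor_freqs: frequencies of the candidate extended by one character
    # on one side, e.g. counts of "X今日头条" (left) or "今日头条X" (right)
    total = sum(neighbor_freqs)
    return sum(-(f / total) * math.log(f / total, 2) for f in neighbor_freqs)

print(boundary_entropy([3, 2, 1]))  # ~1.46 bits; a freer boundary gives higher entropy
```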

def compute_entropys(self):
"""
compute aggregation (cohesion)
compute left/right entropy
:param words_count: dict, like {"我":32, "你们":12}
:param len_max: int, like 6
:param freq_min: int, like 32
@@ -109,37 +130,37 @@ def compute_entropys(self):
self.left_entropy = {}
self.calculate_entropy(boundary_type="left")
self.calculate_entropy(boundary_type="right")
# self.words_count.clear() # clear the counter

def compute_aggregation(self):
"""
Compute aggregation (PMI-based cohesion).
:return: None
"""
twl_1 = self.total_words_len[1] # total frequency of all unigrams (ngram=1)
self.aggregation = {}
for word, value in self.words_select.items():
len_word = len(word)
score_aggs = []
for i in range(1, len_word): # split the candidate word into left/right parts at each position
word_right = word[i:]
word_left = word[:i]
value_right = self.words_select.get(word_right, self.freq_min)
value_left = self.words_select.get(word_left, self.freq_min)
# score_agg_single = math.log(value) - math.log(value_right * value_left)
score_agg_single = value / (value_right * value_left)
# score_agg_single = math.log10(value) - math.log10(self.total_words) -math.log10((value_right * value_left))
score_aggs.append(score_agg_single)
self.aggregation[word] = min(score_aggs)

def find_word(self, text, use_type="text", freq_min=2, len_max=7, entropy_min=1.2, aggregation_min=0.5, use_avg=False):
twl_n = self.total_words_len[len_word] # total frequency of all n-grams of this length
words_freq = [self.words_count.get(wd, 1) for wd in word]
probability_word = value / twl_n
probability_chars = reduce(mul,([wf for wf in words_freq])) / (twl_1**(len(word)))
pmi = math.log(probability_word / probability_chars, 2)
# AMI = PMI / word_length; penalize function words (avoid candidates starting or ending with "的", "得", "了")
self.aggregation[word] = pmi/(len_word**len_word) if (word[0] in self.empty_words or word[-1] in self.empty_words) \
else pmi/len_word # pmi / len_word / len_word
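A hedged, standalone sketch of the length-normalized PMI (AMI) that replaces the old ratio-based aggregation above: the word's probability within its own n-gram length bucket is compared against the product of its characters' unigram probabilities, then divided by the word length, with a much heavier penalty when the word starts or ends with a function word. The numbers below are illustrative only.

```python
import math
from functools import reduce
from operator import mul

def ami(word, word_freq, char_freqs, total_ngrams_n, total_unigrams, empty_words=("的", "了", "得")):
    p_word = word_freq / total_ngrams_n                                # P(word) within its length bucket
    p_chars = reduce(mul, char_freqs) / (total_unigrams ** len(word))  # product of unigram probabilities
    pmi = math.log(p_word / p_chars, 2)
    n = len(word)
    # heavier penalty if the candidate starts/ends with a function word
    return pmi / (n ** n) if (word[0] in empty_words or word[-1] in empty_words) else pmi / n

print(ami("头条", word_freq=3, char_freqs=[5, 4], total_ngrams_n=200, total_unigrams=300))
```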

def find_word(self, text, use_type="text", freq_min=2, len_max=5, entropy_min=2.0, aggregation_min=3.2,
use_output=True, use_avg=False, use_filter=False):
"""
New-word discovery and filtering strategy
:param text: str, path or doc, like "大漠帝国。" or "/home/data/doc.txt"
:param use_type: str, "text" or "file", file of "utf-8" of "txt"
:param use_type: str, input format (text or file), "text" or "file"; a file must be utf-8 txt
:param use_output: bool, output mode, i.e. whether to return every candidate without threshold filtering
:param use_filter: bool, new-word filtering, i.e. whether to drop words already in the macropodus dictionary or in the stop words
:param freq_min: int, minimum word frequency, greater than 1
:param len_max: int, maximum word length, usually 5, 6 or 7
:param entropy_min: int, minimum word frequency, greater than 1
:param aggregation_min: int, maximum word length, usually 5, 6 or 7
:param entropy_min: float, left/right entropy threshold; candidates below it are filtered out
:param aggregation_min: float, PMI (aggregation) threshold; candidates below it are filtered out
:return:
"""
self.aggregation_min = aggregation_min
@@ -150,68 +171,106 @@ def find_word(self, text, use_type="text", freq_min=2, len_max=7, entropy_min=1.
self.compute_entropys()
self.compute_aggregation()
self.new_words = {}
lambda_3 = lambda m1, m2: math.log((m1 * math.e ** m2 + m2 * math.e ** m1 + self.eps) / (abs(m1 - m2) + 1), 10)
lambda_0 = lambda x: -self.eps * x + self.eps if x <= 0 else x
# output
for word,value in self.words_select.items():
if not use_avg and self.aggregation[word] > self.aggregation_min \
for word, value in self.words_select.items():
if use_filter and word in self.dict_words_freq:
continue
if word in self.stop_words:
continue
if use_output:
# {"aggregation":"agg", "right_entropy":"r", "left_entropy":"l", "frequency":"f", "score":"s"}
self.new_words[word] = {}
# math.log10(self.aggregation[word]) - math.log10(self.total_words)
self.new_words[word]["a"] = self.aggregation[word]
self.new_words[word]["r"] = self.right_entropy[word]
self.new_words[word]["l"] = self.left_entropy[word]
self.new_words[word]["f"] = value
# word-liberalization
m1 = lambda_0(self.right_entropy[word])
m2 = lambda_0(self.left_entropy[word])
m3 = lambda_0(self.aggregation[word])
score_3 = lambda_0((lambda_3(m1, m2) + lambda_3(m1, m3) + lambda_3(m2, m3)) / 3)
self.new_words[word]["ns"] = score_3
# multiplying by freq alone is not that effective; the chained product guards against very small terms
# self.new_words[word]["s"] = self.new_words[word]["f"] * self.new_words[word]["a"] * \
# self.right_entropy[word] * self.left_entropy[word]
self.new_words[word]["s"] = self.new_words[word]["f"] * self.new_words[word]["a"] * \
self.right_entropy[word] * self.left_entropy[word] * score_3

elif not use_avg and self.aggregation[word] > self.aggregation_min \
and self.right_entropy[word] > self.entropy_min and self.left_entropy[word] > self.entropy_min:
self.new_words[word] = {}
# {"aggregation":"agg", "right_entropy":"r", "left_entropy":"l", "frequency":"f", "score":"s"}
self.new_words[word]["a"] = self.aggregation[word] # math.log10(self.aggregation[word]) - math.log10(self.total_words)
self.new_words[word]["r"] = self.right_entropy[word]
self.new_words[word]["l"] = self.left_entropy[word]
self.new_words[word]["f"] = value / self.total_words
self.new_words[word]["f"] = value
# word-liberalization
m1 = lambda_0(self.right_entropy[word])
m2 = lambda_0(self.left_entropy[word])
m3 = lambda_0(self.aggregation[word])
score_3 = lambda_0((lambda_3(m1, m2) + lambda_3(m1, m3) + lambda_3(m2, m3)) / 3)
self.new_words[word]["ns"] = score_3
self.new_words[word]["s"] = self.new_words[word]["f"] * self.new_words[word]["a"] * \
(self.right_entropy[word] + self.left_entropy[word])
(self.right_entropy[word] + self.left_entropy[word])/2 * score_3
elif use_avg and self.aggregation[word] > self.aggregation_min \
and (self.right_entropy[word] + self.left_entropy[word]) > 2 * self.entropy_min:
self.new_words[word] = {}
# {"aggregation":"agg", "right_entropy":"r", "left_entropy":"l", "frequency":"f", "score":"s"}
self.new_words[word]["a"] = self.aggregation[word]
self.new_words[word]["r"] = self.right_entropy[word]
self.new_words[word]["l"] = self.left_entropy[word]
self.new_words[word]["f"] = value / self.total_words
self.new_words[word]["s"] = self.new_words[word]["f"] * self.new_words[word]["a"] * \
(self.right_entropy[word] + self.left_entropy[word])

self.new_words[word]["f"] = value
# word-liberalization
m1 = lambda_0(self.right_entropy[word])
m2 = lambda_0(self.left_entropy[word])
m3 = lambda_0(self.aggregation[word])
score_3 = lambda_0((lambda_3(m1, m2) + lambda_3(m1, m3) + lambda_3(m2, m3)) / 3)
self.new_words[word]["ns"] = score_3
self.new_words[word]["s"] = self.new_words[word]["a"] * (self.right_entropy[word] + self.left_entropy[word])/2
# multiply in the word-liberalization factor
self.new_words[word]["s"] *= score_3
# sort candidates by final score
new_words = sorted(self.new_words.items(), key=lambda x:x[1]["s"], reverse=True)
self.new_words = OrderedDict(new_words)
return self.new_words
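For clarity, the new word-liberalization score ("ns") combines the three clipped statistics (right entropy, left entropy, PMI) pairwise. A standalone sketch of the two lambdas above written as named functions, with illustrative inputs:

```python
import math

EPS = 1e-9

def clip_pos(x):
    # counterpart of lambda_0 above: force values to be strictly positive
    return -EPS * x + EPS if x <= 0 else x

def pair_term(m1, m2):
    # counterpart of lambda_3 above: symmetric pair score, damped by |m1 - m2|
    return math.log((m1 * math.e ** m2 + m2 * math.e ** m1 + EPS) / (abs(m1 - m2) + 1), 10)

def liberalization_score(right_entropy, left_entropy, aggregation):
    m1, m2, m3 = clip_pos(right_entropy), clip_pos(left_entropy), clip_pos(aggregation)
    return clip_pos((pair_term(m1, m2) + pair_term(m1, m3) + pair_term(m2, m3)) / 3)

# e.g. a candidate with entropies around 2 bits and PMI around 3.5
print(liberalization_score(2.3, 2.1, 3.5))
```

The final score "s" then multiplies frequency, PMI, the boundary entropies and this factor (the use_avg branch averages the two entropies instead).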


if __name__ == '__main__':
text = "PageRank算法简介。" \
"是上世纪90年代末提出的一种计算网页权重的算法! " \
"当时,互联网技术突飞猛进,各种网页网站爆炸式增长。 " \
"业界急需一种相对比较准确的网页重要性计算方法。 " \
"是人们能够从海量互联网世界中找出自己需要的信息。 " \
"百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。 " \
"Google把从A页面到B页面的链接解释为A页面给B页面投票。 " \
"Google根据投票来源甚至来源的来源,即链接到A页面的页面。 " \
"和投票目标的等级来决定新的等级。简单的说, " \
"一个高等级的页面可以使其他低等级页面的等级提升。 " \
"具体说来就是,PageRank有两个基本思想,也可以说是假设。 " \
"即数量假设:一个网页被越多的其他页面链接,就越重)。 " \
"质量假设:一个网页越是被高质量的网页链接,就越重要。 " \
"总的来说就是一句话,从全局角度考虑,获取重要的信。 "
# wc = count_word(text)
# path = "data/poet_tangsong.csv"
# wd = WordDiscovery()
# res = wd.find_word(text=path, use_type="file", freq_min=2, len_max=6, entropy_min=1.2, aggregation_min=0.4)
# from macropodus.preprocess.tools_common import txt_write
# import json
# res_s = json.dumps(res)
# txt_write([res_s], "res_s.txt")
# print(res)
# with open("res_s.txt", "r", encoding="utf-8") as fd:
# ff = fd.readlines()[0]
# res_ = json.loads(ff)
# res_soft = sorted(res_.items(), key=lambda d: d[1]['score'], reverse=True)

from macropodus.preprocess.tools_common import save_json, load_json, txt_write, txt_read

summary = "四川发文取缔全部不合规p2p。字节跳动与今日头条。成都日报,成都市,李太白与杜甫" \
"PageRank算法简介。" \
"是上世纪90年代末提出的一种计算网页权重的算法! " \
"当时,互联网技术突飞猛进,各种网页网站爆炸式增长。 " \
"业界急需一种相对比较准确的网页重要性计算方法。 " \
"是人们能够从海量互联网世界中找出自己需要的信息。 " \
"百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。 " \
"Google把从A页面到B页面的链接解释为A页面给B页面投票。 " \
"Google根据投票来源甚至来源的来源,即链接到A页面的页面。 " \
"和投票目标的等级来决定新的等级。简单的说, " \
"一个高等级的页面可以使其他低等级页面的等级提升。 " \
"具体说来就是,PageRank有两个基本思想,也可以说是假设。 " \
"即数量假设:一个网页被越多的其他页面链接,就越重)。 " \
"质量假设:一个网页越是被高质量的网页链接,就越重要。 " \
"总的来说就是一句话,从全局角度考虑,获取重要的信。 "

# new word discovery from text
wd = WordDiscovery()
res = wd.find_word(text=text, use_type="text", use_avg=True, freq_min=2, len_max=7, entropy_min=0.4, aggregation_min=1.2)
res = wd.find_word(text=summary, use_type="text", use_avg=False, use_filter=False, use_output=True,
freq_min=2, len_max=5, entropy_min=2.0, aggregation_min=3.2)
for k, v in res.items():
print(k, v)
print("\n#################\n")

while True:
print("请输入:")
ques = input()
res = wd.find_word(text=ques, use_type="text", use_avg=True, freq_min=2, len_max=7, entropy_min=0.52, aggregation_min=1.2)
res = wd.find_word(text=ques, use_type="text", use_avg=False, use_filter=False, use_output=True,
freq_min=2, len_max=5, entropy_min=2.0, aggregation_min=3.2)
for k, v in res.items():
print(k, v)
# gg = 0
# ms = 0
test/evaluate/tet_macropodus.py: 3 additions & 0 deletions
@@ -4,6 +4,9 @@
# @author : Mo
# @function: test macropodus

# import os
# os.environ['TF_KERAS'] = '1'


import time
time_start = time.time()
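The commented-out lines added above hint at setting TF_KERAS before anything else is imported; presumably this switches macropodus' Keras-based models to tf.keras, though that is an assumption here, not something this commit states. A hedged sketch:

```python
# assumption: TF_KERAS must be set before the first macropodus import to take effect
import os
os.environ['TF_KERAS'] = '1'

import macropodus
# default parameters assumed for the remaining find() arguments
print(macropodus.find(text="四川发文取缔全部不合规p2p。字节跳动与今日头条。"))
```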
