forked from HansiZeng/RIPOR
-
Notifications
You must be signed in to change notification settings - Fork 0
/
check.py
34 lines (29 loc) · 1.59 KB
/
check.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import ujson
input_file = 'bm25_tran_socre_for_title.json'  # Replace with your actual file path.
output_file = 'bm25_tran_socre_for_title_cleaned.json'
MIN_SCORES = 100  # Minimum number of scores a record needs in order to be kept.


def find_record_defects(data, line_no, min_scores=MIN_SCORES):
    """Return the defect messages for one decoded JSON record.

    A record is valid when it is a JSON object holding both 'scores' and
    'docids', 'scores' has at least ``min_scores`` entries, and the two
    fields have the same length.

    Args:
        data: The value decoded from one input line.
        line_no: 1-based line number, used only inside the messages.
        min_scores: Minimum required length of ``data['scores']``.

    Returns:
        A list of human-readable messages; empty when the record is valid.
    """
    # A syntactically valid JSON line may still be a number, string, list or
    # null; report it instead of letting the key lookups below raise.
    if not isinstance(data, dict):
        return [
            f"Expected a JSON object in line {line_no}, "
            f"got {type(data).__name__}"
        ]

    missing = [
        f"Missing '{key}' in line {line_no}"
        for key in ('scores', 'docids')
        if key not in data
    ]
    if missing:
        return missing

    try:
        n_scores = len(data['scores'])
        n_docids = len(data['docids'])
    except TypeError:
        # Scalars and null have no length.
        return [f"'scores' or 'docids' is not a sequence in line {line_no}"]

    if n_scores < min_scores:
        return [
            f"Insufficient scores in line {line_no}: {n_scores} scores found"
        ]
    if n_scores != n_docids:
        return [
            f"Mismatch in lengths for docids and scores in line {line_no}: "
            f"{n_docids} docids, {n_scores} scores"
        ]
    return []


def clean_scores_file(input_path, output_path, min_scores=MIN_SCORES):
    """Copy the valid JSON-lines records of one file into another.

    Each line of ``input_path`` is decoded on its own. Lines that are not
    valid JSON, or whose record fails ``find_record_defects``, are reported
    on stdout and skipped; every other record is written to ``output_path``
    as one JSON document per line.

    Args:
        input_path: Path of the JSON-lines file to read.
        output_path: Path of the cleaned file to write (overwritten).
        min_scores: Minimum number of scores required to keep a record.

    Returns:
        The number of records written to ``output_path``.
    """
    kept = 0
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(input_path, 'r', encoding='utf-8') as infile, \
            open(output_path, 'w', encoding='utf-8') as outfile:
        for line_no, line in enumerate(infile, start=1):
            try:
                data = ujson.loads(line)
            except ValueError as e:
                # ValueError rather than ujson.JSONDecodeError: the latter is
                # a ValueError subclass, and ujson releases that do not
                # define it raise a plain ValueError.
                print(f"Skipping invalid JSON in line {line_no}: {e}")
                continue

            defects = find_record_defects(data, line_no, min_scores)
            if defects:
                for message in defects:
                    print(message)
                continue

            # Save the valid record to the output file.
            ujson.dump(data, outfile)
            outfile.write('\n')
            kept += 1
    return kept


if __name__ == "__main__":
    clean_scores_file(input_file, output_file)
    print(f"Cleaned data saved to {output_file}")