-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathgetUserData.py
152 lines (139 loc) · 3.74 KB
/
getUserData.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import sqlite3
import requests
# conn = sqlite3.connect('userData.sqlite')
# print "Opened database successfully"
# c = conn.cursor()
# c.execute('''CREATE TABLE COMPANY
# (
# ID INT NOT NULL,
# DOUYIN_ID CHAR(20),
# NAME CHAR(20),
# SIGNA CHAR(120),
# LOCATION CHAR(10),
# CONSTELLATION CHAR(5),
# FOCUS INT,
# FOLLOWER INT,
# Liked INT,
# VIDEO INT,
# like INT
# );''')
# print "Table created successfully"
# conn.commit()
# conn.close()
# 抖音密码字典
mapList = {
'xe60d': '0',
'xe616': '0',
'xe610': '2',
'xe61a': '3',
'xe618': '1',
'xe611': '3',
'xe60e': '1',
'xe602': '1',
'xe605': '2',
'xe617': '2',
'xe613': '7',
'xe604': '3',
'xe60c': '4',
'xe619': '4',
'xe606': '4',
'xe607': '5',
'xe61b': '5',
'xe60f': '5',
'xe608': '6',
'xe61f': '6',
'xe612': '6',
'xe603': '0',
'xe61c': '7',
'xe60a': '7',
'xe60b': '8',
'xe615': '9',
'xe614': '8',
'xe61d': '8',
'xe61e': '9',
'xe609': '9',
'.': '.',
'w': 'w'
}
# 截取字符串
def subString(str, start, end, startInd = 0):
startIndex = str.find(start.decode('utf-8'), startInd)
if (startIndex == -1): return ''
# print(startIndex)
endIndex = str.find(end.decode('utf-8'), startIndex + 1)
if (endIndex == -1): return ''
# print(endIndex)
# print(str[startIndex + len(start):endIndex])
return str[startIndex + len(start.decode('utf-8')):endIndex]
# 截取字符串组
def subStringArr(str, start, end):
arr = []
nextIndex = 0
while True:
nextIndex = str.find(start.decode('utf-8'), nextIndex + 1)
if nextIndex == -1:
return arr
temp = subString(str, start, end, nextIndex)
if str == '':
return arr
arr.append(temp)
# print start.decode('utf-8')
# print nextIndex
# endIndex = str.index(end.decode('utf-8'), startIndex + 1)
# print(endIndex)
## 字体解码方法
def fontDecrypt(arr):
str = ''
for item in arr:
if (item in mapList):
str += mapList[item]
else:
print item
return str
## 数字字体特殊处理
def subFontHtml(str, start):
html = subString(res.text, start, '<span class="text"')
# 针对小数点做特殊处理
html = html.replace('.', '&#.;')
# 针对万做特殊处理
html = html.replace('</i>w </span>', '&#w;')
font = subStringArr(html, '&#', ';')
return fontDecrypt(font)
# 抖音ID特殊处理
def subDouyinID(str):
str = subString(str, '抖音ID: ', '</p>')
# 清洗掉所有空格
str = str.replace(' ', '')
# 清洗出文本
str = str.replace('<iclass="iconiconfont">', '')
str = str.replace('</i>', '')
# 将字体清洗为字符串
for fontKey in mapList:
str = str.replace('&#' + fontKey + ';', mapList[fontKey])
return str
# 没有headers会返回页面不存在
headers = {
"user-agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
}
res = requests.get('https://www.douyin.com/share/user/6796248446', headers = headers)
res.encoding = 'utf-8'
# print(res.text)
# 名字
print(subString(res.text, 'class="nickname">', '</p>'))
# 抖音ID
print(subDouyinID(res.text))
# 签名
# print(subString(res.text, '"signature">', '</p>'))
# 城市
print(subString(res.text, '"location">', '</span>'))
# 星座
print(subString(res.text, '"constellation">', '</span>'))
# 关注
print(subFontHtml(res.text, 'class="focus block">'))
# 粉丝
print(subFontHtml(res.text, 'class="follower block">'))
# 粉丝
print(subFontHtml(res.text, 'class="liked-num block">'))
# 作品