forked from whnet/ZhihuSpyder
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDataPersistent.py
More file actions
221 lines (195 loc) · 9.96 KB
/
DataPersistent.py
File metadata and controls
221 lines (195 loc) · 9.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
import threading
import logging
import time
from Core.Logger import log
# 知乎用户信息字段
# 用户头像
USER_AVATAR_URL_TEMPLATE = 'avatarUrlTemplate'
# 用户标识
USER_URL_TOKEN = 'urlToken'
# 用户名
USER_NAME = 'name'
# 用户自我介绍
USER_HEADLINE = 'headline'
# 用户居住地
USER_LOCATIONS = 'locations'
# 用户所在行业
USER_BUSINESS = 'business'
# 用户职业经历
USER_EMPLOYMENTS = 'employments'
# 用户教育经历
USER_EDUCATIONS = 'educations'
# 用户个人描述
USER_DESCRIPTION = 'description'
# 用户性别
USER_GENDER = 'gender'
# 正在关注用户的数目
USER_FOLLOWING_COUNT = 'followingCount'
# 关注者的数目
USER_FOLLOWER_COUNT = 'followerCount'
# 该用户回答问题的数目
USER_ANSWER_COUNT = 'answerCount'
# 用户提问数目
USER_QUESTION_COUNT = 'questionCount'
# 用户获得赞同的数目
USER_VOTE_UP_COUNT = 'voteupCount'
# 关注关系字段
# 关注者
FOLLOW_FROM = 'followFrom'
# 被关注者
FOLLOW_TO = 'followTo'
# 负责用户数据的持久化
class DataPersistent:
__slots__ = ('persistent_cache_size', 'db_connection', 'redis_connection', 'persistent_thread',
'follow_relation_persistent_cache_size')
# 初始化
def __init__(self, persistent_cache_size, follow_relation_persistent_cache_size, db_connection, redis_connection):
# 设置用户信息数据持久化缓存大小
self.persistent_cache_size = persistent_cache_size
# 设置用户关注关系持久化缓存大小
self.follow_relation_persistent_cache_size = follow_relation_persistent_cache_size
# 设置数据库连接
self.db_connection = db_connection
# 设置Redis连接
self.redis_connection = redis_connection
# 创建数据库持久化线程
self.persistent_thread = PersistentThread(self.db_connection,
self.redis_connection,
self.persistent_cache_size,
self.follow_relation_persistent_cache_size)
if log.isEnabledFor(logging.INFO):
log.info('DataPersistent 模块初始化完毕')
def get_current_user_info_num(self):
return self.persistent_thread.get_current_user_info_num()
# 启动DataPersistent模块
def start_data_persistent(self):
# 启动线程
self.persistent_thread.start()
if log.isEnabledFor(logging.INFO):
log.info('DataPersistent 模块启动成功')
# 线程异常检测并重启
def check_and_restart(self):
if self.persistent_thread.thread_status == 'error':
self.persistent_thread = PersistentThread(self.db_connection, self.redis_connection,
self.persistent_cache_size,
self.follow_relation_persistent_cache_size)
self.persistent_thread.start()
if log.isEnabledFor(logging.INFO):
log.info('DataPersistent模块持久化线程中重新启动')
# 用户信息数据插入SQL语句
INSERT_USER_INFO = 'insert ignore into user_info(user_avator_url, user_token, user_name, user_headline, ' \
'user_location, user_business, user_employments, user_educations, user_description, ' \
'user_gender, user_following_count, user_follower_count, user_answer_count, ' \
'user_question_count, user_voteup_count) values(%s,%s,%s,%s,%s,%s,%s,%s,' \
'%s,%s,%s,%s,%s,%s,%s)'
# 用户关注关系数据插入SQL语句
INSERT_FOLLOW_RELATION = 'insert ignore into follow_relation(follow_from, follow_in) values(%s, %s)'
# 用户信息数量查询语句
COUNT_USER_INFO = 'select count(*) from user_info'
class PersistentThread(threading.Thread):
def __init__(self, db_connection, redis_connection, persistent_cache_size, follow_relation_persistent_cache_size):
threading.Thread.__init__(self)
# 设置数据库连接
self.db_connection = db_connection
# 设置Redis连接
self.redis_connection = redis_connection
# 设置用户信息缓存大小
self.persistent_cache_size = persistent_cache_size
# 设置关注关系缓存大小
self.follow_relation_persistent_cache_size = follow_relation_persistent_cache_size
# 数据持久化缓存队列名称
self.persistent_cache = 'persistentCache'
# 关注关系持久化缓存队列名称
self.follow_relation_persistent_cache = 'followRelationPersistentCache'
# 线程状态
self.thread_status = 'working'
# Operation Lock
self.lock = threading.Lock()
def get_current_user_info_num(self):
self.lock.acquire()
# 获取存放在数据库中的用户数量
cursor = self.db_connection.cursor()
cursor.execute(COUNT_USER_INFO)
data = cursor.fetchone()
cursor.close()
user_num_db = data[0]
# 获取缓存队列中的用户数量
user_num_cache = self.redis_connection.llen(self.persistent_cache)
self.lock.release()
return user_num_cache + user_num_db
def run(self):
debug_info = None
try:
while True:
# 持久化用户信息
current_user_info_cache_size = self.redis_connection.llen(self.persistent_cache)
if current_user_info_cache_size >= self.persistent_cache_size:
self.lock.acquire()
cursor = self.db_connection.cursor()
for i in range(current_user_info_cache_size):
user_info = self.redis_connection.lpop(self.persistent_cache)
debug_info = user_info
if user_info is not None:
user_info = self.convert_user_info(eval(user_info.decode('utf-8')))
cursor.execute(INSERT_USER_INFO, [user_info[USER_AVATAR_URL_TEMPLATE],
user_info[USER_URL_TOKEN],
user_info[USER_NAME],
user_info[USER_HEADLINE],
user_info[USER_LOCATIONS],
user_info[USER_BUSINESS],
user_info[USER_EMPLOYMENTS],
user_info[USER_EDUCATIONS],
user_info[USER_DESCRIPTION],
user_info[USER_GENDER],
user_info[USER_FOLLOWING_COUNT],
user_info[USER_FOLLOWER_COUNT],
user_info[USER_ANSWER_COUNT],
user_info[USER_QUESTION_COUNT],
user_info[USER_VOTE_UP_COUNT]])
self.db_connection.commit()
cursor.close()
self.lock.release()
# 持久化关注关系
current_follow_relation_cache_size = self.redis_connection.llen(self.follow_relation_persistent_cache)
if current_follow_relation_cache_size >= self.follow_relation_persistent_cache_size:
self.lock.acquire()
cursor = self.db_connection.cursor()
for i in range(current_follow_relation_cache_size):
follow_relation = self.redis_connection.lpop(self.follow_relation_persistent_cache)
debug_info = follow_relation
if follow_relation is not None:
follow_relation = eval(follow_relation.decode('utf-8'))
cursor.execute(INSERT_FOLLOW_RELATION, [follow_relation[FOLLOW_FROM],
follow_relation[FOLLOW_TO]])
self.db_connection.commit()
cursor.close()
self.lock.release()
# 检查时间间隔
time.sleep(180)
except Exception as e:
if log.isEnabledFor(logging.ERROR):
log.error('用户数据持久化线程异常退出')
log.exception(e)
log.debug(debug_info)
self.thread_status = 'error'
# 将用户数据转换为适合数据库存储的格式
@staticmethod
def convert_user_info(user_info):
# 将居住地转换为‘;’分隔的字符串
locations_string = ';'.join(str(x) for x in user_info[USER_LOCATIONS])
user_info[USER_LOCATIONS] = locations_string
# 将职业经历转换为‘XXX(XXX)’,并以‘;’ 分隔的字符串
employments_list = []
for employment in user_info[USER_EMPLOYMENTS]:
temp = ''
if 'company' in employment:
temp += str(employment['company'])
if 'job' in employment:
temp += '-' + str(employment['job'])
employments_list.append(temp)
employments_string = ';'.join(str(x) for x in employments_list)
user_info[USER_EMPLOYMENTS] = employments_string
# 将教育经历转换为‘;’分隔的字符串
educations_string = ';'.join(str(x) for x in user_info[USER_EDUCATIONS])
user_info[USER_EDUCATIONS] = educations_string
return user_info