forked from ls1248659692/python_guide
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdistributed_spider.py
More file actions
65 lines (45 loc) · 1.54 KB
/
distributed_spider.py
File metadata and controls
65 lines (45 loc) · 1.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/usr/bin/python
# coding=utf8
import multiprocessing as mp
import re
import time
from urllib import urlopen
from bs4 import BeautifulSoup
from future.backports.urllib.parse import urljoin
# Module metadata.
__author__ = 'Jam'
__date__ = '2019/7/5 17:03'
# Root of the site being crawled: it is the first URL visited by run() and the
# base against which parse() resolves site-relative hrefs into absolute URLs.
base_url = 'https://morvanzhou.github.io/'
def crawl(url):
    """Download ``url`` and return its body as text.

    Args:
        url: Address of the page to fetch; passed straight to ``urlopen``.

    Returns:
        The response body decoded as UTF-8, with undecodable bytes ignored.

    Raises:
        Whatever ``urlopen`` or ``read`` raise on network/HTTP failure; the
        response is still closed in that case.
    """
    # Imported locally so this function needs no extra file-level import.
    from contextlib import closing

    # closing() releases the connection even if read() raises, and works
    # whether or not the response object is itself a context manager.
    with closing(urlopen(url)) as response:
        # Fixed one-second pause per request (presumably a politeness delay
        # towards the target site -- confirm before removing).
        time.sleep(1)
        return response.read().decode('utf-8', 'ignore')
def parse(html):
    """Extract the title, site-internal links and canonical URL from a page.

    Args:
        html: Page markup as a string.

    Returns:
        A ``(title, page_urls, url)`` tuple:

        * ``title`` -- stripped text of the first ``<h1>``, or ``''`` when the
          page has no ``<h1>``.
        * ``page_urls`` -- set of absolute URLs, built by joining ``base_url``
          with the ``href`` of every ``<a>`` whose ``href`` starts and ends
          with ``/``.
        * ``url`` -- the ``content`` attribute of the ``og:url`` meta tag, or
          ``''`` when the tag (or its ``content``) is missing.
    """
    soup = BeautifulSoup(html, 'lxml')

    # Only site-relative links of the form "/.../" are followed.
    anchors = soup.find_all('a', {"href": re.compile('^/.+?/$')})
    page_urls = {urljoin(base_url, anchor['href']) for anchor in anchors}

    # find() returns None when nothing matches. Fall back to an empty string
    # so one page lacking the tag does not abort the whole crawl.
    heading = soup.find('h1')
    title = heading.get_text().strip() if heading is not None else ''

    og_url = soup.find('meta', {'property': "og:url"})
    url = og_url.get('content', '') if og_url is not None else ''

    return title, page_urls, url
def run(max_seen=20, processes=4):
    """Crawl outward from ``base_url`` in rounds using a pool of processes.

    Each round fetches every URL in the unseen set with ``crawl``, parses the
    fetched pages with ``parse``, marks those URLs as seen, and queues every
    discovered link that has not been seen yet. The title and URL of each
    parsed page are printed, and the total elapsed time is printed at the end.

    Args:
        max_seen: Visit cap. It is checked at the start of each round, so the
            crawl stops before a round once more than ``max_seen`` URLs have
            been visited; the final count may exceed the cap.
        processes: Number of worker processes in the pool.

    Raises:
        Any exception raised by ``crawl`` or ``parse`` in a worker is
        re-raised here when its result is collected; the pool is shut down
        before the exception propagates.
    """
    unseen, seen = {base_url}, set()
    pool = mp.Pool(processes)
    start = time.time()
    try:
        while unseen and len(seen) <= max_seen:
            print('Distributed Crawling...')
            crawl_jobs = [pool.apply_async(crawl, args=(url,)) for url in unseen]
            htmls = [job.get() for job in crawl_jobs]

            print('Distributed Parsing...')
            parse_jobs = [pool.apply_async(parse, args=(html,)) for html in htmls]
            results = [job.get() for job in parse_jobs]

            print('Distributed Saving...')
            # Everything fetched this round is now visited; the unseen set is
            # rebuilt from the links found on those pages.
            seen.update(unseen)
            unseen.clear()

            for title, page_urls, url in results:
                print(title, url)
                unseen.update(page_urls - seen)

        print('Total time: %.1f s' % (time.time() - start,))
    finally:
        # On the normal path every result has already been collected, so
        # nothing is lost; on error this discards outstanding work instead of
        # leaving worker processes behind.
        pool.terminate()
        pool.join()
# Script entry point. run() creates a multiprocessing pool, so the crawl is
# started only when this file is executed directly and not when the module is
# imported (e.g. by worker processes on platforms that re-import the main
# module).
if __name__ == '__main__':
    run()