{ "cells": [ { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "import requests\n", "import re\n", "\n", "class MyCrawler:\n", " def __init__(self, filename):\n", " self.filename = filename\n", " self.headers = {\n", " 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',\n", " }\n", " \n", " def download(self, url):\n", " r = requests.get(url, headers=self.headers)\n", " return r.text\n", " \n", " def extract(self, content, pattern):\n", " result = re.findall(pattern, content)\n", " return result\n", " \n", " def save(self, info):\n", " with open(self.filename, 'a', encoding='utf-8') as f:\n", " for item in info:\n", " f.write('|||'.join(item) + '\\n')\n", " \n", " def crawl(self, url, pattern, headers=None):\n", " if headers:\n", " self.headers.update(headers)\n", " content = self.download(url)\n", " info = self.extract(content, pattern)\n", " self.save(info)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "url = 'https://book.douban.com/tag/?view=type'\n", "content = douban_crawler.download(url)\n", "tree = html.fromstring(content)\n", "tags = tree.xpath(\"//td/a/text()\")" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'%E5%B0%8F%E8%AF%B4'" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "urllib.parse.quote(tags[0])" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Current tag: 小说\n", "https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=0&type=T\n", "Last Start ID: 7600\n", "活着\n", "房思琪的初恋乐园\n", "白夜行\n", "解忧杂货店\n", "红楼梦\n", "追风筝的人\n", "百年孤独\n", "小王子\n", "围城\n", "平凡的世界(全三部)\n", "嫌疑人X的献身\n", "霍乱时期的爱情\n", "1984\n", "飘\n", "月亮与六便士\n", "三体: “地球往事”三部曲之一\n", "三体全集: 地球往事三部曲\n", "局外人\n", "杀死一只知更鸟\n", "骆驼祥子\n", "------------------------------------\n", "Current tag: 外国文学\n", "https://book.douban.com/tag/%E5%A4%96%E5%9B%BD%E6%96%87%E5%AD%A6?start=0&type=T\n", "Last Start ID: 7640\n", "小王子\n", "追风筝的人\n", "百年孤独\n", "飘\n", "1984\n", "霍乱时期的爱情\n", "月亮与六便士\n", "月亮和六便士\n", "杀死一只知更鸟\n", "傲慢与偏见\n", "局外人\n", "动物农场\n", "安徒生童话故事集\n", "简爱(英文全本)\n", "老人与海\n", "基督山伯爵\n", "哈利•波特\n", "一个陌生女人的来信\n", "牧羊少年奇幻之旅\n", "肖申克的救赎\n", "------------------------------------\n", "Current tag: 文学\n", "https://book.douban.com/tag/%E6%96%87%E5%AD%A6?start=0&type=T\n", "Last Start ID: 7640\n", "你当像鸟飞往你的山\n", "房思琪的初恋乐园\n", "小王子\n", "红楼梦\n", "百年孤独\n", "追风筝的人\n", "围城\n", "活着\n", "平凡的世界(全三部)\n", "解忧杂货店\n", "撒哈拉的故事\n", "霍乱时期的爱情\n", "月亮和六便士\n", "1984\n", "边城\n", "局外人\n", "许三观卖血记\n", "白鹿原: 20周年精装典藏版\n", "沉默的大多数: 王小波杂文随笔全编\n", "云边有个小卖部\n", "------------------------------------\n", "Current tag: 经典\n", "https://book.douban.com/tag/%E7%BB%8F%E5%85%B8?start=0&type=T\n", "Last Start ID: 7820\n", "活着\n", "小王子\n", "红楼梦\n", "百年孤独\n", "围城\n", "飘\n", "平凡的世界(全三部)\n", "三体全集: 地球往事三部曲\n", "骆驼祥子\n", "月亮与六便士\n", "哈利•波特\n", "杀死一只知更鸟\n", "霍乱时期的爱情\n", "傲慢与偏见\n", "1984\n", "追风筝的人\n", "边城\n", "安徒生童话故事集\n", "围城\n", "白鹿原: 20周年精装典藏版\n", "------------------------------------\n", "Current tag: 中国文学\n", "https://book.douban.com/tag/%E4%B8%AD%E5%9B%BD%E6%96%87%E5%AD%A6?start=0&type=T\n", "Last Start ID: 7720\n", "活着\n", "围城\n", "平凡的世界(全三部)\n", "骆驼祥子\n", "边城\n", "城南旧事: 纪念普及版\n", "明朝那些事儿(1-9): 限量版\n", "撒哈拉的故事\n", "红楼梦\n", "白鹿原: 20周年精装典藏版\n", "许三观卖血记\n", "三体全集: 地球往事三部曲\n", "呐喊\n", "房思琪的初恋乐园\n", "平凡的世界\n", "围城\n", "沉默的大多数: 王小波杂文随笔全编\n", "许三观卖血记\n", "朝花夕拾\n", "人生海海\n", "------------------------------------\n" ] } ], "source": [ "import re\n", "import time\n", "import requests\n", "from lxml import html\n", "import urllib.parse\n", "\n", "douban_crawler = MyCrawler('douban.txt')\n", "\n", "tag_list_url = 'https://book.douban.com/tag/?view=type'\n", "tag_content = douban_crawler.download(tag_list_url)\n", "tag_tree = html.fromstring(tag_content)\n", "tags = tag_tree.xpath(\"//td/a/text()\")\n", "for tag in tags[:5]:\n", " print('Current tag:', tag)\n", " tag = urllib.parse.quote(tag)\n", " page_id = 1\n", " last_start = 0\n", " while 1:\n", " start_id = 20 * (page_id - 1)\n", " url = 'https://book.douban.com/tag/{}?start={}&type=T'.format(tag, start_id)\n", " print(url)\n", " content = douban_crawler.download(url)\n", " tree = html.fromstring(content)\n", " if page_id == 1:\n", " page_links = tree.xpath(\"//div[@class='paginator']/a[last()]/@href\")\n", " if page_links:\n", " last_start = int(re.findall('start=(\\d+)', page_links[0])[0])\n", " print('Last Start ID: ', last_start)\n", " book_infos = tree.xpath(\"//li[@class='subject-item']\")\n", " for book_info in book_infos:\n", " book_name_elem = book_info.xpath('.//h2/a')[0]\n", " book_name = re.sub('\\s{2,}', '', book_name_elem.text_content().replace('\\n', ''))\n", " book_url = book_name_elem.attrib['href']\n", " book_pub_info = book_info.xpath(\".//div[@class='pub']\")[0].text.strip()\n", " book_intro = 'N/A'\n", " book_intro_elem = book_info.xpath(\".//div[@class='info']/p\")\n", " if book_intro_elem:\n", " book_intro = book_intro_elem[0].text.strip()\n", " print(book_name)\n", " page_id += 1\n", " if start_id == last_start:\n", " break\n", " print('------------------------------------')\n", " break\n", " time.sleep(1)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "urls = [f'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start={start_id}&type=T' for start_id in range(0, 200, 20)]" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=0&type=T',\n", " 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=20&type=T',\n", " 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=40&type=T',\n", " 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=60&type=T',\n", " 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=80&type=T',\n", " 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=100&type=T',\n", " 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=120&type=T',\n", " 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=140&type=T',\n", " 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=160&type=T',\n", " 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=180&type=T']" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "urls" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=40&type=T' page is 54058 bytes\n", "'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=80&type=T' page is 52984 bytes\n", "'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=20&type=T' page is 52973 bytes\n", "'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=0&type=T' page is 52753 bytes\n", "'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=60&type=T' page is 52622 bytes\n", "'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=120&type=T' page is 53638 bytes\n", "'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=100&type=T' page is 52683 bytes\n", "'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=140&type=T' page is 54098 bytes\n", "'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=180&type=T' page is 53970 bytes\n", "'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=160&type=T' page is 53460 bytes\n", "Wall time: 1.11 s\n" ] } ], "source": [ "%%time\n", "\n", "import concurrent.futures\n", "import requests\n", "\n", "# URLS = ['http://www.163.com/',\n", "# 'http://www.sina.com.cn/',\n", "# 'http://baidu.com/',\n", "# 'http://youdao.com/',\n", "# 'http://bing.com/']\n", "\n", "douban_crawler = MyCrawler('douban.txt')\n", "\n", "# Retrieve a single page and report the URL and contents\n", "def load_url(url):\n", " global douban_crawler\n", " return douban_crawler.download(url)\n", "\n", "# We can use a with statement to ensure threads are cleaned up promptly\n", "with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:\n", " # Start the load operations and mark each future with its URL\n", " future_to_url = {executor.submit(load_url, url): url for url in urls}\n", " for future in concurrent.futures.as_completed(future_to_url):\n", " url = future_to_url[future]\n", " try:\n", " data = future.result()\n", " except Exception as exc:\n", " print('%r generated an exception: %s' % (url, exc))\n", " else:\n", " print('%r page is %d bytes' % (url, len(data)))" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=0&type=T' page is 52753 bytes\n", "'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=20&type=T' page is 52973 bytes\n", "'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=40&type=T' page is 54058 bytes\n", "'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=60&type=T' page is 52622 bytes\n", "'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=80&type=T' page is 52984 bytes\n", "'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=100&type=T' page is 52683 bytes\n", "'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=120&type=T' page is 53638 bytes\n", "'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=140&type=T' page is 54098 bytes\n", "'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=160&type=T' page is 53460 bytes\n", "'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=180&type=T' page is 53970 bytes\n", "Wall time: 2.69 s\n" ] } ], "source": [ "%%time\n", "\n", "import concurrent.futures\n", "\n", "# URLS = ['http://www.163.com/',\n", "# 'http://www.sina.com.cn/',\n", "# 'http://baidu.com/',\n", "# 'http://youdao.com/',\n", "# 'http://bing.com/']\n", "\n", "for url in urls:\n", " data = douban_crawler.download(url)\n", " print('%r page is %d bytes' % (url, len(data)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.2" } }, "nbformat": 4, "nbformat_minor": 4 }