{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "#### 爬虫基类" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "import requests\n", "import re\n", "\n", "class MyCrawler:\n", " def __init__(self, filename):\n", " self.filename = filename\n", " self.headers = {\n", " 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',\n", " }\n", " \n", " def download(self, url):\n", " r = requests.get(url, headers=self.headers)\n", " return r.text\n", " \n", " def extract(self, content, pattern):\n", " result = re.findall(pattern, content)\n", " return result\n", " \n", " def save(self, info):\n", " with open(self.filename, 'a', encoding='utf-8') as f:\n", " for item in info:\n", " f.write('|||'.join(item) + '\\n')\n", " \n", " def crawl(self, url, pattern, headers=None):\n", " if headers:\n", " self.headers.update(headers)\n", " content = self.download(url)\n", " info = self.extract(content, pattern)\n", " self.save(info)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import requests\n", "from lxml import html" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collecting package metadata (current_repodata.json): ...working... done\n", "Solving environment: ...working... done\n", "\n", "# All requested packages already installed.\n", "\n" ] } ], "source": [ "!conda install requests" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "douban_crawler = MyCrawler('douban.txt')" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "content = douban_crawler.download('https://book.douban.com/tag/%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C')" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "tree = html.fromstring(content)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "book_names = tree.xpath('//h2/a')" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'神经网络与深度学习'" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "book_names[0].text.strip()" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "xpath_str = " ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['神经网络与深度学习',\n", " 'Python神经网络编程',\n", " 'Python深度学习',\n", " '深度学习入门',\n", " '深度学习的数学',\n", " '数字思维',\n", " '深入浅出图神经网络:GNN原理解析',\n", " 'Neural Networks and Deep Learning',\n", " '神经网络设计(原书第2版)',\n", " '神经网络在应用科学和工程中的应用',\n", " 'Make Your Own Neural Network',\n", " '图解深度学习与神经网络:从张量到TensorFlow实现',\n", " 'MATLAB神经网络43个案例分析',\n", " 'Neural Networks and Learning Machines',\n", " '神经网络原理(原书第2版)',\n", " '连接组:造就独一无二的你',\n", " '神经网络控制',\n", " 'Neural Networks and Statistical Learning',\n", " '意识的宇宙',\n", " 'Unsupervised Learning']" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "list(map(lambda x: x.text.strip(), tree.xpath(\"//h2/a\")))" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['邱锡鹏 / 机械工业出版社 / 2020-4-10 / 149.00元',\n", " '[英]塔里克·拉希德(Tariq Rashid) / 林赐 / 人民邮电出版社 / 2018-4 / 69.00元',\n", " '[美] 弗朗索瓦•肖莱 / 张亮 / 人民邮电出版社 / 2018-8 / 119.00元',\n", " '[ 日] 斋藤康毅 / 陆宇杰 / 人民邮电出版社 / 2018-7 / 59.00元',\n", " '[日]涌井良幸、[日]涌井贞美 / 杨瑞龙 / 人民邮电出版社 / 2019-4 / 69.00元',\n", " '[葡] 阿林多•奥利维拉 / 胡小锐 / 中信出版社 / 2020-1-1 / 69.00',\n", " '刘忠雨\\u3000李彦霖\\u3000周洋\\u3000著 / 机械工业出版社 / 2019-12-25 / 89元',\n", " 'Michael Nielsen / 2016-1',\n", " 'Martin T. Hagan、Howard B. Demuth、Mark H. Beale / 章毅 / 机械工业出版社 / 2017-11 / 99.00元',\n", " '萨马拉辛荷 / 2010-1 / 88.00元',\n", " 'Tariq Rashid / CreateSpace Independent Publishing Platform / 2016-3-31 / USD 45.00',\n", " '张平 / 电子工业出版社 / 2018-10 / 79.00元',\n", " '王小川、史峰、郁磊、李洋 / 北京航空航天大学出版社 / 2013-8-1 / CNY 48.00',\n", " 'Simon O. Haykin / Pearson / 2008-11-28 / USD 252.40',\n", " 'Simon Haykin / 叶世伟、史忠植 / 机械工业出版社 / 2004-1 / 69.00元',\n", " '[美] 承现峻 / 孙天齐 / 清华大学出版社 / 2016-1 / 45',\n", " '徐丽娜 / 2009-7 / 28.00元',\n", " 'Ke-Lin Du、M. N. S. Swamy / Springer / 2013-12-7 / USD 129.00',\n", " '[美] 杰拉尔德·埃德尔曼、[美] 朱利欧·托诺尼 / 顾凡及 / 上海科学技术出版社 / 2004-1 / 27.00元',\n", " 'A Bradford Book / 1999-6-11 / USD 40.00']" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "list(map(lambda x: x.text.strip(), tree.xpath(\"//div[@class='pub']\")))" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['本书主要介绍神经网络与深度学习中的基础知识、主要模型(卷积神经网络、递归神经网络等)以及在计算机视觉、自然语言处理等领域的应用。',\n", " '神经网络是一种模拟人脑的神经网络,以期能够实现类人工智能的机器学习\\n技术。\\n本书揭示神经网络背后的概念,并介绍如何通过Python实现神经网络。全书\\n分为3...',\n", " '本书由Keras之父、现任Google人工智能研究员的弗朗索瓦•肖莱(François Chollet)执笔,详尽介绍了用Python和Keras进行深度学...',\n", " '本书是深度学习真正意义上的入门书,深入浅出地剖析了深度学习的原理和相关技术。书中使用Python3,尽量不依赖外部库或工具,从基本的数学知识出发,带领读者从...',\n", " '《深度学习的数学》基于丰富的图示和具体示例,通俗易懂地介绍了深度学习相关的数学知识。第1章介绍神经网络的概况;第2章介绍理解神经网络所需的数学基础知识;第3...',\n", " '计算机、细胞和大脑有什么共同之处?计算机是人类设计的电子设备,细胞是经自然进化和选择产生的生物实体,大脑是人类思维的创造者和“容器”。但在某种程度上,它们都...',\n", " '这是一本从原理、算法、实现、应用4个维度详细讲解图神经网络的著作,在图神经网络领域具有重大的意义。\\n本书作者是图神经网络领域的资深技术专家,作者所在的公司极...',\n", " 'http://neuralnetworksanddeeplearning.com/',\n", " '本书是一本易学易懂的神经网络教材,主要讨论网络结构、学习规则、训练技巧和工程应用,紧紧围绕“设计”这一视角组织材料和展开讲解,强调基本原理和训练方法,概念清...',\n", " '《神经网络在应用科学与工程中的应用:从基本原理到复杂的模式识别》为读者提供了神经网络方面简单但却系统的介绍。\\n《神经网络在应用科学和工程中的应用从基本原理到...',\n", " '《图解深度学习与神经网络:从张量到TensorFlow实现》是以TensorFlow 为工具介绍神经网络和深度学习的入门书,内容循序渐进,以简单示例和图例的...',\n", " 'For graduate-level neural network courses offered in the departments of Comput...',\n", " '★《华尔街日报》2012年度十佳非虚构图书\\n★亚马逊网站2012年编辑选择之百佳图书\\n★《出版人周刊》2012年春季十佳科学类图书\\n【内容简介】\\n基因组让你...',\n", " '神经网络控制已发展成为“智能控制”的一个新的分支,属先进控制技术,为解决复杂的非线性、不确定、不确知系统的控制问题,开辟了一条新的途径。《神经网络控制(第3...',\n", " '本书对意识理论进行全面研究,建立在近代神经科学基础上、致力于对意识的产生、及人们对意识的认识如何帮助其“把严格的科学描述与人类知识和经验的宽广领域联系起来”...',\n", " 'Since its founding in 1989 by Terrence Sejnowski, Neural Computation has becom...']" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "list(map(lambda x: x.text.strip(), tree.xpath(\"//div[@class='info']/p\")))" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "\u001b[1;31mInit signature:\u001b[0m \u001b[0mmap\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m/\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;31mDocstring:\u001b[0m \n", "map(func, *iterables) --> map object\n", "\n", "Make an iterator that computes the function using arguments from\n", "each of the iterables. Stops when the shortest iterable is exhausted.\n", "\u001b[1;31mType:\u001b[0m type\n", "\u001b[1;31mSubclasses:\u001b[0m \n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "map" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "book_infos = tree.xpath(\"//li[@class='subject-item']\")" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "神经网络与深度学习 https://book.douban.com/subject/35044046/ 邱锡鹏 / 机械工业出版社 / 2020-4-10 / 149.00元 \n", " 本书主要介绍神经网络与深度学习中的基础知识、主要模型(卷积神经网络、递归神经网络等)以及在计算机视觉、自然语言处理等领域的应用。\n", "Python神经网络编程 https://book.douban.com/subject/30192800/ [英]塔里克·拉希德(Tariq Rashid) / 林赐 / 人民邮电出版社 / 2018-4 / 69.00元 \n", " 神经网络是一种模拟人脑的神经网络,以期能够实现类人工智能的机器学习\n", "技术。\n", "本书揭示神经网络背后的概念,并介绍如何通过Python实现神经网络。全书\n", "分为3...\n", "Python深度学习 https://book.douban.com/subject/30293801/ [美] 弗朗索瓦•肖莱 / 张亮 / 人民邮电出版社 / 2018-8 / 119.00元 \n", " 本书由Keras之父、现任Google人工智能研究员的弗朗索瓦•肖莱(François Chollet)执笔,详尽介绍了用Python和Keras进行深度学...\n", "深度学习入门 https://book.douban.com/subject/30270959/ [ 日] 斋藤康毅 / 陆宇杰 / 人民邮电出版社 / 2018-7 / 59.00元 \n", " 本书是深度学习真正意义上的入门书,深入浅出地剖析了深度学习的原理和相关技术。书中使用Python3,尽量不依赖外部库或工具,从基本的数学知识出发,带领读者从...\n", "深度学习的数学 https://book.douban.com/subject/33414479/ [日]涌井良幸、[日]涌井贞美 / 杨瑞龙 / 人民邮电出版社 / 2019-4 / 69.00元 \n", " 《深度学习的数学》基于丰富的图示和具体示例,通俗易懂地介绍了深度学习相关的数学知识。第1章介绍神经网络的概况;第2章介绍理解神经网络所需的数学基础知识;第3...\n", "数字思维 https://book.douban.com/subject/34941715/ [葡] 阿林多•奥利维拉 / 胡小锐 / 中信出版社 / 2020-1-1 / 69.00 \n", " 计算机、细胞和大脑有什么共同之处?计算机是人类设计的电子设备,细胞是经自然进化和选择产生的生物实体,大脑是人类思维的创造者和“容器”。但在某种程度上,它们都...\n", "深入浅出图神经网络:GNN原理解析 https://book.douban.com/subject/34927262/ 刘忠雨 李彦霖 周洋 著 / 机械工业出版社 / 2019-12-25 / 89元 \n", " 这是一本从原理、算法、实现、应用4个维度详细讲解图神经网络的著作,在图神经网络领域具有重大的意义。\n", "本书作者是图神经网络领域的资深技术专家,作者所在的公司极...\n", "Neural Networks and Deep Learning https://book.douban.com/subject/26727997/ Michael Nielsen / 2016-1 \n", " http://neuralnetworksanddeeplearning.com/\n", "神经网络设计(原书第2版) https://book.douban.com/subject/30236893/ Martin T. Hagan、Howard B. Demuth、Mark H. Beale / 章毅 / 机械工业出版社 / 2017-11 / 99.00元 \n", " 本书是一本易学易懂的神经网络教材,主要讨论网络结构、学习规则、训练技巧和工程应用,紧紧围绕“设计”这一视角组织材料和展开讲解,强调基本原理和训练方法,概念清...\n", "神经网络在应用科学和工程中的应用 https://book.douban.com/subject/4146246/ 萨马拉辛荷 / 2010-1 / 88.00元 \n", " 《神经网络在应用科学与工程中的应用:从基本原理到复杂的模式识别》为读者提供了神经网络方面简单但却系统的介绍。\n", "《神经网络在应用科学和工程中的应用从基本原理到...\n", "Make Your Own Neural Network https://book.douban.com/subject/26945232/ Tariq Rashid / CreateSpace Independent Publishing Platform / 2016-3-31 / USD 45.00 \n", " N/A\n", "图解深度学习与神经网络:从张量到TensorFlow实现 https://book.douban.com/subject/30333961/ 张平 / 电子工业出版社 / 2018-10 / 79.00元 \n", " 《图解深度学习与神经网络:从张量到TensorFlow实现》是以TensorFlow 为工具介绍神经网络和深度学习的入门书,内容循序渐进,以简单示例和图例的...\n", "MATLAB神经网络43个案例分析 https://book.douban.com/subject/26388161/ 王小川、史峰、郁磊、李洋 / 北京航空航天大学出版社 / 2013-8-1 / CNY 48.00 \n", " N/A\n", "Neural Networks and Learning Machines https://book.douban.com/subject/2584657/ Simon O. Haykin / Pearson / 2008-11-28 / USD 252.40 \n", " For graduate-level neural network courses offered in the departments of Comput...\n", "神经网络原理(原书第2版) https://book.douban.com/subject/1138922/ Simon Haykin / 叶世伟、史忠植 / 机械工业出版社 / 2004-1 / 69.00元 \n", " N/A\n", "连接组:造就独一无二的你 https://book.douban.com/subject/26666358/ [美] 承现峻 / 孙天齐 / 清华大学出版社 / 2016-1 / 45 \n", " ★《华尔街日报》2012年度十佳非虚构图书\n", "★亚马逊网站2012年编辑选择之百佳图书\n", "★《出版人周刊》2012年春季十佳科学类图书\n", "【内容简介】\n", "基因组让你...\n", "神经网络控制 https://book.douban.com/subject/3890040/ 徐丽娜 / 2009-7 / 28.00元 \n", " 神经网络控制已发展成为“智能控制”的一个新的分支,属先进控制技术,为解决复杂的非线性、不确定、不确知系统的控制问题,开辟了一条新的途径。《神经网络控制(第3...\n", "Neural Networks and Statistical Learning https://book.douban.com/subject/26422529/ Ke-Lin Du、M. N. S. Swamy / Springer / 2013-12-7 / USD 129.00 \n", " N/A\n", "意识的宇宙 https://book.douban.com/subject/1159821/ [美] 杰拉尔德·埃德尔曼、[美] 朱利欧·托诺尼 / 顾凡及 / 上海科学技术出版社 / 2004-1 / 27.00元 \n", " 本书对意识理论进行全面研究,建立在近代神经科学基础上、致力于对意识的产生、及人们对意识的认识如何帮助其“把严格的科学描述与人类知识和经验的宽广领域联系起来”...\n", "Unsupervised Learning https://book.douban.com/subject/6529821/ A Bradford Book / 1999-6-11 / USD 40.00 \n", " Since its founding in 1989 by Terrence Sejnowski, Neural Computation has becom...\n" ] } ], "source": [ "for book_info in book_infos:\n", " book_name_elem = book_info.xpath('.//h2/a')[0]\n", " book_name = book_name_elem.text.strip()\n", " book_url = book_name_elem.attrib['href']\n", " book_pub_info = book_info.xpath(\".//div[@class='pub']\")[0].text.strip()\n", " book_intro = 'N/A'\n", " book_intro_elem = book_info.xpath(\".//div[@class='info']/p\")\n", " if book_intro_elem:\n", " book_intro = book_intro_elem[0].text.strip()\n", " print(book_name, book_url, book_pub_info, '\\n', book_intro)\n", "# break" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'https://book.douban.com/subject/35044046/'" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "book_name_elem.attrib['href']" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.2" } }, "nbformat": 4, "nbformat_minor": 4 }