Table of Contents
I. MongoDB
1. Download, install, configure environment variables and paths, start the server
2. Operating MongoDB from Python
II. Scrapy
1. Install and create a project
2. Modify and add code
① Write the Spider class
② Modify items.py
③ Modify settings.py
④ Modify pipelines.py
⑤ Add main.py under tutorial
3. Start the crawler
4. Success
Local environment: Python 3, Windows 10
My video walkthrough: https://www.bilibili.com/video/BV1xh411C7Xc/
a. Install pymongo
pip install pymongo
b. Basic operations
import pymongo

# Connect to the local MongoDB server with a MongoClient object
myclient = pymongo.MongoClient("mongodb://localhost:27017/")
dblist = myclient.list_database_names()  # list the names of all databases

mydb = myclient["runoobdb"]  # database name
mycol = mydb["sites"]        # collection (similar to a table in SQL)

# Insert a single document
mydict = { "_id": 1, "name": "RUNOOB", "cn_name": "菜鸟教程"}
mycol.insert_one(mydict)

# Insert multiple documents
mylist = [
    { "_id": 2, "name": "Google", "address": "Google 搜索"},
    { "_id": 3, "name": "Facebook", "address": "脸书"},
    { "_id": 4, "name": "Taobao", "address": "淘宝"},
    { "_id": 5, "name": "Zhihu", "address": "知乎"}
]
mycol.insert_many(mylist)

# Query: hide _id, return only the name and address fields
for x in mycol.find({}, { "_id": 0, "name": 1, "address": 1 }):
    print(x)

# Delete all documents
mycol.delete_many({})
print(list(mycol.find()))
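Conditional queries, updates, and counting follow the same API shape. A quick sketch against the same collection (since delete_many above just emptied it, the sample data is re-inserted first):

# Re-insert the sample documents emptied by delete_many above
mycol.insert_many(mylist)

# Find documents matching a condition
for x in mycol.find({"name": "Google"}):
    print(x)

# Update the first document matching the filter
mycol.update_one({"_id": 2}, {"$set": {"address": "Google Search"}})

print(mycol.count_documents({}))  # count all documents: 4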
pip install scrapy
scrapy startproject tutorial
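For orientation, scrapy startproject tutorial generates a layout like the following (quotes_spider.py and main.py are the files you add by hand in the steps below):

tutorial/
    scrapy.cfg             # deploy configuration
    tutorial/
        __init__.py
        items.py           # item definitions (step ②)
        middlewares.py
        pipelines.py       # item pipelines (step ④)
        settings.py        # project settings (step ③)
        spiders/
            __init__.py
            quotes_spider.py   # the Spider (step ①)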
Add a quotes_spider.py file under the tutorial/spiders directory:
import scrapy
from ..items import TutorialItem

class QuotesSpider(scrapy.Spider):
    name = "quotes"
    # Alternatively, just list the start URLs:
    # start_urls = ['http://quotes.toscrape.com/page/1/']

    def start_requests(self):
        url_bone = 'http://quotes.toscrape.com/page/{}/'
        for i in range(1, 3):  # the loop must have an exit, or it never terminates
            url = url_bone.format(i)
            print('url: {}'.format(url))
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # Absolute XPath copied from Chrome DevTools; [:3] keeps only the first three quote blocks
        div_list = response.xpath('/html/body/div/div[2]/div[1]/div')[:3]
        for div in div_list:
            item = TutorialItem()  # create a fresh item for each quote
            words = div.xpath('./span[1]/text()').extract_first()
            person = div.xpath('./span[2]/small/text()').extract_first()
            item['words'] = words
            item['person'] = person
            yield item
Two ways to find the XPath of an element:
a. Press F12 to inspect, locate the element, then right-click → Copy → Copy XPath
b. Use the Chrome extension XPath Helper
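Copied absolute XPaths can be brittle when the page layout shifts. As an alternative worth testing, a class-based selector reads more robustly; the sketch below assumes the quotes.toscrape.com markup (each quote sits in a <div class="quote"> containing a <span class="text"> and a <small class="author">). You can verify expressions interactively with scrapy shell before committing them to the spider:

# Try expressions first with:  scrapy shell 'http://quotes.toscrape.com/page/1/'
# Assumed markup: <div class="quote"> wrapping span.text and small.author
for div in response.xpath('//div[@class="quote"]'):
    words = div.xpath('./span[@class="text"]/text()').get()
    person = div.xpath('.//small[@class="author"]/text()').get()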
import scrapy

class TutorialItem(scrapy.Item):
    # define the fields for your item here like:
    words = scrapy.Field()   # the quote text
    person = scrapy.Field()  # the person who said it
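A scrapy.Item behaves like a dict (only the declared fields are allowed as keys), which is why pipelines.py below can call dict(item) before inserting into MongoDB. A quick illustration:

item = TutorialItem()
item['words'] = 'To be or not to be'
item['person'] = 'Shakespeare'
print(dict(item))  # {'words': 'To be or not to be', 'person': 'Shakespeare'}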
ROBOTSTXT_OBEY = False  # change to False so robots.txt is ignored
LOG_LEVEL = 'ERROR'     # only print log output on errors
DOWNLOAD_DELAY = 1      # wait 1 second between downloads

# Configure the default request headers
DEFAULT_REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0",
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
}
# Enable the item pipeline
ITEM_PIPELINES = {
    'tutorial.pipelines.TutorialPipeline': 300,
}
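The number 300 is the pipeline's order: values range from 0 to 1000, and items pass through pipelines with lower numbers first. If you later add a second pipeline (DuplicatesPipeline here is a hypothetical name for illustration), it would slot in like this:

ITEM_PIPELINES = {
    'tutorial.pipelines.DuplicatesPipeline': 200,  # hypothetical: runs first (lower number)
    'tutorial.pipelines.TutorialPipeline': 300,    # runs second
}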
import pymongo

class TutorialPipeline(object):
    def __init__(self):
        super().__init__()
        # Connect once when the pipeline is created
        myclient = pymongo.MongoClient("mongodb://localhost:27017/")
        mydb = myclient["runoobdb"]
        self.mycol = mydb["quotes"]

    def process_item(self, item, spider):
        print(item)
        self.mycol.insert_one(dict(item))  # convert the Item to a plain dict before inserting
        return item
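Scrapy also calls the open_spider and close_spider hooks on a pipeline if they are defined. A minimal sketch of the same pipeline using those hooks, so the MongoDB connection is opened when the spider starts and released when it finishes (behavior otherwise unchanged):

import pymongo

class TutorialPipeline(object):
    def open_spider(self, spider):
        # called once when the spider starts
        self.myclient = pymongo.MongoClient("mongodb://localhost:27017/")
        self.mycol = self.myclient["runoobdb"]["quotes"]

    def close_spider(self, spider):
        # called once when the spider finishes; release the connection
        self.myclient.close()

    def process_item(self, item, spider):
        self.mycol.insert_one(dict(item))
        return item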
from scrapy import cmdline

# Equivalent to typing "scrapy crawl quotes" on the command line
cmdline.execute('scrapy crawl quotes'.split())
cd to the directory containing main.py (xx/tutorial/tutorial), then run:
python main.py
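Alternatively, you can run scrapy crawl quotes directly from the project root, or drive the crawl from a script through Scrapy's CrawlerProcess API. A sketch, assuming it is run from inside the project so the settings can be found:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('quotes')  # spider name, as set in QuotesSpider.name
process.start()          # blocks until the crawl finishes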
References:
Scrapy official manual (Chinese edition)
MongoDB documentation
https://blog.csdn.net/qq_41837900/article/details/96489994
Target site to crawl: http://quotes.toscrape.com/page/1/
Project code for this tutorial: https://download.csdn.net/download/GreatXiang888/15108875