Using a pipenv virtual environment:
cd /Users/xiaoyigege/Desktop/Python/ptest
Create the pipenv environment for the Scrapy framework:
pipenv install
To speed up downloads, point the package source in the Pipfile at the Tsinghua mirror:
url = "https://pypi.tuna.tsinghua.edu.cn/simple"
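For reference, that url belongs in the [[source]] block of the Pipfile; the block should end up looking roughly like this (a sketch following the standard Pipfile format):
[[source]]
url = "https://pypi.tuna.tsinghua.edu.cn/simple"
verify_ssl = true
name = "pypi"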
pipenv install scrapy
Test that Scrapy runs inside the environment:
pipenv run scrapy fetch "http://www.baidu.com"
Create a new project under the ptest directory:
pipenv run scrapy startproject ITcast
The ITcast directory structure is as follows:
.
├── ITcast
│ ├── ITcast
│ │ ├── __init__.py
│ │ ├── __pycache__
│ │ ├── items.py
│ │ ├── middlewares.py
│ │ ├── pipelines.py
│ │ ├── settings.py
│ │ └── spiders
│ │ ├── __init__.py
│ │ └── __pycache__
│ └── scrapy.cfg
├── Pipfile
└── Pipfile.lock
cd ITcast
pipenv run scrapy genspider itcast "www.itcast.cn"
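genspider creates itcast.py under the spiders/ directory from the default template; depending on the Scrapy version, the generated skeleton looks roughly like this:
import scrapy

class ItcastSpider(scrapy.Spider):
    name = 'itcast'
    allowed_domains = ['www.itcast.cn']
    start_urls = ['http://www.itcast.cn/']

    def parse(self, response):
        pass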
Now in /Users/xiaoyigege/Desktop/Python/ptest/ITcast, create a data directory and run the spider from inside it (the pipeline shown below opens its output file relative to the working directory, so the JSON file lands in data/):
mkdir data
cd data
pipenv run scrapy crawl itcast
.
├── ITcast
│ ├── __init__.py
│ ├── __pycache__
│ ├── items.py
│ ├── middlewares.py
│ ├── pipelines.py
│ ├── settings.py
│ └── spiders
│ ├── __init__.py
│ ├── __pycache__
│ └── itcast.py
├── data
│ └── itcast_pipeline.json
└── scrapy.cfg
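With the pipeline code below, each line of itcast_pipeline.json holds one JSON object followed by a comma; a record looks roughly like this (field values are illustrative, not real output):
{"name": "Some Teacher", "title": "Senior Lecturer", "info": "..."},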
In settings.py:
# Pipeline configuration; the smaller the number, the higher the priority
ITEM_PIPELINES = {
    'ITcast.pipelines.ItcastPipeline': 300,
    'ITcast.pipelines.xxxPipeline': 400,  # xxxPipeline is the name of the pipeline class
}
In pipelines.py:
import json  # the standard-library JSON module

# The pipeline class
class ItcastPipeline(object):
    # Initialization; runs only once over the spider's lifetime
    # (needed here to open the local file; optional in general)
    def __init__(self):
        self.f = open("itcast_pipeline.json", "w")

    # The core pipeline hook; this method is required
    def process_item(self, item, spider):
        # Append ",\n" so records are not concatenated on a single line
        content = json.dumps(dict(item), ensure_ascii=False) + ",\n"
        self.f.write(content)
        return item

    # Close the file when the spider finishes (optional)
    def close_spider(self, spider):
        self.f.close()

# xxxPipeline: the pipeline class name used in the settings file
# class xxxPipeline(object):
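The xxxPipeline stub above is never filled in; as a minimal sketch (the class name comes from the settings snippet, but the filtering rule is hypothetical, not part of the original tutorial), a second pipeline at priority 400 could discard incomplete items using Scrapy's DropItem exception:
from scrapy.exceptions import DropItem

class xxxPipeline(object):
    # Hypothetical rule: drop any item whose 'name' field is empty
    def process_item(self, item, spider):
        if not item.get('name'):
            raise DropItem("missing name in %r" % item)
        return item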
In items.py:
import scrapy

class ItcastItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # Teacher's name
    name = scrapy.Field()
    # Teacher's job title
    title = scrapy.Field()
    # Teacher's bio
    info = scrapy.Field()
In the spider file ITcast/spiders/itcast.py:
import scrapy
from ITcast.items import ItcastItem  # import the item class

class ItcastSpider(scrapy.Spider):
    # Spider name; required as a parameter when launching the crawl
    name = 'itcast'
    # Allowed domain scope; the spider only crawls within these domains (optional)
    allowed_domains = ['www.itcast.cn']
    # List of start URLs; the spider's first batch of requests comes from here
    start_urls = ['http://www.itcast.cn/channel/teacher.shtml']

    def parse(self, response):
        node_list = response.xpath("//div[@class='li_txt']")
        for node in node_list:
            # Create an item object to hold the extracted fields
            item = ItcastItem()
            # extract() converts xpath results to Unicode strings
            name = node.xpath("./h3/text()").extract()
            title = node.xpath("./h4/text()").extract()
            info = node.xpath("./p/text()").extract()
            item['name'] = name[0]
            item['title'] = title[0]
            item['info'] = info[0]
            # Yield each item to the pipeline, then resume here for the next iteration of the loop
            yield item
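As an aside, if you only need the JSON dump and not a custom pipeline, Scrapy's built-in feed export can write the items directly (the output filename here is just an example):
pipenv run scrapy crawl itcast -o teachers.json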
pipenv --venv  # show where the current virtual environment lives
/Users/xiaoyigege/.local/share/virtualenvs/ptest-uyD_8yIs/bin/python
Copy the path to the clipboard (you need to append /bin/python yourself), paste it into the interpreter field under "existing environment" in the PyCharm interpreter dialog, and click OK.
On Windows 7, the pypiwin32 module is missing, so it must be installed; otherwise running the Scrapy crawl command fails with: ModuleNotFoundError: No module named 'win32api'
Install the pypiwin32 dependency in the current virtual environment:
pipenv install pypiwin32
In the spider file:
def parse(self, response):
    # Decode the body so Chinese characters display correctly
    html1 = response.body
    html1 = html1.decode('utf-8')
    print(html1)
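A minimal alternative, assuming Scrapy 1.1 or later: response.text already returns the body decoded using the response's declared encoding, so the manual decode step can usually be dropped:
def parse(self, response):
    # response.text decodes response.body with the detected encoding
    print(response.text)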