First crawler
from urllib.request import urlopen
url = 'http://www.baidu.com'
response = urlopen(url)
print(response.read().decode())
print(response.getcode()) #response status code
print(response.geturl()) #the URL actually requested
print(response.info()) #HTTP response headers
GET request
from urllib.request import urlopen,Request
from urllib.parse import quote
from urllib.parse import urlencode
args = {
'wd':"尚学堂",
'ie':'utf-8'
}
print(urlencode(args)) #wd=%E5%B0%9A%E5%AD%A6%E5%A0%82&ie=utf-8
# url = 'https://www.baidu.com/s?ie=UTF-8&wd={}'.format(quote("尚学堂"))
url = 'https://www.baidu.com/s?{}'.format(urlencode(args)) #urlencode already includes ie and wd
headers = {
"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"
}
req = Request(url,headers=headers)
print(urlopen(req).read().decode())
POST request:
from urllib.request import urlopen,Request
from urllib.parse import urlencode
url = 'https://www.baidu.com/'
args = {
'user':'111111',
'password':'123456'
}
headers = {
"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"
}
req = Request(url,headers=headers,data=urlencode(args).encode()) #POST data must be bytes
print(urlopen(req).read().decode())
HTTPS requests
import ssl
context = ssl._create_unverified_context() #skip SSL certificate verification
print(urlopen(url,context=context).read().decode())
Using a dynamic User-Agent
from fake_useragent import UserAgent
ua = UserAgent()
print(ua.ie)
print(ua.chrome)
print(ua.random)
Using an opener
from urllib.request import urlopen,Request
url = "http://httpbin.org/get"
from fake_useragent import UserAgent
headers = {
"User-Agent": UserAgent().random
}
req = Request(url,headers=headers)
from urllib.request import build_opener,HTTPHandler
handler = HTTPHandler(debuglevel=1) #debuglevel=1 prints the HTTP traffic
opener = build_opener(handler)
resp = opener.open(req)
#print(resp.read().decode())
Using a proxy
from urllib.request import build_opener,ProxyHandler
# handler = ProxyHandler({"http":"name:password@ip:port"})
handler = ProxyHandler({"http":"211.137.52.158:8080"})
opener = build_opener(handler)
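The opener above is built but never used; a minimal usage sketch follows (the proxy address is the sample one from these notes and may well be dead):
from urllib.request import build_opener, ProxyHandler, Request
from fake_useragent import UserAgent
handler = ProxyHandler({"http": "211.137.52.158:8080"})
opener = build_opener(handler)
req = Request("http://httpbin.org/get", headers={"User-Agent": UserAgent().random})
resp = opener.open(req) #the request is sent through the proxy
print(resp.read().decode())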
Using cookies
from urllib.request import HTTPCookieProcessor
handler = HTTPCookieProcessor() #keeps cookies between requests
opener = build_opener(handler)
Saving and reusing cookies
#saving cookies
from http.cookiejar import MozillaCookieJar
cookie_jar = MozillaCookieJar()
handler = HTTPCookieProcessor(cookie_jar)
opener = build_opener(handler)
resp = opener.open(req)
cookie_jar.save('cookie.txt', ignore_discard=True, ignore_expires=True)
#loading cookies
from http.cookiejar import MozillaCookieJar
cookie_jar = MozillaCookieJar()
cookie_jar.load('cookie.txt', ignore_discard=True, ignore_expires=True)
handler = HTTPCookieProcessor(cookie_jar)
opener = build_opener(handler)
resp = opener.open(req)
Catching URLError
from urllib.error import URLError
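A minimal sketch of catching it around a request (the httpbin endpoint is just for illustration; HTTPError is a subclass of URLError, so bad status codes land in the same handler):
from urllib.request import urlopen
from urllib.error import URLError
try:
    resp = urlopen('http://httpbin.org/status/404')
except URLError as e:
    print('request failed:', e)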
Using requests
GET request
import requests
url = "http://httpbin.org/get"
proxy = {
"http":"http://211.137.52.158:8080" #设置代理
}
headers = {"User-Agent":UserAgent().random}
resp = requests.get(url,headers=headers,proxies=proxy)
print(resp.url)
resp.encoding = 'utf-8'
print(resp.text)
A Session keeps cookies automatically
s = requests.Session()
# issue a GET with the session object; the cookie it sets is stored on the session
s.get('http://httpbin.org/cookies/set/sessioncookie/123456789')
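To confirm the cookie is kept, a follow-up request on the same session can be inspected (a small sketch against the matching httpbin endpoint):
resp = s.get('http://httpbin.org/cookies')
print(resp.text) #should show {"cookies": {"sessioncookie": "123456789"}}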
SSL verification
# suppress the insecure-request warning
requests.packages.urllib3.disable_warnings()
resp = requests.get(url, verify=False, headers=headers)
Using regular expressions: https://blog.csdn.net/mingzme/article/details/107250157
import re
f1 = re.match(r"\w",str) #match a word character at the start of str
s1 = re.sub(r"every_day","EveryDay",str) #substitution
Data extraction - Beautiful Soup: https://blog.csdn.net/mingzme/article/details/107250908
from bs4 import BeautifulSoup
soup = BeautifulSoup(str, 'lxml')
print(soup.title)
a = soup.select('CSS selector')[0] #select() takes a CSS selector and returns a list of elements
print(a.text) #element text
a.get('href') #get an attribute of the element
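A self-contained sketch with a made-up HTML snippet (the tag names and selector are only for illustration):
from bs4 import BeautifulSoup
html = '<html><head><title>demo</title></head><body><a href="/chapter1">Chapter 1</a></body></html>'
soup = BeautifulSoup(html, 'lxml')
print(soup.title) #<title>demo</title>
a = soup.select('a')[0] #first element matching the CSS selector
print(a.text) #Chapter 1
print(a.get('href')) #/chapter1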
Data extraction - XPath: https://blog.csdn.net/mingzme/article/details/107252400
from lxml import etree
url='https://www.qidian.com/rank/fengyun?style=1&year=2018&month=08'
headers = {"User-Agent":UserAgent().chrome}
resp = requests.get(url,headers=headers)
e = etree.HTML(resp.text)
names = e.xpath('//div[@class="book-mid-info"]/h4/a/text()')
authors = e.xpath('//p[@class="author"]/a[1]/text()')
for name,author in zip(names,authors):
    print(name +":"+ author)
Data extraction - PyQuery: https://blog.csdn.net/mingzme/article/details/107255479
from pyquery import PyQuery
doc = PyQuery(resp.text)
names = [a.text for a in doc('h4 a')] #doc() takes a CSS selector
print(names)
Data extraction - jsonpath: https://blog.csdn.net/mingzme/article/details/107299928
json
str = '{"name":"盗梦空间"}'
obj = json.loads(str) #字符串转字典对象
obj_str = json.dumps(obj,ensure_ascii=False) #字典对象转字符串
#对象保存到文件
json.dump(obj,open('movie.txt','w',encoding='utf-8'),ensure_ascii=False)
obj2 = json.load(open('movie.txt',encoding='utf-8')) #文件转对象
Using jsonpath
from jsonpath import jsonpath
names = jsonpath(json.loads(resp.text), '$..name')
ids = jsonpath(resp.json(),"$..id")
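Since resp is not defined in this snippet, here is a self-contained sketch with a made-up JSON document:
import json
from jsonpath import jsonpath
data = '{"store": {"books": [{"name": "三国演义", "id": 1}, {"name": "西游记", "id": 2}]}}' #sample data (assumption)
obj = json.loads(data)
names = jsonpath(obj, '$..name') #['三国演义', '西游记']
ids = jsonpath(obj, '$..id') #[1, 2]
print(names, ids)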
Tesseract OCR (requires installation)
import pytesseract
from PIL import Image
img = Image.open('yzm1.jpg')
str = pytesseract.image_to_string(img)
print(str)
Selenium and PhantomJS browser automation: https://blog.csdn.net/mingzme/article/details/107303299
from selenium import webdriver
options = webdriver.ChromeOptions()
options.add_argument('--headless') #headless mode
options.add_argument('--proxy-server=http://ip:port') #set a proxy
chrome = webdriver.Chrome(chrome_options=options)
chrome.get("https://cn.bing.com/")
chrome.find_element_by_id('sb_form_q').send_keys('python') #type the query into the search box
chrome.find_element_by_id('sb_form_go').click() #click the search button
chrome.save_screenshot('baidu.png') #screenshot
js = 'document.documentElement.scrollTop=1000000'
chrome.execute_script(js) #scroll to the bottom of the page
html = chrome.page_source #get the page source
chrome.quit() #close the browser
Create a project
scrapy startproject myfrist
Create a spider
scrapy genspider <spider_name> <start_domain>
Run the spider
scrapy crawl <spider_name>
scrapy crawl <spider_name> -o douban.json -t json
#Method 2: run from a script
from scrapy.cmdline import execute
execute('scrapy crawl movie'.split())
Example
#movie.py
def parse(self, response):
    names = response.xpath('//div[@class="hd"]/a/span[1]/text()').extract()
    stars = response.xpath('//span[@class="rating_num"]/text()').extract()
    item = DoubanItem()
    for name, star in zip(names, stars):
        item['name'] = name
        item['star'] = star
        yield item
#items.py
class DoubanItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    star = scrapy.Field()
#pipelines.py
from json import dumps
class DoubanPipeline:
    def open_spider(self, spider): #runs when the spider starts
        self.filename = open('movies.txt','w',encoding='utf-8')
    def process_item(self, item, spider):
        self.filename.write(dumps(dict(item),ensure_ascii=False)+"\n")
        return item
    def close_spider(self, spider): #runs when the spider finishes
        self.filename.close()
settings.py configuration: https://blog.csdn.net/mingzme/article/details/107322777
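A sketch of a few commonly adjusted settings (the values and the pipeline path are illustrative assumptions; see the linked post for the full list):
#settings.py
ROBOTSTXT_OBEY = False #do not obey robots.txt
DOWNLOAD_DELAY = 1 #seconds between requests
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 ...', #placeholder User-Agent
}
ITEM_PIPELINES = {
    'douban.pipelines.DoubanPipeline': 300, #enable the pipeline above (project name assumed)
}
LOG_LEVEL = 'WARNING' #quieter console output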
Example
#qu.py
class QuSpider(scrapy.Spider):
    name = 'qu'
    allowed_domains = ['qu.la']
    start_urls = ['https://www.qu.la/book/4703/2014176.html']
    def parse(self, response):
        title = response.xpath('//h1/text()').extract_first()
        content = response.xpath('string(//div[@class="content"])').extract_first().strip().replace(' ','\n')
        next_url = response.xpath('//div[@class="section-opt"]/a[3]/@href').extract_first()
        yield {
            'title': title,
            'content': content
        }
        #follow the link to the next chapter
        yield scrapy.Request(response.urljoin(next_url), callback=self.parse)
#pipelines.py
class FictionPipeline:
    def open_spider(self, spider):
        self.filename = open('fiction.txt','w',encoding='utf-8')
    def process_item(self, item, spider):
        info = item['title'] + '\n' + item['content'] + "\n"
        self.filename.write(info+'\n\n\n')
        self.filename.flush()
        return item
    def close_spider(self, spider):
        self.filename.close()
Using CrawlSpider
Create the spider
scrapy genspider qu3 qu.la -t crawl
Example
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class Qu3Spider(CrawlSpider):
    name = 'qu3'
    allowed_domains = ['qu.la']
    start_urls = ['https://www.qu.la/book/4703/']
    rules = (
        Rule(LinkExtractor(restrict_xpaths=r'//*[@id="list"]/dl/dd[13]/a'), callback='parse_item', follow=True), #the first chapter
        Rule(LinkExtractor(restrict_xpaths=r'//div[@class="section-opt"]/a[3]'), callback='parse_item', follow=True), #the "next chapter" link
    )
    def parse_item(self, response):
        title = response.xpath('//h1/text()').extract_first()
        content = response.xpath('string(//div[@class="content"])').extract_first().strip().replace(' ', '\n')
        yield {
            'title': title,
            'content': content
        }
ImagesPipeline: downloading images
#zol.py
def parse(self, response):
    image_url = response.xpath('//img[@id="bigImg"]/@src').extract_first()
    image_name = response.xpath('string(//h3)').extract_first()
    yield {
        'image_urls' : [image_url], #'image_urls' is the field name ImagesPipeline expects unless its methods are overridden
        'image_name' : image_name
    }
    next_url = response.xpath('//a[@id="pageNext"]/@href').extract_first()
    yield scrapy.Request(response.urljoin(next_url),callback=self.parse)
#pipelines.py
from scrapy.pipelines.images import ImagesPipeline
from scrapy import Request
class ImagePipeline(ImagesPipeline): #subclass ImagesPipeline
    def get_media_requests(self, item, info):
        #item['image_urls'] is a list, so issue one request per URL
        return [Request(url, meta={'name': item['image_name']}) for url in item['image_urls']]
    def file_path(self, request, response=None, info=None): #customize the image file name
        name = request.meta['name'].strip().replace('\r\n\t\t','')
        name = name.replace('/','-')
        return name+'.jpg'
#settings.py
ITEM_PIPELINES = {
    # 'scrapy.pipelines.images.ImagesPipeline': 300, #use the stock pipeline only when nothing is overridden
    'image.pipelines.ImagePipeline' : 300, #enable the subclass above instead
}
IMAGES_STORE = 'C:/Users/Mingz/Desktop/PythonLab/imagee'
Middleware: dynamic User-Agent
#middlewares.py
from fake_useragent import UserAgent
#from image.settings import USER_AGENTS
#from random import choice
class UserAgentMiddleware:
    def process_request(self, request, spider):
        request.headers.setdefault(b'User-Agent', UserAgent().random)
#settings.py
DOWNLOADER_MIDDLEWARES = {
    'image.middlewares.UserAgentMiddleware': 343, #register the middleware above so it runs before the built-in one
}
#alternative: keep a hand-written list in settings and pick one with random.choice
USER_AGENTS = [
    'a','b','c'
]
Middleware: dynamic proxy
#middlewares.py
class ProxyMiddleware:
    def process_request(self, request, spider):
        request.meta['proxy'] = 'http://uname:password@ip:port'
#settings.py
DOWNLOADER_MIDDLEWARES = {
    'image.middlewares.ProxyMiddleware': 344
}
Login forms
class FilterSpider(scrapy.Spider):
    name = 'filter'
    allowed_domains = ['baidu.com']
    def start_requests(self):
        url = 'https://www.baidu.com'
        form_data = {
            'user':'user',
            'password':'pwd'
        }
        for num in range(3):
            #dont_filter=True disables de-duplication; formdata carries the form fields;
            #cookies, if needed, are passed with the cookies= argument as a dict
            yield scrapy.FormRequest(url, callback=self.parse, formdata=form_data, dont_filter=True)
Using MongoDB
show dbs
db.createCollection('student')
db.dropDatabase()
show tables
show collections #same as show tables
db.student.drop()
CRUD operations
db.student.save([{name:"刘备"},{name:"董卓"}]) #overwrites when the _id already exists
db.student.insert({name:"刘备"}) #errors when the _id already exists
db.student.update({name:"刘备"},{age:33,name:"刘备"})
db.student.update({name:"刘备"},{$set:{age:18}},{multi:true}) #update multiple documents
db.student.remove({name:"刘备"},{justOne:true})
db.student.remove({}) #delete all documents
db.student.find().limit(3).skip(6).sort({age:1}) #1 ascending, -1 descending
db.student.find({country:"魏国"}).count()
db.student.find({$or:[{age:{$lt:25}},{country:'魏国'}]}) #age less than 25, or ...
db.student.find({age:{$in:[25,28]}})
db.student.find({name:/^曹/}) #fuzzy match: names starting with 曹
db.student.find({name:{$regex:"^曹"}})
db.student.find({$where:function(){return this.age>=23}}) #custom query
db.student.distinct('country') #distinct values
db.student.find({'age':{$exists:true}})
MongoDB with Python
from pymongo import MongoClient
client = MongoClient()
school = client.school #get the database
student = school.student #get the collection
stus = student.find()
print(stus.next())
stu = student.find_one({"country":"蜀国"})
stus = student.find().skip(6).limit(6)
# stus = student.find().sort("age",pymongo.DESCENDING)
stu = {"name":"诸葛亮","country":"蜀国"}
student.insert_one(stu)
student.update_one({"name":"诸葛亮"},{"$set":{"age":30}})
student.delete_many({"name":"诸葛亮"})
Saving scraped data to a database
#MongoDB
from pymongo import MongoClient
class MongoDemoPipeline:
    def open_spider(self, spider):
        self.client = MongoClient()
        self.db = self.client.movie
        self.collection = self.db.collection
    def process_item(self, item, spider):
        self.collection.insert_one(dict(item)) #insert() is deprecated in newer pymongo
        return item
    def close_spider(self, spider):
        self.client.close()
#MySQL database
from pymysql import connect
class MysqlPipeline:
    def open_spider(self, spider):
        self.client = connect(host='localhost', port=3306, user='root', password='root', db='test01')
        self.cursor = self.client.cursor()
    def process_item(self, item, spider):
        sql = 'insert into t_maoyan values(0,%s,%s)'
        self.cursor.execute(sql, [item['name'],item['star']])
        self.client.commit()
        return item
    def close_spider(self, spider):
        self.cursor.close()
        self.client.close()
#settings.py
ITEM_PIPELINES = {
'mongo_demo.pipelines.MongoDemoPipeline': 300,
'mongo_demo.pipelines.MysqlPipeline': 301,
}
Splash with Python: https://blog.csdn.net/mingzme/article/details/107339895
import requests
from urllib.parse import quote
from fake_useragent import UserAgent
url = 'https://www.guazi.com/hengshui/buy/'
base_url = 'http://192.168.99.100:8050/render.html?url={}&wait=2'.format(url)
resp = requests.get(base_url, headers={'User-Agent': UserAgent().chrome})
url = 'https://www.guazi.com/hengshui/buy/'
lua_script = '''
function main(splash, args)
assert(splash:go('{}'))
assert(splash:wait(0.5))
return splash:html()
end
'''.format(url)
base_url = 'http://192.168.99.100:8050/execute?lua_source=' + quote(lua_script)
resp = requests.get(base_url, headers={'User-Agent': UserAgent().chrome})
Splash with Scrapy
#settings.py
SPLASH_URL = 'http://192.168.99.100:8050/'
DOWNLOADER_MIDDLEWARES = {
'scrapy_splash.SplashCookiesMiddleware': 723,
'scrapy_splash.SplashMiddleware': 725,
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
SPIDER_MIDDLEWARES = {
'scrapy_splash.SplashDeduplicateArgsMiddleware': 100
}
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
#guaizi.py
from scrapy_splash import SplashRequest
class Guazi1Spider(scrapy.Spider):
    name = 'guazi1'
    allowed_domains = ['guazi.com']
    def start_requests(self):
        url = 'https://www.guazi.com/bj/buy'
        yield SplashRequest(url, callback=self.parse, args={'wait':2})
    def parse(self, response):
        print(response.text)
#guaizi2.py
def start_requests(self):
    url = 'https://www.guazi.com/hengshui/buy/'
    lua_script = '''
    function main(splash, args)
        assert(splash:go(args.url))
        assert(splash:wait(0.5))
        return splash:html()
    end
    '''
    yield SplashRequest(url, callback=self.parse, endpoint='execute', args={'lua_source':lua_script})
Combining Selenium with Scrapy
#baidu.py
import scrapy
from selenium import webdriver
from scrapy import signals
class BaiduSpider(scrapy.Spider):
    name = 'baidu'
    allowed_domains = ['baidu.com']
    start_urls = ['http://www.baidu.com/']
    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(BaiduSpider, cls).from_crawler(crawler, *args, **kwargs) #build the spider instance
        spider.driver = webdriver.Chrome()
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed) #hook the spider_closed signal
        return spider
    def spider_closed(self, spider):
        spider.driver.close()
    def parse(self, response):
        print(response.text)
#middlewares.py
from scrapy.http import HtmlResponse
class SeleniumMiddleware:
    def process_request(self, request, spider):
        spider.driver.get(request.url)
        html = spider.driver.page_source
        #returning a Response here skips the downloader and the remaining download middlewares
        return HtmlResponse(url=request.url,body=html,request=request,encoding='utf-8')
#settings.py
DOWNLOADER_MIDDLEWARES = {
'selenium_demo.middlewares.SeleniumMiddleware': 543,
}