Web Scraping Summary
Methods:
1、from urllib import request
·
1、Accessing a website
#url
url = ''
# Build the request: url, headers, cookies, etc.
req = request.Request(url)
# Add a header
#req.add_header('User-Agent', "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36")
# Open the URL
html = request.urlopen(req)
# Print the URL actually visited (after redirects)
print(html.geturl())
# Print the headers
print(html.info())
# Print the status code
print(html.getcode())
# Decode the response body
html = html.read().decode()
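A consolidated, runnable sketch of the steps above (http://example.com is a placeholder URL):
from urllib import request

url = 'http://example.com'
req = request.Request(url)
req.add_header('User-Agent', 'Mozilla/5.0')
rsp = request.urlopen(req)
print(rsp.geturl())       # final URL after redirects
print(rsp.getcode())      # status code
html = rsp.read().decode('utf-8')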
·
2、Cookies
from http import cookiejar
cookie = cookiejar.MozillaCookieJar(filename)  # filename: path to save cookies to
cookie_handler = request.HTTPCookieProcessor(cookie)
http_handler = request.HTTPHandler()
https_handler = request.HTTPSHandler()
# Build the opener
opener = request.build_opener(http_handler, https_handler, cookie_handler)
# Use the opener to make the request
rsp = opener.open(req)
# Save the cookies
cookie.save()
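A runnable sketch of the whole cookie flow, assuming 'cookies.txt' as the save path and http://example.com as a placeholder URL:
from urllib import request
from http import cookiejar

cookie = cookiejar.MozillaCookieJar('cookies.txt')
cookie_handler = request.HTTPCookieProcessor(cookie)
opener = request.build_opener(cookie_handler)
rsp = opener.open('http://example.com')
# Keep session cookies as well when saving
cookie.save(ignore_discard=True, ignore_expires=True)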
·
3、parse
from urllib import parse
data = {
    '': '**'
}
data = parse.urlencode(data)
# Note: the POST body must be bytes
req = request.Request(url, data=data.encode('utf-8'))
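A minimal end-to-end POST sketch (http://httpbin.org/post and the 'kw' field are placeholders):
from urllib import request, parse

url = 'http://httpbin.org/post'
data = parse.urlencode({'kw': 'python'}).encode('utf-8')
req = request.Request(url, data=data)
print(request.urlopen(req).read().decode('utf-8'))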
·
4、error
from urllib import error
try:
    ***
except error.URLError as e:
    ***
except Exception as e:
    ***
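A concrete sketch of this pattern, assuming a placeholder URL that may fail:
from urllib import request, error

try:
    rsp = request.urlopen('http://no-such-host.example')
    print(rsp.read().decode('utf-8'))
except error.HTTPError as e:    # server answered with an error status
    print('HTTPError:', e.code)
except error.URLError as e:     # DNS / connection problems
    print('URLError:', e.reason)
except Exception as e:
    print('Other error:', e)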
·
5、Proxies
proxy = {}
proxy['http'] = '119.179.130.59:8060'
proxy_handler = request.ProxyHandler(proxy)
opener = request.build_opener(proxy_handler)
request.install_opener(opener)
eg:v13
eg:02-多个代理
eg:05_daili
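A sketch of rotating through several proxies with random.choice (the proxy addresses are placeholders and likely dead):
import random
from urllib import request

proxy_list = [
    {'http': '119.179.130.59:8060'},
    {'http': '39.137.107.98:80'},
]
proxy_handler = request.ProxyHandler(random.choice(proxy_list))
opener = request.build_opener(proxy_handler)
request.install_opener(opener)
html = request.urlopen('http://example.com').read().decode('utf-8')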
·
6、SSL
import ssl
# Skip certificate verification (use only when necessary)
ssl._create_default_https_context = ssl._create_unverified_context
7、Saving images
img_src = 'http://724.169pp.net/bizhi/2017/039/1.jpg'
request.urlretrieve(img_src, r'C:\Users\machenike\Desktop\1.jpg')
# Download progress callback
def Schedule(blocknum, blocksize, totalsize):
    '''
    :param blocknum: number of blocks downloaded so far
    :param blocksize: size of each block in bytes
    :param totalsize: size of the remote file in bytes
    :return:
    '''
    per = 100.0 * blocknum * blocksize / totalsize
    if per > 100:
        per = 100
    print('当前下载进度为:{}%'.format(int(per)))
# Pass the progress callback
request.urlretrieve(src, path + '/' + src.split('/')[-1], Schedule)
2、import requests
1、Basic request
url = "http://www.baidu.com"
# GET request
rsp = requests.get(url)
# Print the body
print(rsp.text)
2、Proxies
proxy = {
    "http": "39.137.107.98:80"
}
# method, url, proxies
rsp = requests.request("get", url, proxies=proxy)
3、Query parameters
kv = {'wd': 'Python'}
r = requests.get(url, params=kv)
4、ssl
# To skip SSL verification when making a request, pass verify=False; the downside is that the log fills up with warnings
pic = requests.get(pho[0], headers=headers, verify=False)
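If verify=False is used, the warnings can be silenced; a short sketch with a placeholder URL:
import requests
import urllib3

# Suppress the InsecureRequestWarning spam mentioned above
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
rsp = requests.get('https://example.com', verify=False)
print(rsp.status_code)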
3、selenium
Reference: https://www.cnblogs.com/miqi1992/p/8093958.html
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
# Launch Chrome
driver = webdriver.Chrome()
# Open the URL
driver.get("http://www.baidu.com")
# Get the text of the element whose id is "wrapper"
text = driver.find_element_by_id("wrapper").text
# Print the page title
print(driver.title)
# Take a screenshot of the page
driver.save_screenshot("长城.png")
# Find the input box and type "大熊猫"
driver.find_element_by_id("kw").send_keys(u"大熊猫")
# Click the search button
driver.find_element_by_id('su').click()
# Get the cookies of the current page
print(driver.get_cookies())
# Ctrl+A to select everything in the input box
driver.find_element_by_id('kw').send_keys(Keys.CONTROL, 'a')
# Simulate pressing Enter
driver.find_element_by_id('su').send_keys(Keys.RETURN)
# Clear the input box
driver.find_element_by_id('kw').clear()
# Get the current URL
print(driver.current_url)
# Scroll down 10000 pixels
js = "var q=document.documentElement.scrollTop=10000"
driver.execute_script(js)
# Quit the browser
driver.quit()
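A sketch of waiting for an element instead of sleeping (same Baidu page; ChromeDriver setup is environment-specific):
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get("http://www.baidu.com")
# Wait up to 10 seconds for the search box to appear
box = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, "kw"))
)
box.send_keys("大熊猫")
driver.quit()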
4、Scrapy framework
# Create a Scrapy project
In cmd: "scrapy startproject e11"
# Create a spider in the spiders folder
class **Spider(scrapy.Spider):
    name = '**'
    allowed_domains = ['careers.tencent.com']
    start_urls = ['https://careers.tencent.com/search.html?&start=0#a']

    def parse(self, response):
        # do the actual work here:
        # fill the Item defined in items.py
        # and yield it back
In items.py, define the Item that holds the scraped data:
class QQItem(scrapy.Item):
    name = scrapy.Field()
    detailLink = scrapy.Field()
    positionInfo = scrapy.Field()
    workLocation = scrapy.Field()
In pipelines.py, process the scraped data:
class QQPipeline(object):
    def process_item(self, item, spider):
        with open('QQ.json', 'a', encoding='utf-8') as f:
            json.dump(dict(item), f, ensure_ascii=False)
        return item  # required
In settings.py, set the priorities:
#pipeline
ITEM_PIPELINES = {
    'e16_qq.pipelines.QQPipeline': 300,
}
# If middlewares are used, this must be set as well
DOWNLOADER_MIDDLEWARES = {
    'e17_xiaohua.middlewares.XiaohuaDownloaderMiddleware': 543,
}
In middlewares.py, set how pages are fetched:
class MeijuDownloaderMiddleware(object):
    def process_request(self, request, spider):
        driver = webdriver.Chrome()
        driver.get(request.url)
        time.sleep(1)
        html = driver.page_source
        driver.quit()
        # Required: return an HtmlResponse so Scrapy uses it instead of downloading again
        return HtmlResponse(url=request.url, body=html, encoding='utf-8', request=request)
Close the spider proactively from inside the spider:
self.crawler.engine.close_spider(self, "cookie expired, closing spider")
Close the spider from a pipeline or downloader middleware:
spider.crawler.engine.close_spider(spider, "all pages done, closing spider")
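Putting the pieces together, a minimal spider sketch that yields plain dicts (the selectors are illustrative, not the real Tencent page structure; yielding the QQItem above works the same way):
import scrapy

class QQSpider(scrapy.Spider):
    name = 'qq'
    allowed_domains = ['careers.tencent.com']
    start_urls = ['https://careers.tencent.com/search.html?&start=0#a']

    def parse(self, response):
        # Hypothetical selectors: adjust to the page actually being crawled
        for row in response.xpath('//div[@class="recruit-list"]/a'):
            yield {
                'name': row.xpath('.//h4/text()').get(),
                'detailLink': row.xpath('./@href').get(),
            }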
5、OCR (image to text)
import pytesseract as pt
from PIL import Image
# Image path
image = Image.open("1-26.jpg")
text = pt.image_to_string(image)
print(text)
6、MySQL
eg:18_mysqllianjiejianbiao
eg:19_mysqlcharu
eg:20_mysqlchaxun
eg:21_mysqlgengxin
eg:22_mysqldelete
import pymysql
# Connect
db = pymysql.connect(host='192.168.43.245', user='shiboven', passwd='xqx521', db='mysql', port=3306)
# Create a cursor with cursor() to operate on the data
cursor = db.cursor()
# Run SQL with execute()
cursor.execute('DROP TABLE IF EXISTS JBTLXY')
# Create the table with a prepared statement
cursor.execute(sql)
db.close()
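A short insert sketch with commit/rollback (the connection settings are the placeholders above; the table and columns are hypothetical):
import pymysql

db = pymysql.connect(host='192.168.43.245', user='shiboven', passwd='xqx521', db='mysql', port=3306)
cursor = db.cursor()
try:
    # Parameterized insert into a hypothetical table
    sql = 'INSERT INTO JBTLXY (name, score) VALUES (%s, %s)'
    cursor.execute(sql, ('tom', 90))
    db.commit()      # write the change
except Exception:
    db.rollback()    # undo on error
finally:
    db.close()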
7、MongoDB
eg:hupu_mongoDB
eg:23_mongoconnect
eg:24_mongoinstall
import pymongo
# Connect to MongoDB
client = pymongo.MongoClient()
# Get the database
db = client.TBTL_tea
# Get the collection
std = db.posts
# Query the data
datas = std.find()
eg:04_kugou_mongoDB_bs4
Method 1
datas = []
data = {
    'href': href,
    'songer': songer,
    'song': song,
    'time': time,
    'rank': rank
}
datas.append(data)
client = pymongo.MongoClient()
songs = client.KG_DB.songs
songs_id = songs.insert_many(datas)
Method 2
paiming = []
mingzi = []
wangzhi = []
zuozhe = []
shijian = []
yuedu = []
pinglun = []
paiming.append(rank)
mingzi.append(title)
wangzhi.append(url)
zuozhe.append(Author)
shijian.append(Time)
yuedu.append(Comment)
pinglun.append(Reply)
items = zip(paiming, mingzi, wangzhi, zuozhe, shijian, yuedu, pinglun)
hupu_post = MongoAPI(db_name='new_hupu', table_name='post')
for item in items:
    # add() is defined in the hupu_mongoDB example
    hupu_post.add({
        'rank': item[0],
        'title': item[1],
        'url': item[2],
        'Author': item[3],
        'Time': item[4],
        'Comment': item[5],
        'Reply': item[6],
    })
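A short sketch of reading stored documents back with a filter and sort, assuming the KG_DB.songs collection from Method 1:
import pymongo

client = pymongo.MongoClient()
songs = client.KG_DB.songs
# Top 10 songs by rank, hiding the _id field
for doc in songs.find({}, {'_id': 0}).sort('rank', pymongo.ASCENDING).limit(10):
    print(doc)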
Tools:
1、re (regular expressions)
# Find the content inside parentheses
s = r'
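A sketch of what this pattern likely does, assuming the goal is to pull out the text between parentheses:
import re

s = 'song(2019) movie(2020)'
# Non-greedy group capturing whatever sits between the parentheses
print(re.findall(r'\((.*?)\)', s))   # ['2019', '2020']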
2、xpath
print(book.xpath('.//div[@class="title"]/a[@href]')[0].attrib['href'])
# Same as above, but take the text
print(book.xpath('.//div[@class="title"]/a')[0].text)
men = response.xpath('//div[@class="item_list infinite_scroll"]/div')
for man in men:
    item['name'] = man.xpath('./div/div/a/img/@alt').extract()
    item['src'] = man.xpath('./div/div/a/img/@src').extract()
    item['href'] = man.xpath('./div/div/a/@href').extract()[0]
Reference: https://www.runoob.com/xpath/xpath-tutorial.html
3、Bs4
Reference: https://cuiqingcai.com/1319.html
soup = BeautifulSoup(html, 'lxml')
# soup.prettify() pretty-prints the parsed content
#html = soup.prettify()
divs = soup.select("li[class='media']")
for div in divs:
    name = div.select('div h3 a')[0].get_text()
    href = div.select('div h3 a')[0].attrs['href']
# Select content directly by class name
# (e.g. elements like <span class="pc_temp_num"> 1 </span>)
rank = soup.select(".pc_temp_num")
soup = BeautifulSoup(res.text, 'lxml')
items = soup.find('ul', {'class': 'f-hide'}).find_all('a')
# Inside the loop:
id = item.get('href')
4、pandas (saving data)
eg:03_pandascuncsv
datas = pd.DataFrame({
    'name': names,
    'id': ids
})
datas.to_csv('movids.csv')
# Print the first five rows
print(datas.head())
# Count rows per type, average the boxoffice, then sort
# print(datas.groupby('type').agg({'boxoffice': ['count', 'mean']}))
print(pd.read_csv('kuwo.csv', encoding='gbk').head())
5、tkinter
eg:07_tkinter_wangyiyun
from tkinter import *
# Create the main window
root = Tk()
# Title
root.title('网易云音乐下载器')
# Window size and position
root.geometry("700x550")
root.geometry("+700+80")
# Label prompting for the download URL
label = Label(root, text="请输入您下载的地址:", font=('隶书', 22))
# Layout: pack / place / grid
label.grid()
# Input box
entry = Entry(root, font=('隶书', 22), width=25)
entry.grid(row=0, column=1)
# Listbox
text = Listbox(root, font=('隶书', 22), width=46, height=14)
text.grid(row=1, columnspan=2)
# Buttons (sticky takes N/S/W/E)
button1 = Button(root, text="开始", font=('微软雅黑', 25), command=music_spider)
button1.grid(row=2, column=0, sticky='s')  # sticky = alignment
# Quit button
button2 = Button(root, text="退出", font=('微软雅黑', 25), command=root.quit)
button2.grid(row=2, column=1, sticky='s')  # sticky = alignment
# Show the window and start the event loop
root.mainloop()
def music_spider():
    # Append a line to the listbox
    text.insert(END, '下载完成:{}'.format(name))
    # Scroll the listbox to the bottom
    text.see(END)
    # Refresh the widget
    text.update()
Notes: small tips
1、decode() decodes bytes into str
encode() encodes str into bytes
2、json
# Parse a JSON string into Python data
json_data = json.loads(data)
# Parse a JSON response
print(json.loads(req.text))
print(req.json())
dumps turns a dict into a str; loads turns a str into a dict.
dump and load do the same, but read from / write to a file.
# Explanation of loads and dumps: https://www.cnblogs.com/wswang/p/5411826.html
eg:15_jsoncunchu
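A tiny round-trip sketch of the four functions:
import json

d = {'name': 'tom', 'age': 18}
s = json.dumps(d, ensure_ascii=False)    # dict -> str
d2 = json.loads(s)                       # str -> dict
with open('demo.json', 'w', encoding='utf-8') as f:
    json.dump(d, f, ensure_ascii=False)  # dict -> file
with open('demo.json', encoding='utf-8') as f:
    d3 = json.load(f)                    # file -> dict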
3、md5
import hashlib
md5 = hashlib.md5()
md5.update('***'.encode("utf-8"))
sign = md5.hexdigest()
print(sign)
4、random
# opener_list is a list; pick one element at random
random.choice(opener_list)
5、os
import os
path = '图片/'
if not os.path.exists(path):
    # mkdir creates a single directory; makedirs creates nested directories
    os.mkdir(path)
with open('{}{}.jpg'.format(path, name), 'wb') as f:
    f.write(pic.content)
6、csv
import csv
with open('dmbj.csv', 'w', newline='') as f:
    f_csv = csv.writer(f)
    # Write one row
    f_csv.writerow(['书名', '章节名', '时间', '网址'])
    # Write many rows at once
    f_csv.writerows(contents)
7、zip
a = ['a', 'b', 'c']
b = [1, 2, 3]
x = dict(zip(a, b))
# x == {'a': 1, 'b': 2, 'c': 3}