第一章 Python 爬虫学习入门
大数据时代,要进行数据分析,首先要有数据源,而学习爬虫,可以让我们获取更多的数据源,并且这些数据源可以按我们的目的进行采集,去掉很多无关数据。
网络爬虫是一个自动提取网页的程序,它为搜索引擎从万维网上下载网页,是搜索引擎的重要组成。传统爬虫从一个或若干初始网页的URL开始,获得初始网页上的URL,在抓取网页的过程中,不断从当前页面上抽取新的URL放入队列,直到满足系统的一定停止条件。
# -*- coding:utf-8 -*-
# Issue an HTTP request with urllib's request module and print the response.
import urllib.request

# Build a Request object (the URL must be complete, scheme included).
request = urllib.request.Request('http://www.baidu.com/')
print(request)
# Defect fixed: the original constructed the Request object and then ignored
# it, calling urlopen() with the bare URL string again. Pass the Request
# object so any headers attached to it would actually be used.
response = urllib.request.urlopen(request)
print(response)
# Extract the response body and decode the bytes as UTF-8 text.
html = response.read().decode('utf-8')
# Print the page source.
print(html)
"""
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Time : 2020/9/12 17:03
# @Author : Joe Wang
# @FileName: 01.py
# @Software: PyCharm
# @Blog :https://blog.csdn.net/wangzhaoyoung
import requests
import json
import os
def music_download():
    """Search kuwo.cn for songs by keyword, then download one of them by name.

    Side effects: network requests to kuwo.cn, console prompts/printing, and
    writing an MP3 file under ``E://下载的music//``.
    """
    kw = input("请输入音乐名称:")
    # Request headers: the kuwo.cn search API requires a matching
    # Cookie kw_token / csrf pair plus a Referer, or it rejects the call.
    headers = {
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36 Edg/84.0.522.63",
        "Cookie":"_ga=GA1.2.1083049585.1590317697; _gid=GA1.2.2053211683.1598526974; _gat=1; Hm_lvt_cdb524f42f0ce19b169a8071123a4797=1597491567,1598094297,1598096480,1598526974; Hm_lpvt_cdb524f42f0ce19b169a8071123a4797=1598526974; kw_token=HYZQI4KPK3P",
        "Referer": "http://www.kuwo.cn/search/list?key=%E5%91%A8%E6%9D%B0%E4%BC%A6",
        "csrf": "HYZQI4KPK3P",
    }
    # Query parameters: pn = page number, rn = results per page.
    params = {
        "key": kw,
        "pn": "1",
        "rn": "10",
        "httpsStatus": "1",
        "reqId": "cc337fa0-e856-11ea-8e2d-ab61b365fb50",
    }
    url = "http://www.kuwo.cn/api/www/search/searchMusicBykeyWord?"
    res = requests.get(url = url, headers = headers, params = params)
    res.encoding = "utf-8"
    # The search hits live under data.list in the JSON payload.
    datapack = json.loads(res.text)["data"]["list"]
    # Collect name/singer/real-url records for the download step below.
    music_list = []
    for hit in datapack:
        music_name = hit["name"]      # song title
        music_singer = hit["artist"]  # performer
        rid = hit["rid"]              # track id, needed to resolve the mp3 URL
        # A second API resolves the track id (rid) to the real mp3 URL.
        api_music = "http://www.kuwo.cn/url?format=mp3&rid={}&response=url&type=convert_url3" \
                    "&br=128kmp3&from=web&t=1598528574799&httpsStatus=1" \
                    "&reqId=72259df1-e85a-11ea-a367-b5a64c5660e5".format(rid)
        api_res = requests.get(url = api_music)
        music_url = json.loads(api_res.text)["url"]
        print(music_name)
        print(music_singer)
        print(music_url)
        music_list.append({
            "name": music_name,
            "url": music_url,
            "singer": music_singer,
        })
    # Show how many candidates were actually collected.
    print(len(music_list))
    xiazai = input("输入音乐名称:")
    # Download directory (created on demand).
    root = 'E://下载的music//'
    for song in music_list:
        try:
            if xiazai == song["name"]:
                # Defect fixed: makedirs(exist_ok=True) replaces the racy
                # exists()+mkdir() pair of the original.
                os.makedirs(root, exist_ok=True)
                # Fetch the mp3 bytes from the resolved URL.
                music_content = requests.get(url = song["url"]).content
                with open(root + "{}({}).mp3".format(song['name'], song['singer']), "wb") as f:
                    f.write(music_content)
                print("下载成功")
            else:
                print("此歌名不在你所搜索的音乐里!")
        except (requests.RequestException, OSError, ValueError, KeyError):
            # Defect fixed: the original bare ``except`` swallowed everything,
            # including KeyboardInterrupt; catch only plausible failures.
            print("下载失败")
if __name__ == "__main__":
    music_download()
from lxml import etree
import requests

# Fetch the library "electronic services" page. Defect fixed: the response
# variable was named ``re``, shadowing the stdlib ``re`` module name.
resp = requests.get('http://lib.haue.edu.cn/index/dzfw.htm')
resp.encoding = 'utf8'
html = etree.HTML(resp.text)
# Link text and href of the second list item in the sidebar menu.
link_text = html.xpath('/html/body/div/div[2]/div[1]/ul/li[2]/a//text()')
link_href = html.xpath('/html/body/div/div[2]/div[1]/ul/li[2]/a//@href')
print(link_text)
print(link_href)
import requests
from lxml import etree

# Wallpaper detail page to scrape.
url = "http://www.win4000.com/wallpaper_big_159691.html"
headers = {'User-Agent':
               'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0',
           'cookie':
               't=1aa13ce81f7728775dcf2b09de69d452; r=2665; XSR1F-TOKEN=eyJpdiI6IllES2JVSmxBNFFmbldxT0x0QlFJVVE9PSIsInZhbHVlIjoiU1ltbHRWc0Fmb2VDcUVHY2Q1YXhFVytOUjR6alZXUWYreGtqV2Y0THdyd0t6UjlQTElKejJ6ZTh6QzlMRGx1QkQ0d2RFMGVsWm5rYWZJSEFcL251SldqRFwvbDNjS2tZVWt6aThWdUUwZThiZUtHTWp6NDB6ZFdlWXJ0RWgwdWRBbyIsIm1hYyI6ImYwNTlhNTk1NzI3NmY2NjM5MTJlYzQ0MTM2OGJmMmEwOWQ1MDhkYzM0MzE4NDM0NjM0NmVmOWUwZmVhNTRjMGYifQ%3D%3D; win4000_session=eyJpdiI6IlBXR3hjT2dBS0ptZlFUQlVxYXVLeFE9PSIsInZhbHVlIjoibjJVYVk1NTZPeGVPUmJRUFZydmgyVGRVXC9LUVwvZlBBOVluRUZkcVRsckVIeUVybzluSkJ2WWtjbXBtRVNwejNDNjFZMDh1NFJGV291QmxSRkZ2c051bHVPcFpyckVqVWpkbkJEK2Z2S3Q4NlQ1ZSs5cnBaNWVWNlc3M29YbnVzbyIsIm1hYyI6Ijk1MTYwZWZjYzdhNjYzOWIzMTYzZjNmYzYzMGEwYzgzMjJlZjk0ZThmMmEzZjg1YWU3MDE1OTA5Njc1NTkxNjkifQ%3D%3D'}
r = requests.get(url, headers = headers)
print(r)
r.encoding = 'utf8'
html = etree.HTML(r.text)
print(r.text)

# Try image slots 1..9 in the picture box and download each src found.
for slot in range(1, 10):
    conter = html.xpath('//*[@id="picBox"]/ul/li/a//img[{}]//@src'.format(slot))
    # NOTE(review): 'D:' with no trailing separator yields a drive-relative
    # path on Windows ('D:filename') — confirm this is intended.
    root = 'D:'
    print(conter)
    for idx, src in enumerate(conter):
        path = root + src.split('/')[-1]
        try:
            pic = requests.get(src, timeout=5, headers=headers)
            # Defect fixed: the original opened the file before the try and
            # outside any context manager, so a failed request still created
            # an empty file and the handle leaked on exceptions.
            with open(path, 'wb') as f:
                f.write(pic.content)
            print('第' + str(idx) + '张下载成功!')
        except (requests.RequestException, OSError):
            # Defect fixed: bare ``except`` replaced with specific failures.
            print('下载失败')
import requests
from lxml import etree

# Target site (desktop wallpaper listing).
url = "http://www.netbian.com/"
# Browser-like request header.
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36 Edg/99.0.1150.55'
}
r = requests.get(url, headers=headers)
print(r)
# The site serves GBK-encoded pages.
r.encoding = 'gbk'
# Parse the HTML with lxml's XPath support.
html = etree.HTML(r.text)
print(html)
# Image URLs and their alt-text titles from the main listing.
conter = html.xpath('//*[@id="main"]//ul//li//a//img//@src')   # image links
title = html.xpath('//*[@id="main"]//ul//li//a//img//@alt')    # titles
print(conter, title)

# Counter for the success message; like the original, it only advances on a
# successful download.
ok_count = 1
for con, t in zip(conter, title):
    # Titles may contain spaces; strip them before use as a file name.
    t = t.replace(' ', '')
    # Absolute save path: title plus .jpg extension.
    save_path = 'E://' + str(t) + '.jpg'
    try:
        # Fetch the image bytes.
        pic = requests.get(con, headers=headers)
        # Defect fixed: the original opened the file before the try and never
        # closed it reliably; a failed request left an empty file behind and
        # leaked the handle. The context manager closes it in all cases.
        with open(save_path, 'wb') as f:
            f.write(pic.content)
        print('下载第{}张成功'.format(ok_count))
        ok_count += 1
    except (requests.RequestException, OSError):
        # Defect fixed: bare ``except`` narrowed to plausible failures.
        print('下载失败')
from selenium import webdriver

# Launch a Chrome session, load Baidu, and dump the rendered page source.
browser = webdriver.Chrome()
try:
    browser.get("http://www.baidu.com")
    print(browser.page_source)
finally:
    # Defect fixed: quit() also shuts down the chromedriver process, whereas
    # close() only closes the window and can leak the driver; the try/finally
    # guarantees cleanup even if navigation raises.
    browser.quit()
# -*- coding:utf-8 -*-
"""
Author: [email protected]
Purpose: crawl the Sogou search results page for a user-supplied query term.
Date: (placeholder)
"""
import requests

url = 'https://www.sogou.com/web'
kw = input("输入你要查找的词条")
# ``query`` is Sogou's search-term parameter.
params = {
    "query":kw
}
# Browser-like User-Agent so the site serves a normal results page.
header = {
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36"
}
response = requests.get(url=url, params=params, headers=header)
response.encoding = "utf8"
param_text = response.text
# Save the raw HTML so the results can be opened in a browser.
filename = 'kw.html'
with open(filename, 'w', encoding="utf8") as fp:
    fp.write(param_text)
    # Defect fixed: removed the redundant fp.close(); the with-statement
    # already closes the file on exit.
# -*- coding:utf-8 -*-
"""
Author: [email protected]
Purpose: query Baidu Translate's suggestion (sug) endpoint for a word.
Date: (placeholder)
"""
import requests

# POST the keyword as form data and pull the suggestion list out of the
# JSON response, then print each key/value pairing.
suggestions = requests.post(
    "https://fanyi.baidu.com/sug",
    data={"kw": "cat"},
).json()['data']
for entry in suggestions:
    print(f"{entry['k']}: {entry['v']}")
# -*- coding:utf-8 -*-
"""
Author: [email protected]
Purpose: fetch a page of the Douban movie chart top-list and save it as JSON.
Date: (placeholder)
"""
import requests
import json

# Defect fixed: the original URL literal began with a space; requests happens
# to tolerate it, but the URL should be clean.
url = "https://movie.douban.com/j/chart/top_list"
param = {
    'type': '24',             # chart/genre id
    'interval_id': '100:90',  # rating percentile band
    'action': '',
    'start': '60',            # offset into the chart
    'limit': '20',            # number of entries to fetch
}
header = {
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36"
}
# Defect fixed: ``re`` renamed to ``resp`` — it shadowed the stdlib module name.
resp = requests.get(url=url, params=param, headers=header)
list_data = resp.json()
# Defect fixed: the file handle was never closed; use a context manager.
# NOTE(review): 'ouban.json' looks like a typo for 'douban.json' — confirm
# with downstream consumers before renaming the output file.
with open('./ouban.json', 'w', encoding='utf8') as fp:
    json.dump(list_data, fp=fp, ensure_ascii=False)
print('Over!!!')
# -*- coding:utf-8 -*-
"""
Author: [email protected]
Purpose: crawl KFC restaurant (store) information via keyword search.
Date: (placeholder)
"""
import requests

url = "http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx"
# The endpoint dispatches on the ``op`` query parameter.
param = {
    'op':'keyword'
}
# Defect fixed: the header key was misspelled "User-Aged", so no User-Agent
# header was actually being sent with the request.
header = {
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36 Edg/100.0.1185.29"
}
# Form body: search stores matching 北京, first page of 10 results.
data = {
    'cname': '',
    'pid': '',
    'keyword': '北京',
    'pageIndex': '1',
    'pageSize': '10',
}
# Defect fixed: ``re`` renamed to ``resp`` — it shadowed the stdlib module name.
resp = requests.post(url=url, data=data, headers=header, params=param)
print(resp)
resp.encoding = 'utf8'
list_data = resp.text
print(list_data)