A spider (web crawler) is a program or script that fetches information from the World Wide Web according to a set of rules.
Baidu is essentially a crawler as well: it fetches all kinds of pages and presents them as search results.
Purpose of crawling: collecting data.
Types of crawlers: general-purpose crawlers (search engines such as Baidu and Google, which must obey the robots protocol), focused crawlers (written to crawl one specific site or page), incremental crawlers (only re-fetch pages that are new or have changed), and deep-web crawlers (crawl deep-web/hidden content not reachable through ordinary links).
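General-purpose crawlers must obey the robots protocol. A minimal sketch of checking a site's robots.txt with the standard library's urllib.robotparser (the target URLs are only examples):
******
from urllib import robotparser

# download and parse the site's robots.txt (URL is only an example)
rp = robotparser.RobotFileParser()
rp.set_url("https://www.baidu.com/robots.txt")
rp.read()
# can_fetch(user_agent, url) -> True if that user agent may fetch the URL
print(rp.can_fetch("*", "https://www.baidu.com/s?wd=python"))
******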
**********
import urllib.request

url = "https://www.sina.com.cn/"
response = urllib.request.urlopen(url)   # send the request; returns an HTTPResponse
data = response.read()                   # read the response body as bytes
print(response)
print(data)
print(type(data))                        # <class 'bytes'>
*********
data is raw byte data (bytes).
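To turn those bytes into text, decode them with the page's charset; a one-line sketch assuming the page is UTF-8:
******
html = data.decode("utf-8")   # bytes -> str, assuming the page is UTF-8
print(type(html))             # <class 'str'>
******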
Packet capture with Fiddler
********
POST / HTTP/1.1
Host: r3.o.lencr.org : the target host
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0 : request header identifying the client (user-agent string)
Accept: */*
Accept-Language: zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2
Accept-Encoding: gzip, deflate : encodings the client will accept
Content-Type: application/ocsp-request
Content-Length: 85
Connection: keep-alive
Pragma: no-cache
Cache-Control: no-cache
********
********
GET / HTTP/1.1
Accept-Encoding: identity
Host: www.sina.com.cn
User-Agent: Python-urllib/3.7
Connection: close
*********
Beating anti-crawler measures ("anti-anti-crawling"):
1. Forge the request headers (the default User-Agent above, Python-urllib/3.7, immediately gives the script away)
2. Spread the collection over several sessions instead of crawling everything at once, and simulate human behaviour with time.sleep(random.randint(1,5)) (see the sketch after this list)
3. Use proxy IP addresses (paid services)
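A small sketch of point 2 (the URL list is only an example): pause a random 1-5 seconds between requests instead of fetching everything back to back.
******
import random
import time
import urllib.request

urls = ["https://www.sina.com.cn/", "https://www.qq.com/"]   # example targets
for url in urls:
    response = urllib.request.urlopen(url)
    print(url, len(response.read()))
    time.sleep(random.randint(1, 5))   # wait 1-5 seconds, like a human reader would
******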
1. Forging the request headers
*******
import urllib.request

url = "https://www.baidu.com/"
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0"
}
rep = urllib.request.Request(url=url, headers=header)   # Request object carrying the forged header
response = urllib.request.urlopen(rep)
data = response.read()
html = data.decode("utf-8")
with open("baidu2.html", "w", encoding="utf-8") as f:
    f.write(html)
print("爬取成功")
print(html)
*******
Too many requests in a short time will also get you flagged as a crawler.
Forge several User-Agent strings and pick one at random for each request to make detection harder:
*********
import urllib.request
import random
url="http://www.baidu.com/"
us=[
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0",
"Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"
]
header={
"User-Agent": random.choice(us)
}
rep=urllib.request.Request(url=url,headers=header)
response=urllib.request.urlopen(rep)
data=response.read()
html=data.decode("utf-8")
with open("baidu3.html","w",encoding="utf-8") as f:
    f.write(html)
print("爬取成功")
print(html)
***********
******
from urllib import request
import random
us=[
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0",
"Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"
]
headers = {
"User-Agent":random.choice(us)
}
url = "https://www.qq.com/"
rep = request.Request(url=url,headers=headers)
response=request.urlopen(rep)
data=response.read()
html=data.decode("gb2312",errors="ignore")
print(html)
with open("qq2.html","w",encoding="gb2312") as f:
    f.write(html)
print("爬取成功")
******
The decoded string comes out garbled.
It is better to save the raw binary data to disk first and convert it to a string afterwards, as sketched below.
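A minimal sketch of that approach (the filename is only an example; data is the bytes returned by response.read() above):
******
# writing with mode "wb" stores the raw bytes, so no charset has to be guessed here
with open("qq_raw.html", "wb") as f:
    f.write(data)
******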
Alternatively:
install the chardet library first (pip install chardet) and let it detect the charset:
*********
from urllib import request
import random
import chardet
us=[
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0",
"Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"
]
headers = {
"User-Agent":random.choice(us)
}
url = "https://www.qq.com/"
rep = request.Request(url=url,headers=headers)
response=request.urlopen(rep)
data=response.read()
res = chardet.detect(data)
charset=res.get("encoding")
print(res)
print(charset)
html=data.decode(charset,errors="ignore")
print(html)
with open("qq2.html","w",encoding=charset) as f:
    f.write(html)
print("爬取成功")
********
With the charset detected by chardet, the decode call also works without errors="ignore".
GET and POST requests
GET is used to request data from the server.
*******
from urllib import request
from urllib import parse
import random
us=[
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0",
"Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"
]
headers = {
"User-Agent":random.choice(us)
}
url="https://www.baidu.com/s?"
wd =input("请输入你想要查询的内容:")
params = {
"wd":wd
}
ps = parse.urlencode(params)   # percent-encode the query parameters
print(ps)
url=url+ps
print(url)
rep=request.Request(url=url,headers=headers)
resp=request.urlopen(rep)
data=resp.read()
print(data)
with open("get2.html","wb") as f:
    f.write(data)
print("success")
*******
POST requests
In the captured traffic, find Youdao Translate's translate request.
Its payload:
****
{
"i": "管理",
"from": "AUTO",
"to": "AUTO",
"smartresult": "dict",
"client": "fanyideskweb",
"salt": "16684393659218",
"sign": "7493112ec630b4fac9021bad4fda90b7",
"lts": "1668439365921",
"bv": "e2a78ed30c66e16a857c5b6486a1d326",
"doctype": "json",
"version": "2.1",
"keyfrom": "fanyi.web",
"action": "FY_BY_REALTlME"
}
*****
*****
from urllib import request
import urllib
url="https://fanyi.youdao.com/translate?"
header={
"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0"
}
word = input("请输入你需要翻译的词条: ")
from_data={
"i": word,
"from": "AUTO",
"to": "AUTO",
"smartresult": "dict",
"client": "fanyideskweb",
"salt": "16684393659218",
"sign": "7493112ec630b4fac9021bad4fda90b7",
"lts": "1668439365921",
"bv": "e2a78ed30c66e16a857c5b6486a1d326",
"doctype": "json",
"version": "2.1",
"keyfrom": "fanyi.web",
"action": "FY_BY_REALTlME"
}
data=urllib.parse.urlencode(from_data)
data = data.encode("utf-8")
req=request.Request(url=url,data=data,headers=header)
response=request.urlopen(req)
html=response.read().decode(encoding="utf-8").strip()
print(html)
# Sample run:
#   请输入你需要翻译的词条: 第三发
#   {"type":"ZH_CN2EN","errorCode":0,"elapsedTime":5,"translateResult":[[{"src":"第三发","tgt":"A third"}]]}
******
The requests library
Under the hood it is built on urllib3.
Try its methods out for yourself.
Ctrl-click a method (in PyCharm) to jump to its source.
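A minimal sketch of fetching a page with requests (the URL and User-Agent value are only examples):
******
import requests

resp = requests.get("https://www.qq.com/",
                    headers={"User-Agent": "Mozilla/5.0"})
print(resp.status_code)   # HTTP status code
print(resp.encoding)      # charset guessed from the response headers
html = resp.text          # body decoded to str (resp.content is the raw bytes)
print(html[:200])
******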
Example: pull the image URLs out of a fetched page with a regular expression and download each one with requests:
******
import os
import re
import requests

# html is the page source fetched earlier, e.g. html = requests.get(url).text
os.makedirs("image", exist_ok=True)                       # make sure the target folder exists
images = re.findall(r'src="(.*?\.(?:jpg|png|gif|jpeg))"', html)
for index, item in enumerate(images):
    if item.startswith("//"):          # protocol-relative link such as //img.example.com/a.jpg
        real_url = "https:" + item
    elif not item.startswith("http"):
        real_url = "https://" + item
    else:
        real_url = item
    resp = requests.get(real_url)
    with open("image/" + str(index) + ".jpg", "wb") as f:
        f.write(resp.content)
******
POST requests: requests.post()
Request-header library: fake_useragent (pip install fake-useragent)
*****
from fake_useragent import UserAgent
ua=UserAgent()
print(ua)
print(ua.ie)
print(ua.chrome)
# Example output:
#   Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)
#   Mozilla/5.0 (X11; Ubuntu; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2919.83 Safari/537.36
******
Random request header:
******
from fake_useragent import UserAgent
ua=UserAgent()
print(ua.random)
******
A POST request with requests:
******
from fake_useragent import UserAgent
import requests
url="https://fanyi.youdao.com/translate?"
header={
"User-Agent" : UserAgent().random
}
word = input("请输入你需要翻译的词条: ")
from_data={
"i": word,
"from": "AUTO",
"to": "AUTO",
"smartresult": "dict",
"client": "fanyideskweb",
"salt": "16684393659218",
"sign": "7493112ec630b4fac9021bad4fda90b7",
"lts": "1668439365921",
"bv": "e2a78ed30c66e16a857c5b6486a1d326",
"doctype": "json",
"version": "2.1",
"keyfrom": "fanyi.web",
"action": "FY_BY_REALTlME"
}
response = requests.post(url=url,data=from_data,headers=header)
print(response.text)
print(type(response.text))
print(response.next)
******
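Since the response body is JSON (see the sample output in the urllib version above), requests can parse it directly; a small sketch using the field names from that sample:
******
# response is the requests.Response from the block above
result = response.json()                        # parse the JSON body into a dict
print(result["translateResult"][0][0]["tgt"])   # the translated text
******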
POST data parameters
If the parameters are encrypted or signed, you have to find the code that generates them in the site's JS files and reproduce it (a hypothetical sketch follows).
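Purely as a hypothetical illustration: signatures of this kind are often an md5 over client + word + salt + some secret string, with salt derived from the millisecond timestamp (as in the payload above, where salt = lts plus one extra digit). The concatenation order and the secret below are made up; the real logic has to be read out of the site's JS.
******
import hashlib
import random
import time

def make_sign(word, secret="MADE-UP-SECRET"):    # secret is a placeholder, not the real value
    lts = str(int(time.time() * 1000))           # millisecond timestamp ("lts")
    salt = lts + str(random.randint(0, 9))       # "salt" = lts plus one random digit
    raw = "fanyideskweb" + word + salt + secret  # assumed concatenation order
    return lts, salt, hashlib.md5(raw.encode("utf-8")).hexdigest()

print(make_sign("管理"))
******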
xpath
After crawling a page you usually need to extract data from it; besides regular expressions, a common way to parse it is XPath.
XPath is short for XML Path Language, a path language for selecting nodes in a document.
xml --- eXtensible Markup Language
html --- HyperText Markup Language
XPath selection rules:
nodename --- selects all child nodes of this node
/ --- selects from the root node
// --- selects descendant nodes of the current node
. --- selects the current node
.. --- selects the parent node
@ --- selects attributes
XPath is independent of any single programming language and can be used from any of them.
To use XPath in Python you need the lxml library, as sketched below.
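A minimal lxml sketch (pip install lxml) run against a made-up HTML fragment, illustrating the // and @ rules above:
******
from lxml import etree

html = etree.HTML("""
<div class="list">
  <a href="/a.html">first</a>
  <a href="/b.html">second</a>
</div>
""")
# // selects descendants, @ selects an attribute, text() selects the text node
print(html.xpath('//div[@class="list"]/a/@href'))   # ['/a.html', '/b.html']
print(html.xpath('//div[@class="list"]/a/text()'))  # ['first', 'second']
******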
Connecting to a database
pip install mysql-connector-python --allow-external mysql-connector-python
pip install mysql-connector
python -m pip install -U pip mysql-connector
import mysql.connector
*****
from mysql import connector as co

conn = co.connect(user='root', password='root', database='test')
cursor = conn.cursor()
cursor.execute('create table user (id varchar(20) primary key, name varchar(20))')
cursor.execute('insert into user (id, name) values (%s, %s)', ['1', 'Michael'])
print(cursor.rowcount)        # 1 row affected
conn.commit()                 # commit goes through the connection, not the cursor
cursor.execute('select * from user where id = %s', ('1',))
values = cursor.fetchall()    # fetchall() is a method call
print(values)                 # [('1', 'Michael')]
cursor.close()
conn.close()
*****
Using the database from PyCharm
pymysql --- a MySQL driver for Python
****
import pymysql

conn = pymysql.connect(          # note the commas between keyword arguments
    host="localhost",
    user="root",
    password="root",
    db="test",
    charset="utf8",
    port=3306
)
cursor = conn.cursor()
# ... run SQL here with cursor.execute(...) ...
cursor.close()
conn.close()
****
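A sketch of actually running a query through pymysql (connection parameters match the block above; the user table from the mysql-connector example is assumed to exist):
******
import pymysql

conn = pymysql.connect(host="localhost", user="root", password="root",
                       db="test", charset="utf8", port=3306)
with conn.cursor() as cursor:      # pymysql cursors work as context managers
    cursor.execute("select * from user where id = %s", ("1",))
    print(cursor.fetchall())
conn.close()
******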