Python Web Scraping (Spider) Basics — Notes

A spider (web crawler) is a program or script that fetches information from the World Wide Web according to certain rules.

Baidu is essentially a crawler as well: it crawls all kinds of pages and presents the results.

Purpose of crawling: data collection
Crawler categories: general-purpose crawlers (search engines such as Baidu and Google, which must obey the robots protocol), focused crawlers (written to scrape a specific site or page), incremental crawlers (a strategy that only re-fetches new or changed content), deep-web crawlers (crawl deep/dark-web content not reachable by ordinary indexing)
**********
import urllib.request

url = "https://www.sina.com.cn/"
response = urllib.request.urlopen(url)   # send the request
data = response.read()                   # read the response body
print(response)       # an http.client.HTTPResponse object
print(data)           # raw bytes
print(type(data))     # <class 'bytes'>
*********
data is bytes (binary data)
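
Since data is bytes, decode it to get a str; a minimal sketch, assuming the page declares UTF-8:

******
import urllib.request

data = urllib.request.urlopen("https://www.sina.com.cn/").read()
html = data.decode("utf-8")   # bytes -> str using the page's charset
print(type(html))             # <class 'str'>
******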


Packet capture with Fiddler
********
POST / HTTP/1.1
Host: r3.o.lencr.org                      # the target site
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0    # request header identifying the client (user agent)
Accept: */*
Accept-Language: zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2
Accept-Encoding: gzip, deflate            # encodings the client accepts
Content-Type: application/ocsp-request
Content-Length: 85
Connection: keep-alive
Pragma: no-cache
Cache-Control: no-cache

********

********
GET / HTTP/1.1
Accept-Encoding: identity                 # encodings the client accepts
Host: www.sina.com.cn
User-Agent: Python-urllib/3.7             # urllib's default UA gives the crawler away
Connection: close
*********

Anti-anti-crawling
1. Forge the request headers.
2. Spread repeated requests over time instead of crawling everything in one go; mimic human pacing with time.sleep(random.randint(1,5)).

3. Use proxy IP addresses (paid services). Techniques 2 and 3 are sketched right after this list.
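
A minimal sketch of techniques 2 and 3, assuming placeholder target URLs and a placeholder proxy address:

******
import time
import random
import urllib.request

# technique 2: wait a random 1-5 s between requests to mimic human pacing
urls = ["https://www.sina.com.cn/", "https://www.qq.com/"]   # placeholder targets
for url in urls:
    data = urllib.request.urlopen(url).read()
    print(url, len(data), "bytes")
    time.sleep(random.randint(1, 5))

# technique 3: route requests through a proxy (the address is a placeholder)
proxy_handler = urllib.request.ProxyHandler({"http": "http://127.0.0.1:8888"})
opener = urllib.request.build_opener(proxy_handler)
print(opener.open("http://httpbin.org/ip").read().decode("utf-8"))
******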

1. Forging request headers
*******
import urllib.request

url = "https://www.baidu.com/"
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0"
}
# create a Request object that carries the forged headers
rep = urllib.request.Request(url=url, headers=header)
response = urllib.request.urlopen(rep)
data = response.read()
html = data.decode("utf-8")
with open("baidu2.html", "w", encoding="utf-8") as f:
    f.write(html)
    print("crawl succeeded")
print(html)
*******
Hitting a site too many times in a short window will also get you identified as a crawler.

Forge several request headers and pick one at random for each crawl to avoid anti-crawling checks.

*********
import urllib.request
import random

url = "http://www.baidu.com/"
us = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"
]
header = {
    "User-Agent": random.choice(us)   # pick a random user agent per run
}
# create a Request object that carries the forged headers
rep = urllib.request.Request(url=url, headers=header)
response = urllib.request.urlopen(rep)
data = response.read()
html = data.decode("utf-8")
with open("baidu3.html", "w", encoding="utf-8") as f:
    f.write(html)
    print("crawl succeeded")
print(html)
***********

******
from urllib import request
import random

us = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"
]
headers = {
    "User-Agent": random.choice(us)
}
url = "https://www.qq.com/"
rep = request.Request(url=url, headers=headers)
response = request.urlopen(rep)
data = response.read()
# decoding with a guessed charset; errors="ignore" silently drops undecodable bytes
html = data.decode("gb2312", errors="ignore")
print(html)
with open("qq2.html", "w", encoding="gb2312") as f:
    f.write(html)
    print("crawl succeeded")
******
The decoded text comes out garbled when the guessed charset is wrong.
Recommended: keep the raw binary data first (a sketch follows), and only convert to a string once the encoding is known.
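
A minimal bytes-first sketch against the same QQ homepage:

******
from urllib import request

# save the raw bytes without decoding them; a wrong charset guess can't garble bytes
data = request.urlopen("https://www.qq.com/").read()
with open("qq_raw.html", "wb") as f:
    f.write(data)
******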

Or:
install the chardet library first and let it detect the encoding:
*********
from urllib import request
import random
import chardet
us=[
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"
]
headers = {
    "User-Agent":random.choice(us)
}
url = "https://www.qq.com/"
rep = request.Request(url=url,headers=headers)
response=request.urlopen(rep)
data=response.read()

# detect the page's encoding first
# chardet.detect returns a dict, e.g. {'encoding': 'GB2312', 'confidence': 0.99, ...}
res = chardet.detect(data)
charset = res.get("encoding")
print(res)
print(charset)
html = data.decode(charset, errors="ignore")
print(html)
with open("qq2.html", "w", encoding=charset) as f:
    f.write(html)
    print("crawl succeeded")
********


GET and POST requests

GET is used to request data from the server
*******
from urllib import request
from urllib import parse
import random
us=[
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"
]
headers = {
    "User-Agent":random.choice(us)
}
url="https://www.baidu.com/s?"
wd = input("Enter your search term: ")
params = {
    "wd":wd
}
ps = parse.urlencode(params)   # percent-encode the query, e.g. wd=%E7%88%AC%E8%99%AB
print(ps)
url = url + ps                 # https://www.baidu.com/s?wd=...
print(url)
rep=request.Request(url=url,headers=headers)
resp=request.urlopen(rep)
data=resp.read()
print(data)
with open("get2.html","wb") as f:
    f.write(data)
    print("success")
*******

POST requests

In the dev tools / Fiddler, find Youdao Translate's translate request.
Its payload:
****
{
	"i": "管理",
	"from": "AUTO",
	"to": "AUTO",
	"smartresult": "dict",
	"client": "fanyideskweb",
	"salt": "16684393659218",
	"sign": "7493112ec630b4fac9021bad4fda90b7",
	"lts": "1668439365921",
	"bv": "e2a78ed30c66e16a857c5b6486a1d326",
	"doctype": "json",
	"version": "2.1",
	"keyfrom": "fanyi.web",
	"action": "FY_BY_REALTlME"
}
*****

*****
from urllib import request, parse

url = "https://fanyi.youdao.com/translate?"
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0"
}
word = input("Enter the term to translate: ")
from_data = {
    "i": word,
    "from": "AUTO",
    "to": "AUTO",
    "smartresult": "dict",
    "client": "fanyideskweb",
    "salt": "16684393659218",
    "sign": "7493112ec630b4fac9021bad4fda90b7",
    "lts": "1668439365921",
    "bv": "e2a78ed30c66e16a857c5b6486a1d326",
    "doctype": "json",
    "version": "2.1",
    "keyfrom": "fanyi.web",
    "action": "FY_BY_REALTlME"
}
data = parse.urlencode(from_data)
# POST bodies must be bytes, so convert str -> bytes
data = data.encode("utf-8")
req = request.Request(url=url, data=data, headers=header)
response = request.urlopen(req)
html = response.read().decode(encoding="utf-8").strip()
print(html)

Enter the term to translate: 第三发
{"type":"ZH_CN2EN","errorCode":0,"elapsedTime":5,"translateResult":[[{"src":"第三发","tgt":"A third"}]]}
******
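
The reply is JSON; a small sketch to pull out just the translation, using the exact sample reply above:

******
import json

# the sample reply from the run above
reply = '{"type":"ZH_CN2EN","errorCode":0,"elapsedTime":5,"translateResult":[[{"src":"第三发","tgt":"A third"}]]}'
result = json.loads(reply)
print(result["translateResult"][0][0]["tgt"])   # A third
******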


The requests library
Under the hood it builds on urllib3, a higher-level HTTP client in the same family as urllib.

Try its methods for yourself:
hold Ctrl and click a method name (in PyCharm) to jump to its source.

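With requests, the same fetch is a couple of lines; a minimal GET sketch against the Sina homepage used earlier:

******
import requests

resp = requests.get("https://www.sina.com.cn/")
print(resp.status_code)     # e.g. 200
print(resp.encoding)        # encoding guessed from the response headers
print(type(resp.content))   # <class 'bytes'>, like urllib's read()
print(resp.text[:200])      # decoded text
******

Example: pulling image URLs out of a fetched page with a regular expression and downloading them with requests: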

******
import re
import os
import requests

# html: the page source fetched earlier, e.g. html = requests.get(page_url).text
images = re.findall(r'src="(.*?\.(?:jpg|png|gif|jpeg))"', html)
os.makedirs("image", exist_ok=True)
for index, item in enumerate(images):
    # complete protocol-relative URLs like //img.example.com/a.jpg
    if not item.startswith("http"):
        real_url = "http:" + item
    else:
        real_url = item
    resp = requests.get(real_url)
    with open("image/" + str(index) + ".jpg", "wb") as f:
        f.write(resp.content)
******

POST requests
requests.post()

Request-header library:
fake_useragent (pip install fake-useragent)
*****
from fake_useragent import UserAgent
ua=UserAgent()
print(ua)
print(ua.ie)
print(ua.chrome)

# sample output:
Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)
Mozilla/5.0 (X11; Ubuntu; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2919.83 Safari/537.36

******

A random request header each time:
******
from fake_useragent import UserAgent
ua=UserAgent()
print(ua.random)
******

A POST request with requests:
******
from fake_useragent import UserAgent
import requests
url="https://fanyi.youdao.com/translate?"
header={
    "User-Agent" : UserAgent().random
}
word = input("Enter the term to translate: ")
from_data={
	"i": word,
	"from": "AUTO",
	"to": "AUTO",
	"smartresult": "dict",
	"client": "fanyideskweb",
	"salt": "16684393659218",
	"sign": "7493112ec630b4fac9021bad4fda90b7",
	"lts": "1668439365921",
	"bv": "e2a78ed30c66e16a857c5b6486a1d326",
	"doctype": "json",
	"version": "2.1",
	"keyfrom": "fanyi.web",
	"action": "FY_BY_REALTlME"
}
response = requests.post(url=url,data=from_data,headers=header)
print(response.text)
print(type(response.text))
print(response.next)   # the next request in a redirect chain (usually None)
******

POST data parameters:
if they are encrypted/signed, you have to find the generating code in the site's JS files and reproduce it in Python.
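
A hedged sketch of the usual pattern; the field order and SECRET below are hypothetical placeholders, not Youdao's real values (the true recipe must be read out of the site's JS):

******
import hashlib
import random
import time

def make_signed_form(word):
    # hypothetical signing scheme for illustration only
    lts = str(int(time.time() * 1000))        # millisecond timestamp
    salt = lts + str(random.randint(0, 9))    # timestamp + random digit
    SECRET = "placeholder-key-from-site-js"   # placeholder, found in the JS
    raw = "fanyideskweb" + word + salt + SECRET
    sign = hashlib.md5(raw.encode("utf-8")).hexdigest()
    return {"i": word, "salt": salt, "sign": sign, "lts": lts}

print(make_signed_form("管理"))
******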

XPath

After crawling you usually need to sift through the data: either with regular expressions, or by parsing it with XPath.
XPath is short for XML Path Language; it is a path language for navigating XML documents.

XML --- Extensible Markup Language

HTML --- HyperText Markup Language

XPath syntax rules (see the sketch after this list):
nodename --- selects all child nodes of the named node
/   selects from the root node
//  selects descendant nodes from the current node
.   selects the current node
..  selects the parent node
@   selects an attribute
XPath is independent of any one language for operating on XML; it can be used from any programming language.
To use XPath in Python you need the lxml library.
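
A minimal lxml sketch (the HTML snippet is made up for illustration):

******
from lxml import etree

html_text = '<div><img src="a.jpg"/><img src="b.png"/></div>'
tree = etree.HTML(html_text)       # parse the HTML string into an element tree
srcs = tree.xpath("//img/@src")    # //  = any descendant, @ = attribute
print(srcs)                        # ['a.jpg', 'b.png']
******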

Connecting to a database
pip install mysql-connector-python

or: python -m pip install -U mysql-connector-python

import mysql.connector

*****
from mysql import connector as co

conn = co.connect(user='root', password='root', database='test')
cursor = conn.cursor()
cursor.execute('create table user (id varchar(20) primary key, name varchar(20))')
cursor.execute('insert into user (id, name) values (%s, %s)', ['1', 'Michael'])
print(cursor.rowcount)   # 1
# commit the transaction (on the connection, not the cursor)
conn.commit()
cursor.close()
# query the table with a fresh cursor
cursor = conn.cursor()
cursor.execute('select * from user where id = %s', ('1',))
values = cursor.fetchall()
print(values)            # [('1', 'Michael')]
# close the cursor and the connection
cursor.close()
conn.close()
*****

Using a database from PyCharm
pymysql --- a MySQL client written in pure Python

****
import pymysql

conn = pymysql.connect(
    host="localhost",
    user="root",
    password="root",
    db="test",
    charset="utf8",
    port=3306
)
# create a cursor
cursor = conn.cursor()
cursor.close()
conn.close()
****
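
A minimal query sketch with pymysql, assuming the same test database and user table as in the mysql-connector example:

****
import pymysql

conn = pymysql.connect(host="localhost", user="root", password="root",
                       db="test", charset="utf8", port=3306)
cursor = conn.cursor()
cursor.execute("select * from user where id = %s", ("1",))
print(cursor.fetchall())   # e.g. [('1', 'Michael')]
cursor.close()
conn.close()
****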
