import requests
url = "http://www.httpbin.org/headers"
res = requests.get(url).text
print(res)
From the output you can see that a request sent by this Python code carries a User-Agent of python-requests, whereas visiting the same page in a browser sends the browser's own User-Agent.
To disguise the script as a browser, add the headers manually:
import requests
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36",
}  # the User-Agent a real browser sends
url = "http://www.httpbin.org/headers"
res = requests.get(url, headers=headers).text
print(res)
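If several requests need the same disguise, one option (a minimal sketch, assuming more than one call is being made) is to set the headers once on a requests.Session so every request carries them:
import requests

session = requests.Session()
session.headers.update({
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36",
})  # from now on every request made through this session sends the browser User-Agent
print(session.get("http://www.httpbin.org/headers").text)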
import requests
cookie = {
    "_uuid": "EEC25884-F044-5E17-FBAC-414E3EA1540283053infoc",
}  # in the raw cookie string the part before "=" is the key, the part after is the value
url = "http://www.httpbin.org/cookies"
res = requests.get(url, cookies=cookie).text
print(res)
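An equivalent approach (just a sketch of an alternative, not something the notes above require) is to paste the raw cookie string into a Cookie header instead of splitting it into a dict:
import requests

headers = {
    "Cookie": "_uuid=EEC25884-F044-5E17-FBAC-414E3EA1540283053infoc",
}  # the whole "key=value" string copied from the browser, sent as one header
url = "http://www.httpbin.org/cookies"
print(requests.get(url, headers=headers).text)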
Give timeout a value in seconds; if the request takes longer an exception is raised, so use it together with try/except.
import requests
url = "http://www.httpbin.org/ip"
ip = {"http":"http://192.168.2.123:8080"}
try:
    res = requests.get(url, proxies=ip, timeout=3).text
    print(res)
except:
    pass
Proxy format for requests: {"protocol": "protocol://ip:port"}, for example {"http": "http://127.0.0.1:80"}.
Visiting http://www.httpbin.org/ip shows the public IP your machine appears as; if the proxy works, the returned IP changes.
import requests
url = "http://www.httpbin.org/ip"
ip = {"http":"http://192.168.2.123:8080"}
res = requests.get(url,proxies=ip).text
print(res)
import requests
from lxml import etree

page = 0
result = requests.get("http://www.httpbin.org/ip").text
# our own public IP; a proxy is usable when the response through it differs from this
while True:
    url = "https://www.7yip.cn/free/?action=china&page=" + str(page)
    page += 1
    res = requests.get(url).text
    e = etree.HTML(res)
    types = e.xpath("//td[@data-title='类型']/text()")
    ips = e.xpath("//td[@data-title='IP']/text()")
    port = e.xpath("//td[@data-title='PORT']/text()")
    proxy = {}
    for i in range(len(types)):
        # walk every index from 0 to len(types) - 1
        proxy[types[i].lower()] = types[i].lower() + "://" + ips[i] + ":" + port[i]
        # the key stores the protocol (http/https), the value joins ip and port into protocol://ip:port
        print(proxy)
        try:
            res2 = requests.get("http://www.httpbin.org/ip", proxies=proxy, timeout=3).text
            # send a request through the candidate proxy
            if result not in res2:
                # check whether our own IP appears in the response returned through the proxy
                print(proxy, "proxy usable")
            proxy = {}
            # without resetting to an empty dict the next row would be added to the same dict
            # and a single line would show two proxies
        except:
            proxy = {}
Building an IP pool only involves proxy-related operations; it is independent of the crawler's actual business logic.
import requests
import pymysql
from lxml import etree

conn = pymysql.Connect(
    db='spider',
    host='localhost',
    port=3306,
    user='root',
    password='123',
    charset="utf8"
)
cursor = conn.cursor()
check_url = "http://www.httpbin.org/ip"
my_ip = requests.get(check_url).text

class DB():
    def save(self, ip):
        sql = "insert into ips values (%s)"
        cursor.execute(sql, (ip,))
        # the MySQL driver expects the query parameters as a tuple
        conn.commit()

    def get(self):
        sql = "select * from ips"
        cursor.execute(sql)
        for i in cursor.fetchall():
            # fetchall returns every row as a tuple; hand back the first IP found
            return i[0]

    def delete_ip(self, ip):
        sql = "delete from ips where ip = %s"
        cursor.execute(sql, (ip,))
        conn.commit()

    def get_count(self):
        sql = "select * from ips"
        cursor.execute(sql)
        return len(cursor.fetchall())
        # fetchall returns a tuple of rows, so its length is the row count

    def get_all(self):
        sql = "select * from ips"
        cursor.execute(sql)
        return cursor.fetchall()

class Pool():
    def __init__(self, limit):
        self.limit = limit
        # pool size threshold
        self.db = DB()
        # database helper object

    def check_ip(self, ip):
        try:
            # many free proxies are dead, so wrap the check in try/except
            proxy = eval(ip) if isinstance(ip, str) else ip
            # the pool stores proxies as stringified dicts, so rebuild the dict when a string is passed in
            target_ip = requests.get(check_url, proxies=proxy, timeout=3).text
            if target_ip != my_ip:
                return True
        except:
            pass
        return False
    def crawler(self):
        page = 1
        while self.db.get_count() < self.limit:
            # keep harvesting while the pool holds fewer IPs than the threshold
            url = "http://qinghuadaili.com/free/%s/" % str(page)
            page += 1
            html = etree.HTML(requests.get(url).text)
            ips = html.xpath("//tr/td[1]/text()")
            ports = html.xpath("//tr/td[2]/text()")
            types = html.xpath("//tr/td[4]/text()")
            proxy = {}
            for i in range(len(types)):
                proxy[types[i].lower()] = types[i].lower() + "://" + ips[i] + ":" + ports[i]
                if self.check_ip(proxy):
                    print("usable ip", proxy)
                    self.db.save(str(proxy))
                proxy = {}
                # reset so each stored entry holds exactly one proxy
    def get_ip(self):
        ip = self.db.get()
        self.db.delete_ip(ip)
        # remove the IP from the database once it has been handed out
        return ip
    def check_all(self):
        # re-check every stored IP and drop the ones that no longer work
        ips = self.db.get_all()
        for ip in ips:
            if not self.check_ip(ip[0]):
                self.db.delete_ip(ip[0])
        if self.db.get_count() < self.limit:
            self.crawler()

pool = Pool(200)
pool.crawler()
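How a crawler might consume the pool, keeping the pool logic separate from the business logic (a rough sketch; it assumes the pool already holds at least one proxy, and eval() mirrors how the pool stores each proxy as a stringified dict):
proxy_str = pool.get_ip()
# e.g. "{'http': 'http://1.2.3.4:8080'}" as stored by Pool.crawler via str(proxy)
if proxy_str:
    proxies = eval(proxy_str)
    res = requests.get("http://www.httpbin.org/ip", proxies=proxies, timeout=3).text
    print(res)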
Crawling strategy: spread outward, fanning out from one profile to the profiles it links to (sketched below).
Captchas: hand them to a third-party coding platform or to an AI recognition model.
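The spreading idea reduced to a minimal in-memory sketch; the DB-backed crawler below replaces the deque with a MySQL table, and extract_profile_ids is a hypothetical stand-in for the XPath extraction:
from collections import deque

def extract_profile_ids(uid):
    # hypothetical stand-in: the real crawler fetches the profile page with the
    # login cookie and pulls new ids out of the recent-visitor box via XPath
    return []

queue = deque(["seed_profile_id"])  # start from one known profile
seen = set()
while queue:
    uid = queue.popleft()
    if uid in seen:
        continue
    seen.add(uid)
    for new_uid in extract_profile_ids(uid):
        # every newly discovered profile goes onto the queue, so the crawl keeps spreading
        queue.append(new_uid)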
import requests
import pymysql
from lxml import etree
from chaojiying import get_code
conn = pymysql.Connect(
    db='spider',
    host='localhost',
    port=3306,
    user='root',
    password='123',
    charset="utf8"
)
cursor = conn.cursor()
cookies = {
    "t": "6c384ab6618045d6fb2691f06c37d4c14",
}
# only one or a few cookie keys actually carry the login state; on renren.com it is the key "t"
# renren.com checks the login state, otherwise the profile pages cannot be viewed
def view_detials(url):
    res = requests.get("http://www.renren.com/" + url + "/profile", cookies=cookies).text
    ele = etree.HTML(res)
    try:
        name = ele.xpath("//title/text()")[0]
        # xpath returns a list, so take the first element
        print(name)
        save_data(name)
        if name != "人人网 - 验证码":
            img_url = ele.xpath("//div[@id='footprint-box']//li/a/@namecard")
            # spread outward from the recent-visitor box; the numeric id in @namecard is the next url
            save_url(img_url)
            # save_url receives a whole list of urls
        else:
            img_url = ele.xpath("//div[@class='optional']/img/@src")[0]
            # the page served a captcha, so grab the captcha image url
            img = requests.get(img_url, cookies=cookies).content
            # get_code only accepts the raw image bytes
            check_code(get_code(img))
            # hand the image to the captcha platform's handler;
            # get_code returns a dict whose key 'pic_str' holds the recognized string
    except:
        pass

def check_code(code):
    check_url = "http://www.renren.com/validateuser.do"
    # the url renren.com posts captcha answers to
    code_str = code['pic_str']
    # the recognized captcha text returned by the platform
    data = {
        'id': '880792860',
        'icode': code_str,
        'submit': '继续浏览',
        'requestToken': '637455402',
        '_rtk': 'f1bfca84'
    }
    requests.post(check_url, data=data, cookies=cookies)

def get_url():
    sql = "select url from renren where status = %s"
    # the url column in the database is varchar
    cursor.execute(sql, ("0",))
    # the status column is varchar as well, hence the string "0"
    url = cursor.fetchone()[0]
    new_sql = "update renren set status = 1 where url = %s"
    cursor.execute(new_sql, (url,))
    conn.commit()
    view_detials(url)

def save_url(urls):
    for url in urls:
        # iterate over the url list collected in view_detials
        try:
            # a captcha or duplicate row can make the insert fail, so wrap it in try
            sql = "insert into renren values (%s,%s)"
            cursor.execute(sql, (url, "0"))
            # newly inserted profiles start with status code "0"
            conn.commit()
        except:
            pass

def save_data(data):
    sql = "insert into renren_data values (%s)"
    # the %s placeholder in the insert statement must be wrapped in parentheses
    cursor.execute(sql, (data,))
    conn.commit()

if __name__ == '__main__':
    while True:
        get_url()

# helper defined in chaojiying.py; Chaojiying_Client comes from the platform's official Python demo
def get_code(im):
    chaojiying = Chaojiying_Client('username', 'password', 'software id')
    return chaojiying.PostPic(im, 1902)
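A quick standalone check of the helper (a sketch; captcha.jpg is just a placeholder file, and 1902 is the code type already used above):
with open("captcha.jpg", "rb") as f:
    result = get_code(f.read())
print(result['pic_str'])  # the recognized captcha text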